In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the dataset
data = pd.read_csv('C:/Users/user/Desktop/MSc Folder/Graduate Applications/Flood Data/Flood_Data1.csv')

# Inspect the first few rows
print(data.head())

   YEAR  STATE  FLOOD OCCURENCE     RH  MAX_TEMP  MIN_TEMP  PRECIPITATION
0  1990  BENUE                0  77.38     35.99     14.08        1165.43
1  1990    FCT                0  74.44     38.61     13.43        1270.90
2  1990   KOGI                0  77.50     36.12     16.42         949.22
3  1990  KWARA                0  68.44     39.73     15.85         849.02
4  1990   NASS                0  74.12     38.76     14.87        1244.53


In [5]:
# Define the features (X) and target (y)
X = data.drop(columns=['FLOOD OCCURENCE'])  # All independent variables
y = data['FLOOD OCCURENCE']  # Dependent variable

# If 'State' is categorical, you need to encode it
X = pd.get_dummies(X, columns=['STATE'], drop_first=True)

# Check the processed data
print(X.head())

   YEAR     RH  MAX_TEMP  MIN_TEMP  PRECIPITATION  STATE_FCT  STATE_KOGI  \
0  1990  77.38     35.99     14.08        1165.43      False       False   
1  1990  74.44     38.61     13.43        1270.90       True       False   
2  1990  77.50     36.12     16.42         949.22      False        True   
3  1990  68.44     39.73     15.85         849.02      False       False   
4  1990  74.12     38.76     14.87        1244.53      False       False   

   STATE_KWARA  STATE_NASS  STATE_NIGER  STATE_PLATEAU  
0        False       False        False          False  
1        False       False        False          False  
2        False       False        False          False  
3         True       False        False          False  
4        False        True        False          False  


In [6]:
# Retain 'Year' for trend analysis
year_column = data['YEAR']

#Spliting Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

#Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Define the hyperparameters and their respective values to be tested
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test accuracy: {:.2f}".format(test_accuracy))

Best parameters found:  {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation accuracy: 0.83
Test accuracy: 0.84


In [7]:
# y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error on test set: {mae}')

NameError: name 'mean_absolute_error' is not defined

In [None]:
# Predict on the test data
y_pred = best_model.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R²)
r2 = r2_score(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

# Print the results
print(f"MAPE: {mape:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

In [None]:
# Plot feature importances
importances = best_model.feature_importances_
features = X.columns
indices = np.argsort(importances)

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Importance')
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score

# Compute metrics
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² (Goodness of Fit): {r2:.2f}")

In [None]:
# Predict on the test data
y_pred = best_model.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R²)
r2 = r2_score(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

# Print the results
print(f"MAPE: {mape:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

In [None]:
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Finding the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Combine predictions with Year for trend analysis
df_test = pd.DataFrame(X_test, columns=X.columns)  # Recreate DataFrame for test data
df_test['Year'] = year_test.reset_index(drop=True)  # Add Year back to the test DataFrame
df_test['Flood_Prediction'] = y_pred

# Group by Year and calculate mean prediction
yearly_trend = df_test.groupby('Year')['Flood_Prediction'].mean().reset_index()

# Plot the bar chart of predicted flood occurrences over time
plt.figure(figsize=(10, 6))
sns.barplot(data=yearly_trend, x='Year', y='Flood_Prediction', color='skyblue')
plt.title('Trend of Predicted Flood Occurrences Over Time')
plt.xlabel('Year')
plt.ylabel('Predicted Flood Occurrences')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.grid(axis='y')
plt.show()

In [None]:
# Create a DataFrame for the count of predicted floods and non-floods
yearly_flood_counts = df_test.groupby('Year')['Flood_Prediction'].value_counts().unstack().fillna(0).reset_index()
yearly_flood_counts.columns = ['Year', 'No Flood', 'Flood']

# Plotting the stacked bar chart
plt.figure(figsize=(12, 7))
yearly_flood_counts.set_index('Year').plot(kind='bar', stacked=True, color=['lightgrey', 'skyblue'])
#plt.title('Stacked Bar Plot of Predicted Flood Occurrences vs. Non-Occurrences Over Time')
plt.title('Trend Analysis for Flood Predictions')
plt.xlabel('Year')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotating x-axis labels for better readability
plt.legend(title='Prediction')
plt.grid(axis='y')
plt.show()

In [None]:
# Create a pivot table for heatmap
heatmap_data = df_test.groupby(['Year', 'Flood_Prediction']).size().unstack().fillna(0)

# Plot heatmap
plt.figure(figsize=(12, 7))
sns.heatmap(heatmap_data, cmap='Blues', annot=True, cbar_kws={'label': 'Count'})
plt.title('Heatmap of Predicted Flood Occurrences and Non-Occurrences Over Time')
plt.xlabel('Flood Prediction')
plt.ylabel('Year')
plt.xticks(ticks=[0.5, 1.5], labels=['No Flood', 'Flood'])
plt.yticks(rotation=0)
plt.show()

In [None]:
# Create a DataFrame for the count of predicted floods and non-floods
yearly_flood_counts = df_test.groupby('Year')['Flood_Prediction'].value_counts().unstack().fillna(0).reset_index()
yearly_flood_counts.columns = ['Year', 'No Flood', 'Flood']

# Plot the area plot
plt.figure(figsize=(12, 7))
yearly_flood_counts.set_index('Year').plot(kind='area', stacked=True, alpha=0.5, color=['lightgrey', 'skyblue'])
plt.title('Area Plot of Predicted Flood Occurrences and Non-Occurrences Over Time')
plt.xlabel('Year')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.grid(True, axis='y')
plt.show()

In [None]:
# Prepare data for stacked line plot
yearly_flood_counts = df_test.groupby(['Year', 'Flood_Prediction']).size().unstack().fillna(0).cumsum().reset_index()

# Plot stacked line plot
plt.figure(figsize=(12, 7))
plt.plot(yearly_flood_counts['Year'], yearly_flood_counts[0], label='No Flood', color='lightgrey')
plt.plot(yearly_flood_counts['Year'], yearly_flood_counts[1], label='Flood', color='skyblue')
plt.fill_between(yearly_flood_counts['Year'], yearly_flood_counts[0], color='lightgrey', alpha=0.5)
plt.fill_between(yearly_flood_counts['Year'], yearly_flood_counts[1], color='skyblue', alpha=0.5)
plt.title('Stacked Line Plot of Predicted Flood Occurrences and Non-Occurrences Over Time')
plt.xlabel('Year')
plt.ylabel('Cumulative Count')
plt.legend()
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.grid(True)
plt.show()

In [None]:
# Assume y_true and y_pred are your true labels and predicted labels respectively
cm = confusion_matrix(y_test, y_pred)

# Extract True Positives (TP) and False Negatives (FN)
TP = cm[1, 1]
FN = cm[1, 0]

# Calculate Sensitivity
sensitivity = TP / (TP + FN)
print(f"Sensitivity: {sensitivity:.2f}")

In [None]:
from sklearn.metrics import roc_curve, auc

# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Compute Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred)

# Plot Precision-Recall curve
plt.figure(figsize=(10, 7))
plt.plot(recall, precision, color='blue', label='Precision-Recall curve')
plt.xlabel('Recall (Sensitivity)')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# Plot Sensitivity (TPR) vs. (1 - Specificity)
plt.figure(figsize=(10, 7))
plt.plot(thresholds, tpr, color='blue', label='Sensitivity')
plt.plot(thresholds, 1-fpr, color='red', label='Specificity')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Sensitivity and Specificity vs. Threshold')
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# Plotting
plt.figure(figsize=(10, 5))
plt.plot(y_test, label='Actual Values', marker='o')
plt.plot(y_pred, label='Predicted Values', marker='x')
plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()