In [3]:
#Importing 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier

In [4]:
# Load the dataset
data = pd.read_csv('C:/Users/user/Desktop/MSc Folder/Graduate Applications/Flood Data/Flood_Data1.csv')

# Inspect the first few rows
print(data.head())

   YEAR  STATE  FLOOD OCCURENCE     RH  MAX_TEMP  MIN_TEMP  PRECIPITATION
0  1990  BENUE                0  77.38     35.99     14.08        1165.43
1  1990    FCT                0  74.44     38.61     13.43        1270.90
2  1990   KOGI                0  77.50     36.12     16.42         949.22
3  1990  KWARA                0  68.44     39.73     15.85         849.02
4  1990   NASS                0  74.12     38.76     14.87        1244.53


In [5]:
# Define the features (X) and target (y)
X = data.drop(columns=['FLOOD OCCURENCE'])  # All independent variables
y = data['FLOOD OCCURENCE']  # Dependent variable

# If 'State' is categorical, you need to encode it
X = pd.get_dummies(X, columns=['STATE'], drop_first=True)

# Check the processed data
print(X.head())

   YEAR     RH  MAX_TEMP  MIN_TEMP  PRECIPITATION  STATE_FCT  STATE_KOGI  \
0  1990  77.38     35.99     14.08        1165.43      False       False   
1  1990  74.44     38.61     13.43        1270.90       True       False   
2  1990  77.50     36.12     16.42         949.22      False        True   
3  1990  68.44     39.73     15.85         849.02      False       False   
4  1990  74.12     38.76     14.87        1244.53      False       False   

   STATE_KWARA  STATE_NASS  STATE_NIGER  STATE_PLATEAU  
0        False       False        False          False  
1        False       False        False          False  
2        False       False        False          False  
3         True       False        False          False  
4        False        True        False          False  


In [12]:
# Retain 'Year' for trend analysis
year_column = data['YEAR']

#Spliting Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

#Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameter grid for tuning
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Perform hyperparameter tuning using GridSearchCV
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', verbosity=0)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Train the model with the best parameters
best_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
best_model.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best parameters found: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}


In [9]:
# Evaluate the model
y_pred = best_model.predict(X_test)
test_accuracy = best_model.score(X_test, y_test)
print("Test accuracy: {:.2f}".format(test_accuracy))

Test accuracy: 0.06


In [10]:
# Compute metrics
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² (Goodness of Fit): {r2:.2f}")

NameError: name 'mean_absolute_percentage_error' is not defined

In [11]:
# Predict on the test data
y_pred = best_model.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R²)
r2 = r2_score(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

# Print the results
print(f"MAPE: {mape:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

NameError: name 'mean_absolute_percentage_error' is not defined

In [None]:
# Plot feature importances
importances = best_model.feature_importances_
features = X.columns
indices = np.argsort(importances)

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Importance')
plt.show()