In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler



In [2]:
from google.colab import drive
drive.mount('/content/drive/')

path = '/content/drive/MyDrive/ML_data_and_code/Accidents_Sample_Preprocessed_Encoded_Sampled.csv'
accidents = pd.read_csv(path)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
# X.head()

In [5]:
# Define target vector
y = accidents["Severity"].values

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
#finding the ideal number of features

#Call and Fit the model 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit_transform(X_train, y_train)

#identify the number of features that have importance above the mean importance level.
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

#show the important features
print(selected_feat)



273
Index(['Unnamed: 0', 'Side', 'Weather_Condition_Wind', 'Crossing', 'Junction',
       'Railway', 'Station', 'Stop', 'Traffic_Signal',
       'Weather_Condition_Cloudy',
       ...
       'Hour_21', 'Hour_22', 'Hour_23', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6',
       'Hour_7', 'Hour_8', 'Hour_9'],
      dtype='object', length=273)


In [8]:
feature_num = len(selected_feat)

In [None]:
# Use the SMOTEENN technique to perform combination of oversampling and undersampling on the data

# Count the resampled classes
smote_enn = SMOTEENN(random_state=0)
X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X, y)
Counter(y_SMOTEENNresampled)



In [None]:
#Run the model
SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

In [None]:
# Make Prediction
SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

In [None]:
# Calculating the accuracy score
SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
print(f"Accuracy Score : {SMOTEENN_acc_score}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
cm

In [None]:
# Print the imbalanced classification report for SMOTEENN resampling
print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))