In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

In [2]:
#importing mostly pre-processed data from the Accidents_ETL 
data = pd.read_csv('Accidents_Preprocessed1.csv')

In [3]:
data.dtypes

Severity                            int64
Side                                int64
City                               object
County                             object
State                              object
Temperature(F)                     object
Humidity(%)                        object
Pressure(in)                       object
Visibility(mi)                     object
Wind_Direction                     object
Wind_Speed(mph)                    object
Precipitation(in)                  object
Day/Night                          object
Year                                int64
Month                               int64
Day                                 int64
Hour                                int64
Weather_Condition_Wind              int64
Bump                                int64
Crossing                            int64
Give_Way                            int64
Junction                            int64
No_Exit                             int64
Railway                           

In [4]:
#Date/Time features were left as number in case we wanted to create subsets of data based on these condtions before they were converted to objects and encoded. Here they are finally converted to object in preparation for encoding. 
#Cities are also dropped to reduce the processing power required to handle an additional 11,000 columns of data. 

data['Year'] = data['Year'].astype(object)
data['Month'] = data['Month'].astype(object)
data['Day'] = data['Day'].astype(object)
data['Hour'] = data['Hour'].astype(object)
#data = data.drop(columns = ['City'], axis=1)
#data = data.drop(columns = ['County'], axis=1)

In [5]:
#creating the accidents df based on the imported and processed data.abs
accidents = data.copy()

In [6]:
#confirming the layout and size of the accidents df
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,2,0,"Greenville,SC","Greenville,SC",SC,70F:79F,50-59%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Charlotte,NC","Mecklenburg,NC",NC,70F:79F,60-69%,28-30,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
2,2,0,"Los Gatos,CA","Santa Clara,CA",CA,50F:59F,70-79%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"Carson City,NV","Douglas,NV",NV,50F:59F,10-19%,30-32,9-10,SW,...,0,0,0,0,0,0,0,0,0,0
4,3,0,"Fort Lauderdale,FL","Broward,FL",FL,80F:89F,80-89%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906376,2,1,"Houston,TX","Harris,TX",TX,80F:89F,60-69%,30-32,8-9,Variable,...,0,0,0,0,0,0,0,0,0,0
2906377,2,0,"Colton,CA","San Bernardino,CA",CA,40F:49F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
2906378,2,1,"Miami,FL","Miami-Dade,FL",FL,70F:79F,80-89%,28-30,9-10,NW,...,0,0,0,0,0,0,0,0,0,0
2906379,2,0,"Salt Lake City,UT","Salt Lake,UT",UT,20F:29F,80-89%,24-26,9-10,SE,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#optional palce to subselect based on state for training and testing specific to a state.

#data = data[data['State'] == 'CA']


In [7]:
#creating a sampled subset of the 2.9M rows of data for ease of testig the code and proving functionality of the ML model. 
accidents = accidents.sample(n = 50000, replace=True, random_state=1)
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
128037,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
491755,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0,0,0,0,0,0,0,0,0,1
2568076,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
491263,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
836489,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246255,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
2766330,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
1031261,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0
1523602,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#Saving the pre-encoded sample data set

# output_data_file = "sample_data_not_encoded.csv"
# accidents.to_csv(output_data_file, index=False)

In [9]:
#Encoding Obj columns using OneHotEncoder:

#Identifying the 'object' columns to encode

type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
type_cat


['City',
 'County',
 'State',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Day/Night',
 'Year',
 'Month',
 'Day',
 'Hour']

In [10]:
#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))



In [11]:
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(type_cat)
encode_df.head()



Unnamed: 0,"City_Abbeville,SC","City_Aberdeen,MD","City_Aberdeen,NC","City_Abingdon,MD","City_Abingdon,VA","City_Abington,PA","City_Absecon,NJ","City_Acampo,CA","City_Accokeek,MD","City_Acme,PA",...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
#resetting the index on the sample to match the indices for the sample and the enocoded df so that they are both numbered 1-250,000.
accidents2 = accidents.reset_index(drop=True)

In [13]:
#visualizing accdients2 to confirm 
accidents2

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0,0,0,0,0,0,0,0,0,1
2,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
4,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
49996,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
49997,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0
49998,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Merge one-hot encoded features and drop the originals
accidents = accidents2.merge(encode_df, left_index=True, right_index=True)
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
#Dropping the original columns after encoding
accidents = accidents.drop(type_cat,1)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,3,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,0,0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
#Saving the final sampled and encoded dataset that will be used in the ML model.
# output_data_file = "Accidents_100K_withCities_Preprocessed_Encoded.csv"
# accidents.to_csv(output_data_file, index=False)


Moving the encoded dataset into the ML model:

In [17]:
# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Define target vector
y = accidents["Severity"].values

In [19]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:

#Testing to see which learning rate would provide the best outcomes for the ML model:

# Create a classifier object
# learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# for learning_rate in learning_rates:
#     classifier = GradientBoostingClassifier(n_estimators=100,
#                                             learning_rate=learning_rate,
#                                             max_features=5,
#                                             max_depth=3,
#                                             random_state=0)

#     # Fit the model
#     classifier.fit(X_train_scaled, y_train)
#     print("Learning rate: ", learning_rate)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier.score(
#             X_train_scaled,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier.score(
#             X_test_scaled,
#             y_test)))
#     print()

In [21]:
#finding the ideal number of features

#Call and Fit the model 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit_transform(X_train, y_train)

#identify the number of features that have importance above the mean importance level.
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

#show the important features
print(selected_feat)




274
Index(['Side', 'Weather_Condition_Wind', 'Crossing', 'Give_Way', 'Junction',
       'Railway', 'Station', 'Stop', 'Traffic_Signal',
       'Weather_Condition_Cloudy',
       ...
       'Hour_21', 'Hour_22', 'Hour_23', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6',
       'Hour_7', 'Hour_8', 'Hour_9'],
      dtype='object', length=274)


In [22]:
calc_num = len(selected_feat)
manual_num = 100
feature_num = manual_num

In [23]:
#Run the model
GB_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
GB_classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(learning_rate=0.5, max_features=100, random_state=0)

In [24]:
pd.options.display.max_rows = 200
import pandas as pd
feature_importances = pd.DataFrame(GB_classifier.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
State_WA,0.142733
"County_King,WA",0.135037
Year_2020,0.060466
Side,0.058058
"County_Spokane,WA",0.029884
...,...
"County_Monongalia,WV",0.000000
"County_Mono,CA",0.000000
"County_Monmouth,NJ",0.000000
"County_Moniteau,MO",0.000000


In [25]:
# Make Prediction
GB_predictions = GB_classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": GB_predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,2,2
1,2,2
2,2,2
3,2,2
4,3,2
5,2,2
6,2,3
7,2,2
8,2,2
9,2,2


In [26]:
# Calculating the accuracy score
GB_acc_score = accuracy_score(y_test, GB_predictions)
print(f"Accuracy Score : {GB_acc_score}")

Accuracy Score : 0.73876


In [27]:
# Generate the confusion matrix
GB_cm = confusion_matrix(y_test, GB_predictions)
GB_cm

array([[   63,   196,    12,     0],
       [  101, 17175,   977,    63],
       [   24,  4182,  1157,    23],
       [   10,   822,   121,    74]])

In [28]:
# Print the imbalanced classification report
RFM_classreport = classification_report_imbalanced(y_test, GB_predictions)
print(classification_report_imbalanced(y_test, GB_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.32      0.23      0.99      0.27      0.48      0.21       271
          2       0.77      0.94      0.22      0.84      0.46      0.22     18316
          3       0.51      0.21      0.94      0.30      0.45      0.19      5386
          4       0.46      0.07      1.00      0.12      0.27      0.07      1027

avg / total       0.69      0.74      0.42      0.69      0.45      0.21     25000



Testing Resampling Methods To Maximize Accuracy

In [29]:
#Counting the distribution of target values to determine representation %'s. 
Counter(y)

Counter({3: 21536, 2: 73330, 1: 1003, 4: 4131})

Random Oversampling:

In [30]:
#Oversampling of lower represented variables with Random Oversampling

# implement random oversampling
ros = RandomOverSampler(random_state=1)
X_randomoversampled, y_randomoversampled = ros.fit_resample(X_train, y_train)

Counter(y_randomoversampled)

Counter({3: 55014, 2: 55014, 4: 55014, 1: 55014})

In [31]:
#Run the random oversampling model
ROS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
ROS_classifier.fit(X_randomoversampled, y_randomoversampled)

GradientBoostingClassifier(learning_rate=0.5, max_features=100, random_state=0)

In [32]:
# Make Prediction with random oversampling
ROS_predictions = ROS_classifier.predict(X_randomoversampled)
pd.DataFrame({"Prediction": ROS_predictions, "Actual": y_randomoversampled}).head(20)

Unnamed: 0,Prediction,Actual
0,3,3
1,2,2
2,2,2
3,3,3
4,3,2
5,3,2
6,2,2
7,2,2
8,2,2
9,4,2


In [33]:
# Calculating the accuracy score of random oversampling
ROS_acc_score = accuracy_score(y_randomoversampled, ROS_predictions)
print(f"Accuracy Score : {ROS_acc_score}")

Accuracy Score : 0.7091013196640855


In [34]:
# Generate the confusion matrix for random oversampling
cm = confusion_matrix(y_randomoversampled, ROS_predictions)
cm

array([[54491,   225,   224,    74],
       [ 3241, 31494, 13213,  7066],
       [ 3157,  9022, 34380,  8455],
       [ 3110,  6537,  9690, 35677]])

In [35]:
# Print the imbalanced classification report for random oversampling
ROS_classreport = classification_report_imbalanced(y_randomoversampled, ROS_predictions)
print(classification_report_imbalanced(y_randomoversampled, ROS_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.85      0.99      0.94      0.92      0.97      0.94     55014
          2       0.67      0.57      0.90      0.62      0.72      0.50     55014
          3       0.60      0.62      0.86      0.61      0.73      0.52     55014
          4       0.70      0.65      0.91      0.67      0.77      0.57     55014

avg / total       0.70      0.71      0.90      0.70      0.80      0.63    220056



SMOTE Oversampling:

In [36]:
#Resampling with SMOTE Oversampling:
X_SMOTEresampled, y_SMOTEresampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_SMOTEresampled)

Counter({3: 55014, 2: 55014, 4: 55014, 1: 55014})

In [37]:
#Run the SMOTE oversampling model
SMOTE_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTE_classifier.fit(X_SMOTEresampled, y_SMOTEresampled)

GradientBoostingClassifier(learning_rate=0.5, max_features=100, random_state=0)

In [38]:
# Make Prediction with SMOTE oversampling
SMOTE_predictions = SMOTE_classifier.predict(X_SMOTEresampled)
pd.DataFrame({"Prediction": SMOTE_predictions, "Actual": y_SMOTEresampled}).head(20)

Unnamed: 0,Prediction,Actual
0,3,3
1,2,2
2,2,2
3,2,3
4,3,2
5,2,2
6,2,2
7,2,2
8,2,2
9,2,2


In [39]:
# Calculating the accuracy score of SMOTE oversampling
SMOTE_acc_score = accuracy_score(y_SMOTEresampled, SMOTE_predictions)
print(f"Accuracy Score : {SMOTE_acc_score}")

Accuracy Score : 0.8317064747155269


In [40]:
# Generate the confusion matrix for SMOTE oversampling
SMOTE_cm = confusion_matrix(y_SMOTEresampled, SMOTE_predictions)
SMOTE_cm

array([[54413,   166,   158,   277],
       [  875, 47255,  6098,   786],
       [ 1344,  9471, 36874,  7325],
       [ 1165,  2396,  6973, 44480]])

In [41]:
# Print the imbalanced classification report for SMOTE oversampling
SMOTE_classreport = classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions)
print(classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.94      0.99      0.98      0.96      0.98      0.97     55014
          2       0.80      0.86      0.93      0.83      0.89      0.79     55014
          3       0.74      0.67      0.92      0.70      0.79      0.60     55014
          4       0.84      0.81      0.95      0.82      0.88      0.76     55014

avg / total       0.83      0.83      0.94      0.83      0.88      0.78    220056



Random Undersampling:

In [42]:
# Undersample the data using `RandomUnderSampler`
rus = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
Counter(y_undersampled)

Counter({1: 732, 2: 732, 3: 732, 4: 732})

In [43]:
#Run the Random Undersampling model
RUS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
RUS_classifier.fit(X_undersampled, y_undersampled)

GradientBoostingClassifier(learning_rate=0.5, max_features=100, random_state=0)

In [44]:
# Make Prediction with random undersampling
RUS_predictions = RUS_classifier.predict(X_undersampled)
pd.DataFrame({"Prediction": RUS_predictions, "Actual": y_undersampled}).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [45]:
# Calculating the accuracy score of random undersampling
RUS_acc_score = accuracy_score(y_undersampled, RUS_predictions)
print(f"Accuracy Score : {RUS_acc_score}")

Accuracy Score : 0.8411885245901639


In [46]:
# Generate the confusion matrix for SMOTE oversampling
RUS_cm = confusion_matrix(y_undersampled, RUS_predictions)
RUS_cm

array([[722,   4,   2,   4],
       [ 29, 567,  85,  51],
       [ 19,  68, 599,  46],
       [ 27,  53,  77, 575]])

In [47]:
# Print the imbalanced classification report for Random Undersampling
RUS_classreport = classification_report_imbalanced(y_undersampled, RUS_predictions)
print(classification_report_imbalanced(y_undersampled, RUS_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.91      0.99      0.97      0.94      0.98      0.95       732
          2       0.82      0.77      0.94      0.80      0.85      0.72       732
          3       0.79      0.82      0.93      0.80      0.87      0.75       732
          4       0.85      0.79      0.95      0.82      0.87      0.74       732

avg / total       0.84      0.84      0.95      0.84      0.89      0.79      2928



SMOTEENN Combination Oversampling/Undersampling:

In [48]:
# Use the SMOTEENN technique to perform combination of oversampling and undersampling on the data

# Count the resampled classes
smote_enn = SMOTEENN(random_state=0)
X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X, y)
Counter(y_SMOTEENNresampled)

In [None]:
#Run the model
SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

In [None]:
# Make Prediction
SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

In [None]:
# Calculating the accuracy score
SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
print(f"Accuracy Score : {SMOTEENN_acc_score}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
cm

In [None]:
# Print the imbalanced classification report for SMOTEENN resampling
SMOTEENN_classreport = classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions)
print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))

In [None]:
print(f'OVERVIEW:')
# print(f'State: {state_name}')
print('----------------------')
print(f'# of Rows: {row_count}')
print(f'selected features: {feature_num}')
print(f'---------------------')
print(f'Gradient Boosted RFM Accuracy: {GB_acc_score}')
print(f'Gradient Boosted RFM Class Report:')
print(f'{RFM_classreport}')
print(f'Random Over Sampling Accuracy: {ROS_acc_score}')
print(f'Random Over Sampling Class Report:')
print(f'{ROS_classreport}')
print(f'SMOTE Over Sampling Accuracy: {SMOTE_acc_score}')
print(f'SMOTE Over Sampling Class Report:')
print(f'{SMOTE_classreport}')
print(f'Random Undersampling Accuracy: {RUS_acc_score}')
print(f'Random Undersampling Class Report:')
print(f'{RUS_classreport}')
print(f'SMOTEEENN Resampling Accuracy: {SMOTEENN_acc_score}')
print(f'SMOTEEENN Class Report:')
print(f'{SMOTEENN_classreport}')