In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler



In [4]:
#Pick whether you wante the data file prior to encoding or after based on whether you want to sample the data before running the model.

##------------------------------------------------##

#Import the Preprocessed and NOT Encoded file from GitHub

url = 'https://media.githubusercontent.com/media/smyoung88/DA_G2_Final_Project/main/Resources/Accident_Data_ETL_Not_Encoded.csv'
data = pd.read_csv(url)

##------------------------------------------------##

#import the preprocessed and encoded data:

#Local to Local
# data = pd.read_csv('Accident_Data_ETL_Not_Encoded.csv')

#G-Drive to Colabs
# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/MyDrive/ML_data_and_code/Accident_Data_ETL_Not_Encoded.csv'
# data = pd.read_csv(path)

##------------------------------------------------##

#import the preprocessed but NOT ENCODED data (for modifying prior to encoding) 

#Local to Local:
# data = pd.read_csv('Accident_Data_ETL_Not_Encoded.csv)

#G-Drive to Colabs
# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/MyDrive/ML_data_and_code/Accident_Data_ETL_Not_Encoded.csv'
# data = pd.read_csv(path)

In [5]:
# Managing Features

# Option to select a specific features and drop it
# dropped_feature = ''
# data = data.drop(columns = [dropped_feature], axis=1)

In [6]:
#Date/Time features were left as number in case we wanted to create subsets of data based on these condtions before they were converted 
#to objects and encoded. Here they are finally converted to object in preparation for encoding. 
data['Year'] = data['Year'].astype(object)
data['Month'] = data['Month'].astype(object)
data['Day'] = data['Day'].astype(object)
data['Hour'] = data['Hour'].astype(object)

#Setting accidents equal to data so that it can be modified moving forward
accidents = data.copy()

In [23]:
#optional palce to subselect based on state for training and testing specific to a state.

# State is set to null unless otherwise changed
State = " "

# accidents = accidents[accidents['State'] == State]
# State_Size = len(accidents.index)
# accidents

In [8]:
#creating a sampled subset of the 2.9M rows of data for ease of testig the code and proving functionality of the ML model. 

#Setting maximum of 100,000 entries in the sample.
# if len(accidents.index) > 100000:
#     sample_size = 100000
# else: 
#     sample_size = len(accidents.index)

#Setting a specific sample size
sample_size = 20000

#Calling for the sample to be created
accidents = accidents.sample(n = sample_size, replace=True, random_state=1)
accidents

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Day/Night,Year,Month,Day,Hour,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
128037,2,0,TX,80F:89F,40-49%,28-30,9-10,S,10-19,0-1,Day,2017,11,4,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
491755,3,0,CA,60F:69F,50-59%,28-30,9-10,Calm,0-9,0-1,Night,2018,11,4,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2568076,2,0,PA,30F:39F,60-69%,30-32,9-10,SE,0-9,0-1,Night,2020,12,20,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
491263,2,0,IL,30F:39F,80-89%,30-32,9-10,SW,0-9,0-1,Day,2018,2,23,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
836489,2,1,CA,50F:59F,80-89%,28-30,9-10,N,0-9,0-1,Day,2017,10,4,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047602,2,1,CA,60F:69F,10-19%,30-32,9-10,NW,0-9,0-1,Day,2019,10,30,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2111024,2,0,TX,90F:99F,50-59%,28-30,9-10,S,0-9,0-1,Day,2019,6,20,13,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1704524,2,1,MI,40F:49F,90-100%,30-32,8-9,Calm,0-9,0-1,Day,2017,6,8,6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1763031,3,0,CA,80F:89F,50-59%,28-30,9-10,Calm,0-9,0-1,Day,2018,8,4,16,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
#Encoding Obj columns using OneHotEncoder:

#Identifying the 'object' columns to encode
type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()

#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(type_cat)

#resetting the index on the sample to match the indices for the sample and the enocoded df so that they are both numbered 1-250,000.
accidents2 = accidents.reset_index(drop=True)

#Merge one-hot encoded features and drop the originals
accidents = accidents2.merge(encode_df, left_index=True, right_index=True)

#Dropping the original columns after encoding
accidents = accidents.drop(type_cat,1)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,State_AL,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
19998,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Moving the encoded dataset into the ML model:

In [10]:
# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,State_AL,State_AR,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Define target vectors
y = accidents["Severity"].values

In [12]:
# Train/Test Splitting and scaling the data

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Setting Model Parameters and then running the models. 

In [13]:
#finding the ideal number of features

#Call and Fit the model 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit_transform(X_train, y_train)

#identify the number of features that have importance above the mean importance level.
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

#show the important features
print(selected_feat)


109
Index(['Side', 'Weather_Condition_Cloudy', 'Weather_Condition_Fair',
       'Weather_Condition_Light_Rain', 'Weather_Condition_Mostly_Cloudy',
       'Weather_Condition_Partly_Cloudy', 'Crossing', 'Junction',
       'Traffic_Signal', 'State_CA',
       ...
       'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19',
       'Hour_6', 'Hour_7', 'Hour_8', 'Hour_9'],
      dtype='object', length=109)


In [14]:
# Setting the number of features to be equal to those features above the median importance or a maximum of 100 features.
if len(selected_feat) < 100:
    feature_num = len(selected_feat)
else:
    feature_num = 100
feature_num

# Selecting a specifc number of features. 
# feature_num = 200

100

Gradient Boosted Random Forest Model:

In [15]:
#Counting the distribution of target values to determine representation %'s. 
GB_Counts = Counter(y)
print(GB_Counts)
print('---------------------------')

#Run the model
GB_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
GB_classifier.fit(X_train_scaled, y_train)

# Calculating the imporance of features 
# pd.options.display.max_rows = 200
# feature_importances = pd.DataFrame(GB_classifier.feature_importances_,
#                                    index = X_train.columns,
#                                     columns=['importance']).sort_values('importance', ascending=False)
# print(feature_importances)

# Make Prediction
GB_predictions = GB_classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": GB_predictions, "Actual": y_test}).head(20)

# Calculating the accuracy score
GB_acc_score = accuracy_score(y_test, GB_predictions)
print(f"Accuracy Score : {GB_acc_score}")
print('---------------------------')

# Generate the confusion matrix
GB_cm = confusion_matrix(y_test, GB_predictions)
print(GB_cm)
print('---------------------------')

# Print the imbalanced classification report
RFM_classreport = classification_report_imbalanced(y_test, GB_predictions)
print(classification_report_imbalanced(y_test, GB_predictions))



Counter({2: 14775, 3: 4240, 4: 792, 1: 193})
---------------------------
Accuracy Score : 0.7278
---------------------------
[[  11   46    2    0]
 [  11 3431  215   32]
 [   4  850  190   15]
 [   0  158   28    7]]
---------------------------
                   pre       rec       spe        f1       geo       iba       sup

          1       0.42      0.19      1.00      0.26      0.43      0.17        59
          2       0.76      0.93      0.20      0.84      0.43      0.20      3689
          3       0.44      0.18      0.94      0.25      0.41      0.16      1059
          4       0.13      0.04      0.99      0.06      0.19      0.03       193

avg / total       0.67      0.73      0.39      0.68      0.41      0.18      5000



Testing Resampling Methods To Maximize Accuracy

Random Oversampling:

In [16]:
# Oversampling of lower represented variables with Random Oversampling

# implement random oversampling
ros = RandomOverSampler(random_state=1)
X_randomoversampled, y_randomoversampled = ros.fit_resample(X_train, y_train)

ROS_Counts = Counter(y_randomoversampled)
print(ROS_Counts)
print('---------------------------')

#Run the random oversampling model
ROS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
ROS_classifier.fit(X_randomoversampled, y_randomoversampled)

# Make Prediction with random oversampling
ROS_predictions = ROS_classifier.predict(X_randomoversampled)
pd.DataFrame({"Prediction": ROS_predictions, "Actual": y_randomoversampled}).head(20)

# Calculating the accuracy score of random oversampling
ROS_acc_score = accuracy_score(y_randomoversampled, ROS_predictions)
print(f"Accuracy Score : {ROS_acc_score}")
print('---------------------------')

# Generate the confusion matrix for random oversampling
cm = confusion_matrix(y_randomoversampled, ROS_predictions)
print(cm)
print('---------------------------')

# Print the imbalanced classification report for random oversampling
ROS_classreport = classification_report_imbalanced(y_randomoversampled, ROS_predictions)
print(classification_report_imbalanced(y_randomoversampled, ROS_predictions))



Counter({2: 11086, 3: 11086, 1: 11086, 4: 11086})
---------------------------




Accuracy Score : 0.8226592098141801
---------------------------
[[11086     0     0     0]
 [  273  7083  2663  1067]
 [  130  1521  8394  1041]
 [   18   438   713  9917]]
---------------------------
                   pre       rec       spe        f1       geo       iba       sup

          1       0.96      1.00      0.99      0.98      0.99      0.99     11086
          2       0.78      0.64      0.94      0.70      0.78      0.58     11086
          3       0.71      0.76      0.90      0.73      0.82      0.67     11086
          4       0.82      0.89      0.94      0.86      0.92      0.83     11086

avg / total       0.82      0.82      0.94      0.82      0.88      0.77     44344



SMOTE Oversampling:

In [17]:
# Oversampling of lower represented variables with SMOTE Oversampling

# SMOTE may not work with some smaller sample sizes and will pass by if the sample fails to meet the criteria.
SMOTE_counts = " "

try:
  #Resampling with SMOTE Oversampling:
  X_SMOTEresampled, y_SMOTEresampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
      X_train, y_train
  )
  SMOTE_Counts = Counter(y_SMOTEresampled)
  print(SMOTE_Counts)
  print('---------------------------')

  #Run the SMOTE oversampling model
  SMOTE_classifier = GradientBoostingClassifier(n_estimators=100,
                                          learning_rate=0.5,
                                          max_features=feature_num,
                                          max_depth=3,
                                          random_state=0)

  # Fit the model
  SMOTE_classifier.fit(X_SMOTEresampled, y_SMOTEresampled)

  # Make Prediction with SMOTE oversampling
  SMOTE_predictions = SMOTE_classifier.predict(X_SMOTEresampled)
  pd.DataFrame({"Prediction": SMOTE_predictions, "Actual": y_SMOTEresampled}).head(20)

  # Calculating the accuracy score of SMOTE oversampling
  SMOTE_acc_score = accuracy_score(y_SMOTEresampled, SMOTE_predictions)
  print(f"Accuracy Score : {SMOTE_acc_score}")
  print('---------------------------')

  # Generate the confusion matrix for SMOTE oversampling
  SMOTE_cm = confusion_matrix(y_SMOTEresampled, SMOTE_predictions)
  print(SMOTE_cm)
  print('---------------------------')

  # Print the imbalanced classification report for SMOTE oversampling
  SMOTE_classreport = classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions)
  print(classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions))
except:
  pass




Counter({2: 11086, 3: 11086, 1: 11086, 4: 11086})
---------------------------
Accuracy Score : 0.9113747068374527
---------------------------
[[11050    34     2     0]
 [   21 10311   718    36]
 [   24  1868  8624   570]
 [    2   411   244 10429]]
---------------------------
                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      1.00      1.00      1.00      1.00      1.00     11086
          2       0.82      0.93      0.93      0.87      0.93      0.87     11086
          3       0.90      0.78      0.97      0.83      0.87      0.74     11086
          4       0.95      0.94      0.98      0.94      0.96      0.92     11086

avg / total       0.91      0.91      0.97      0.91      0.94      0.88     44344



Random Undersampling:

In [18]:
# Undersample the data using `RandomUnderSampler`
rus = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
RUS_Counts = Counter(y_undersampled)
print(RUS_Counts)
print('---------------------------')

#Run the Random Undersampling model
RUS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
RUS_classifier.fit(X_undersampled, y_undersampled)

# Make Prediction with random undersampling
RUS_predictions = RUS_classifier.predict(X_undersampled)
pd.DataFrame({"Prediction": RUS_predictions, "Actual": y_undersampled}).head(20)

# Calculating the accuracy score of random undersampling
RUS_acc_score = accuracy_score(y_undersampled, RUS_predictions)
print(f"Accuracy Score : {RUS_acc_score}")
print('---------------------------')

# Generate the confusion matrix for SMOTE oversampling
RUS_cm = confusion_matrix(y_undersampled, RUS_predictions)
print(RUS_cm)
print('---------------------------')

# Print the imbalanced classification report for Random Undersampling
RUS_classreport = classification_report_imbalanced(y_undersampled, RUS_predictions)
print(classification_report_imbalanced(y_undersampled, RUS_predictions))

Counter({1: 134, 2: 134, 3: 134, 4: 134})
---------------------------




Accuracy Score : 1.0
---------------------------
[[134   0   0   0]
 [  0 134   0   0]
 [  0   0 134   0]
 [  0   0   0 134]]
---------------------------
                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      1.00      1.00      1.00      1.00      1.00       134
          2       1.00      1.00      1.00      1.00      1.00      1.00       134
          3       1.00      1.00      1.00      1.00      1.00      1.00       134
          4       1.00      1.00      1.00      1.00      1.00      1.00       134

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       536



SMOTEENN Combination Oversampling/Undersampling:

In [19]:
# Use the SMOTEENN technique to perform combination of oversampling and undersampling on the data
SMOTEEENN_Counts = " "

# SMOTENN may not work with some smaller sample sizes and will pass by if the sample fails to meet the criteria.
try: 
  # Count the resampled classes
  smote_enn = SMOTEENN(random_state=0)
  X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X, y)
  SMOTEENN_Counts = Counter(y_SMOTEENNresampled)
  print(SMOTEENN_Counts)
  print('---------------------------')

  #Run the model
  SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                          learning_rate=0.5,
                                          max_features=feature_num,
                                          max_depth=3,
                                          random_state=0)

  # Fit the model
  SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

  # Make Prediction
  SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
  pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

  # Calculating the accuracy score
  SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
  print(f"Accuracy Score : {SMOTEENN_acc_score}")
  print('---------------------------')

  # Generate the confusion matrix
  cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
  print(cm)
  print('---------------------------')

  # Print the imbalanced classification report for SMOTEENN resampling
  SMOTEENN_classreport = classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions)
  print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))
except:
  pass



Counter({1: 14775, 4: 14772, 3: 13647, 2: 2717})
---------------------------
Accuracy Score : 0.9497723857027728
---------------------------
[[14766     1     7     1]
 [   21  2301   359    36]
 [   70   185 12548   844]
 [   27    90   665 13990]]
---------------------------
                   pre       rec       spe        f1       geo       iba       sup

          1       0.99      1.00      1.00      1.00      1.00      1.00     14775
          2       0.89      0.85      0.99      0.87      0.92      0.83      2717
          3       0.92      0.92      0.97      0.92      0.94      0.89     13647
          4       0.94      0.95      0.97      0.94      0.96      0.92     14772

avg / total       0.95      0.95      0.98      0.95      0.96      0.93     45911



Output Report:

In [26]:
pd.options.display.max_rows = None
print(f'OVERVIEW:')
if State == " ":
  print(f'Lower 48 States')
elif State != " ":
  print(f'State: {State}')
else:
  pass
print('----------------------')
if State == " ":
  print(f'Sample Size: {sample_size}')
  print(f'Selected Features: {feature_num}')
else:
    print(f'State Total Entries: {State_Size}')
print(f'---------------------')
print(f'Graadient Boosted Counts: {GB_Counts}')
print(f'Gradient Boosted RFM Accuracy: {GB_acc_score}')
print(f'Gradient Boosted RFM Class Report:')
print(f'{RFM_classreport}')
print(f'---------------------')
print(f'Random Over Sampling Counts: {ROS_Counts}')
print(f'Random Over Sampling Accuracy: {ROS_acc_score}')
print(f'Random Over Sampling Class Report:')
print(f'{ROS_classreport}')
print(f'---------------------')
if SMOTE_Counts == " ":
  print(f'SMOTE Resampling could not be performed due to insufficient entries')
else:
  print(f'SMOTE Counts: {SMOTE_Counts}')
  print(f'SMOTE Over Sampling Accuracy: {SMOTE_acc_score}')
  print(f'SMOTE Over Sampling Class Report:')
  print(f'{SMOTE_classreport}')
print(f'---------------------')
print(f'Random Under Sampling Counts: {RUS_Counts}')
print(f'Random Undersampling Accuracy: {RUS_acc_score}')
print(f'Random Undersampling Class Report:')
print(f'{RUS_classreport}')
print(f'---------------------')
if SMOTEENN_Counts == " ":
  print(f'SMOTEENN Resampling could not be performed due to insufficient entries')
else:
  print(f'SMOTEENN Counts: {SMOTEENN_Counts}')
  print(f'SMOTEENN Resampling Accuracy: {SMOTEENN_acc_score}')
  print(f'SMOTEENN Class Report:')
  print(f'{SMOTEENN_classreport}')

OVERVIEW:
Lower 48 States
----------------------
Sample Size: 20000
Selected Features: 100
---------------------
Graadient Boosted Counts: Counter({2: 14775, 3: 4240, 4: 792, 1: 193})
Gradient Boosted RFM Accuracy: 0.7278
Gradient Boosted RFM Class Report:
                   pre       rec       spe        f1       geo       iba       sup

          1       0.42      0.19      1.00      0.26      0.43      0.17        59
          2       0.76      0.93      0.20      0.84      0.43      0.20      3689
          3       0.44      0.18      0.94      0.25      0.41      0.16      1059
          4       0.13      0.04      0.99      0.06      0.19      0.03       193

avg / total       0.67      0.73      0.39      0.68      0.41      0.18      5000

---------------------
Random Over Sampling Counts: Counter({2: 11086, 3: 11086, 1: 11086, 4: 11086})
Random Over Sampling Accuracy: 0.8226592098141801
Random Over Sampling Class Report:
                   pre       rec       spe        f1    