In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import datetime as dt
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from imblearn.metrics import classification_report_imbalanced
from bioinfokit import visuz

In [2]:
# Import our input dataset
crash_df = pd.read_csv('Crash&RoadwayData.csv')
crash_df

Unnamed: 0,Crash_ID,Cmv_Involv_Fl,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Harm_Evnt_ID,...,HY_1,HY_2,HY_3,HY_4,Corridor_ID,Road_ID,Average_AADT,RLD,PED,INT
0,14839123,N,01/01/2016,12:23 AM,80,11,4,1,20,10,...,43413.0,43788.0,42315.0,38748.0,28.0,1188.0,42837.2,1,0,0
1,14840287,N,01/01/2016,07:40 AM,80,11,1,1,11,7,...,43413.0,43788.0,42315.0,38748.0,28.0,1192.0,42837.2,1,0,0
2,14840860,N,01/03/2016,06:48 PM,80,11,4,1,1,6,...,43413.0,43788.0,42315.0,38748.0,28.0,1194.0,42837.2,0,0,0
3,14850383,N,01/07/2016,09:05 AM,80,5,1,1,20,2,...,31038.0,30351.0,28878.0,25311.0,28.0,1207.0,29863.8,0,0,0
4,14863190,N,01/12/2016,04:01 PM,65,11,1,1,5,2,...,39424.0,41579.0,39927.0,36360.0,28.0,1180.0,39841.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56175,17293867,N,09/06/2019,11:43 AM,45,11,1,1,8,2,...,10460.0,10329.0,9326.0,7479.0,14.0,489.0,9608.0,0,0,1
56176,17302636,N,09/22/2019,12:40 PM,45,11,1,1,11,2,...,10460.0,10329.0,9326.0,7479.0,14.0,489.0,9608.0,0,0,1
56177,17318171,N,09/30/2019,10:45 AM,45,2,1,2,17,2,...,10460.0,10329.0,9326.0,7479.0,14.0,492.0,9608.0,0,0,1
56178,17361764,N,10/24/2019,02:07 PM,45,12,1,1,20,2,...,10460.0,10329.0,9326.0,7479.0,14.0,493.0,9608.0,0,0,1


In [3]:
crash_df.columns.to_list()

['Crash_ID',
 'Cmv_Involv_Fl',
 'Crash_Date',
 'Crash_Time',
 'Crash_Speed_Limit',
 'Wthr_Cond_ID',
 'Light_Cond_ID',
 'Surf_Cond_ID',
 'Traffic_Cntl_ID',
 'Harm_Evnt_ID',
 'Intrsct_Relat_ID',
 'FHE_Collsn_ID',
 'Road_Relat_ID',
 'Cnty_ID',
 'City_ID',
 'Latitude',
 'Longitude',
 'Hwy_Sys',
 'Hwy_Nbr',
 'Dfo',
 'Street_Name',
 'Onsys_Fl',
 'Rural_Fl',
 'Crash_Sev_ID',
 'Day_of_Week',
 'Shldr_Width_Left',
 'Shldr_Width_Right',
 'Median_Width',
 'Nbr_Of_Lane',
 'Func_Sys_ID',
 'Adt_Curnt_Amt',
 'Adt_Curnt_Year',
 'Trk_Aadt_Pct',
 'Curve_Type_ID',
 'Curve_Lngth',
 'Cd_Degr',
 'Sus_Serious_Injry_Cnt',
 'Nonincap_Injry_Cnt',
 'Poss_Injry_Cnt',
 'Non_Injry_Cnt',
 'Unkn_Injry_Cnt',
 'Tot_Injry_Cnt',
 'Death_Cnt',
 'Year',
 'REC',
 'RIA_RTE_ID',
 'FRM_DFO',
 'TO_DFO',
 'HWY',
 'HSYS',
 'HNUM',
 'DI',
 'CO',
 'CITY',
 'RU',
 'F_SYSTEM',
 'SPD_MAX',
 'HP_MED_W',
 'NUM_LANES',
 'SUR_W',
 'ADT_YEAR',
 'ADT_CUR',
 'ADT_ADJ',
 'TRK_AADT_PCT',
 'AADT_TRUCKS',
 'LANE_WIDTH',
 'LEN_SEC',
 'LN_MILES',
 

In [4]:
# Abridge our crash data set
# Although these variables may help predict, they were dropped due to missing data: Curve_Lngth, Cd_Degr, Curve_Type_ID
crash_df2 = crash_df.drop(crash_df.columns.difference([
'Crash_Speed_Limit',
'Crash_Time',
'Crash_Date',
'Wthr_Cond_ID',
'Light_Cond_ID',
'Surf_Cond_ID',
'Traffic_Cntl_ID',
'Rural_Fl',
'Crash_Sev_ID',
'Day_of_Week',
'Shldr_Width_Left',
'Shldr_Width_Right',
'Median_Width',
'Nbr_Of_Lane',
'Trk_Aadt_Pct',
'Average_AADT',      
'FHE_Collsn_ID',
'Cmv_Involv_Fl',
'PED',
'RLD',
'INT',
'DVMT',
'DTRKVMT',
'LANE_WIDTH',
'SPD_MAX',
'Curve_Lngth',
'Func_Sys_ID',
'Harm_Evnt_ID',
'Road_Relat_ID']), axis=1)
# crash_df2.head()
crash_df2.dtypes
# frequency = crash_df2['Crash_Speed_Limit'].value_counts()
# print(frequency)

Cmv_Involv_Fl         object
Crash_Date            object
Crash_Time            object
Crash_Speed_Limit      int64
Wthr_Cond_ID           int64
Light_Cond_ID          int64
Surf_Cond_ID           int64
Traffic_Cntl_ID        int64
Harm_Evnt_ID           int64
FHE_Collsn_ID          int64
Road_Relat_ID        float64
Rural_Fl              object
Crash_Sev_ID           int64
Day_of_Week           object
Shldr_Width_Left     float64
Shldr_Width_Right    float64
Median_Width         float64
Nbr_Of_Lane          float64
Func_Sys_ID          float64
Trk_Aadt_Pct         float64
Curve_Lngth          float64
SPD_MAX              float64
LANE_WIDTH           float64
DVMT                 float64
DTRKVMT              float64
Average_AADT         float64
RLD                    int64
PED                    int64
INT                    int64
dtype: object

In [5]:
crash_df2.isna().sum()

Cmv_Involv_Fl            0
Crash_Date               0
Crash_Time               0
Crash_Speed_Limit        0
Wthr_Cond_ID             0
Light_Cond_ID            0
Surf_Cond_ID             0
Traffic_Cntl_ID          0
Harm_Evnt_ID             0
FHE_Collsn_ID            0
Road_Relat_ID            0
Rural_Fl                 0
Crash_Sev_ID             0
Day_of_Week              0
Shldr_Width_Left         0
Shldr_Width_Right        0
Median_Width             0
Nbr_Of_Lane              0
Func_Sys_ID              0
Trk_Aadt_Pct             0
Curve_Lngth          40618
SPD_MAX                  0
LANE_WIDTH               0
DVMT                     0
DTRKVMT                  0
Average_AADT             0
RLD                      0
PED                      0
INT                      0
dtype: int64

In [6]:
crash_df2['Curve_Lngth'] = crash_df2['Curve_Lngth'].replace(np.nan, 0)


In [7]:
crash_df2.isna().sum()

Cmv_Involv_Fl        0
Crash_Date           0
Crash_Time           0
Crash_Speed_Limit    0
Wthr_Cond_ID         0
Light_Cond_ID        0
Surf_Cond_ID         0
Traffic_Cntl_ID      0
Harm_Evnt_ID         0
FHE_Collsn_ID        0
Road_Relat_ID        0
Rural_Fl             0
Crash_Sev_ID         0
Day_of_Week          0
Shldr_Width_Left     0
Shldr_Width_Right    0
Median_Width         0
Nbr_Of_Lane          0
Func_Sys_ID          0
Trk_Aadt_Pct         0
Curve_Lngth          0
SPD_MAX              0
LANE_WIDTH           0
DVMT                 0
DTRKVMT              0
Average_AADT         0
RLD                  0
PED                  0
INT                  0
dtype: int64

In [8]:
# Replace unreasonable or missing data. Follow up. also change to function. 
crash_df3 = crash_df2.copy() 
crash_df3.loc[crash_df3['Crash_Speed_Limit'] <= 14,'Crash_Speed_Limit'] = np.nan
crash_df3.loc[crash_df3['Wthr_Cond_ID'] < 1,'Wthr_Cond_ID'] = np.nan
crash_df3.loc[crash_df3['Light_Cond_ID'] < 1,'Light_Cond_ID'] = np.nan
crash_df3.loc[crash_df3['Surf_Cond_ID'] < 1,'Surf_Cond_ID'] = np.nan
# crash_df3['Hwy_Nbr'] = crash_df3['Hwy_Nbr'].astype(object)
# frequency = crash_df3['Surf_Cond_ID'].value_counts()
# print(frequency)
# print(crash_df3['Crash_Speed_Limit'])
crash_df3.info()
# crash_df3.head()
# print(crash_df3['Hwy_Nbr'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56180 entries, 0 to 56179
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cmv_Involv_Fl      56180 non-null  object 
 1   Crash_Date         56180 non-null  object 
 2   Crash_Time         56180 non-null  object 
 3   Crash_Speed_Limit  44535 non-null  float64
 4   Wthr_Cond_ID       56051 non-null  float64
 5   Light_Cond_ID      56081 non-null  float64
 6   Surf_Cond_ID       56065 non-null  float64
 7   Traffic_Cntl_ID    56180 non-null  int64  
 8   Harm_Evnt_ID       56180 non-null  int64  
 9   FHE_Collsn_ID      56180 non-null  int64  
 10  Road_Relat_ID      56180 non-null  float64
 11  Rural_Fl           56180 non-null  object 
 12  Crash_Sev_ID       56180 non-null  int64  
 13  Day_of_Week        56180 non-null  object 
 14  Shldr_Width_Left   56180 non-null  float64
 15  Shldr_Width_Right  56180 non-null  float64
 16  Median_Width       561

In [9]:
# We'll drop NaN for now to see if the model works with the remaining rows. 
crash_df3 = crash_df3.dropna()

In [10]:
crash_df3['Crash_Date'] = pd.to_datetime(crash_df3['Crash_Date'])
crash_df3['Crash_Date'] = crash_df3['Crash_Date'].apply(lambda x: x.toordinal())
crash_df3['Crash_Time'] = pd.to_datetime(crash_df3['Crash_Time']).dt.hour
crash_df3

Unnamed: 0,Cmv_Involv_Fl,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Harm_Evnt_ID,FHE_Collsn_ID,...,Trk_Aadt_Pct,Curve_Lngth,SPD_MAX,LANE_WIDTH,DVMT,DTRKVMT,Average_AADT,RLD,PED,INT
0,N,735964,0,80.0,11.0,4.0,1.0,20,10,1,...,17.9,1686.0,80.0,12.0,107916.700,22407.250,42837.2,1,0,0
1,N,735964,7,80.0,11.0,1.0,1.0,11,7,1,...,17.9,0.0,80.0,12.0,121325.924,25191.470,42837.2,1,0,0
2,N,735966,18,80.0,11.0,4.0,1.0,1,6,1,...,17.5,399.0,80.0,12.0,55703.386,11565.955,42837.2,0,0,0
3,N,735970,9,80.0,5.0,1.0,1.0,20,2,21,...,20.5,0.0,80.0,12.0,3340.359,796.257,29863.8,0,0,0
4,N,735975,16,65.0,11.0,1.0,1.0,5,2,10,...,17.9,0.0,80.0,12.0,1508.940,133.344,39841.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56174,N,737310,1,45.0,11.0,3.0,1.0,20,10,1,...,6.4,0.0,45.0,11.0,5505.042,349.928,9608.0,1,0,0
56175,N,737308,11,45.0,11.0,1.0,1.0,8,2,10,...,6.4,0.0,45.0,11.0,5505.042,349.928,9608.0,0,0,1
56176,N,737324,12,45.0,11.0,1.0,1.0,11,2,13,...,6.4,0.0,45.0,11.0,5505.042,349.928,9608.0,0,0,1
56177,N,737332,10,45.0,2.0,1.0,2.0,17,2,14,...,6.4,0.0,45.0,12.0,1243.074,79.016,9608.0,0,0,1


In [11]:
days_num = {
   "SUN": 1,
   "MON": 2,
   "TUE": 3,
   "WED": 4,
   "THU": 5,
   "FRI": 6,
   "SAT": 7,
}

In [12]:
crash_df3["Day_of_Week"] = crash_df3["Day_of_Week"].apply(lambda x: days_num[x])

In [13]:
le = LabelEncoder()
crash_df3 = crash_df3.copy()
# crash_df3['Hwy_Nbr'] = le.fit_transform(crash_df3['Hwy_Nbr'])
# crash_df3.head()
# print(crash_df3['Hwy_Nbr'])
crash_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44418 entries, 0 to 56178
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cmv_Involv_Fl      44418 non-null  object 
 1   Crash_Date         44418 non-null  int64  
 2   Crash_Time         44418 non-null  int64  
 3   Crash_Speed_Limit  44418 non-null  float64
 4   Wthr_Cond_ID       44418 non-null  float64
 5   Light_Cond_ID      44418 non-null  float64
 6   Surf_Cond_ID       44418 non-null  float64
 7   Traffic_Cntl_ID    44418 non-null  int64  
 8   Harm_Evnt_ID       44418 non-null  int64  
 9   FHE_Collsn_ID      44418 non-null  int64  
 10  Road_Relat_ID      44418 non-null  float64
 11  Rural_Fl           44418 non-null  object 
 12  Crash_Sev_ID       44418 non-null  int64  
 13  Day_of_Week        44418 non-null  int64  
 14  Shldr_Width_Left   44418 non-null  float64
 15  Shldr_Width_Right  44418 non-null  float64
 16  Median_Width       444

In [14]:
crash_df4 = pd.get_dummies(crash_df3, columns=["Rural_Fl", "Cmv_Involv_Fl"])
crash_df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44418 entries, 0 to 56178
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Crash_Date         44418 non-null  int64  
 1   Crash_Time         44418 non-null  int64  
 2   Crash_Speed_Limit  44418 non-null  float64
 3   Wthr_Cond_ID       44418 non-null  float64
 4   Light_Cond_ID      44418 non-null  float64
 5   Surf_Cond_ID       44418 non-null  float64
 6   Traffic_Cntl_ID    44418 non-null  int64  
 7   Harm_Evnt_ID       44418 non-null  int64  
 8   FHE_Collsn_ID      44418 non-null  int64  
 9   Road_Relat_ID      44418 non-null  float64
 10  Crash_Sev_ID       44418 non-null  int64  
 11  Day_of_Week        44418 non-null  int64  
 12  Shldr_Width_Left   44418 non-null  float64
 13  Shldr_Width_Right  44418 non-null  float64
 14  Median_Width       44418 non-null  float64
 15  Nbr_Of_Lane        44418 non-null  float64
 16  Func_Sys_ID        444

In [15]:
crash_df4.head()

Unnamed: 0,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Harm_Evnt_ID,FHE_Collsn_ID,Road_Relat_ID,...,DVMT,DTRKVMT,Average_AADT,RLD,PED,INT,Rural_Fl_N,Rural_Fl_Y,Cmv_Involv_Fl_N,Cmv_Involv_Fl_Y
0,735964,0,80.0,11.0,4.0,1.0,20,10,1,2.0,...,107916.7,22407.25,42837.2,1,0,0,0,1,1,0
1,735964,7,80.0,11.0,1.0,1.0,11,7,1,2.0,...,121325.924,25191.47,42837.2,1,0,0,0,1,1,0
2,735966,18,80.0,11.0,4.0,1.0,1,6,1,1.0,...,55703.386,11565.955,42837.2,0,0,0,0,1,1,0
3,735970,9,80.0,5.0,1.0,1.0,20,2,21,1.0,...,3340.359,796.257,29863.8,0,0,0,1,0,1,0
4,735975,16,65.0,11.0,1.0,1.0,5,2,10,1.0,...,1508.94,133.344,39841.0,0,0,1,0,1,1,0


In [16]:
options = [1,2,3,4,5] 
  
# selecting rows based on condition 
crash_df4 = crash_df4[crash_df4['Crash_Sev_ID'].isin(options)] 
crash_df4['Crash_Sev_ID'].value_counts(sort=False)

1     1377
2     7487
3     8943
4      382
5    25337
Name: Crash_Sev_ID, dtype: int64

In [17]:
# Classified variables in two groups, severe and non-severe. Based on subject matter knowledge. 
#1 being incapacitating injuries and 0 being non-incapacitating
# Severity levels of 1,2, and 4 were categorized as being incapacitating (1) and 
#severity levels of 3 & were categorized as non-incapacitating (2).
def sev_groups(series):
    if series == 1 :
        return 1
    elif series == 4 :
        return 1
    elif series == 2 :
        return 1
    elif series == 3 :
        return 0
    elif series == 5 :
        return 0    

crash_df4['Crash_Sev_ID_Bin'] = crash_df4['Crash_Sev_ID'].apply(sev_groups)
crash_df4['Crash_Sev_ID_Bin'].value_counts(sort=False)

0    34280
1     9246
Name: Crash_Sev_ID_Bin, dtype: int64

In [18]:
# With a logistic regression model, there is no preprocessing or scaling required for the data. 
y = crash_df4.Crash_Sev_ID_Bin
# X = crash_df.keep(['Crash_ID','Crash_Fatal_Fl','Cmv_Involv_Fl','Schl_Bus_Fl','Rr_Relat_Fl','Medical_Advisory_Fl'], axis=1)
X = crash_df4.drop(columns=["Crash_Sev_ID_Bin", "Crash_Sev_ID"])
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43526 entries, 0 to 56178
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Crash_Date         43526 non-null  int64  
 1   Crash_Time         43526 non-null  int64  
 2   Crash_Speed_Limit  43526 non-null  float64
 3   Wthr_Cond_ID       43526 non-null  float64
 4   Light_Cond_ID      43526 non-null  float64
 5   Surf_Cond_ID       43526 non-null  float64
 6   Traffic_Cntl_ID    43526 non-null  int64  
 7   Harm_Evnt_ID       43526 non-null  int64  
 8   FHE_Collsn_ID      43526 non-null  int64  
 9   Road_Relat_ID      43526 non-null  float64
 10  Day_of_Week        43526 non-null  int64  
 11  Shldr_Width_Left   43526 non-null  float64
 12  Shldr_Width_Right  43526 non-null  float64
 13  Median_Width       43526 non-null  float64
 14  Nbr_Of_Lane        43526 non-null  float64
 15  Func_Sys_ID        43526 non-null  float64
 16  Trk_Aadt_Pct       435

In [19]:
#corr_matrix=X.corr(method='spearman')
corr_mat= crash_df4.corr(method= 'spearman')

In [21]:
# Export the correlation data set.
tmp=corr_mat.to_csv("C:\Downloads\corr_matrix2.csv", header=True)


# Random Trees Training

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42, stratify=y)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train= X_scaler.transform(X_train)
X_test= X_scaler.transform(X_test)

In [24]:
np.unique(y_test, return_counts = True)

(array([0, 1]), array([8570, 2312]))

# Random Forest Classifier

In [25]:
# Resample the training data with the RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report

In [26]:
# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators= 128, random_state= 42, n_jobs= -1, max_depth= 8,  ) 


In [27]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [28]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [29]:
# Calculating the accuracy score

y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.6496967469215218

In [30]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5928,2642
Actual 1,1170,1142


In [31]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.84      0.69      0.76      8570
           1       0.30      0.49      0.37      2312

    accuracy                           0.65     10882
   macro avg       0.57      0.59      0.57     10882
weighted avg       0.72      0.65      0.68     10882



In [32]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
importances


array([0.06362173, 0.05280884, 0.03044162, 0.02800135, 0.02199969,
       0.02484804, 0.03869815, 0.11158367, 0.10888584, 0.01527571,
       0.02533972, 0.0220545 , 0.02115568, 0.02492194, 0.01209122,
       0.01876467, 0.04158151, 0.02233413, 0.02477879, 0.01635027,
       0.05205235, 0.05158133, 0.0437835 , 0.03263906, 0.05682916,
       0.01147384, 0.00833363, 0.00798619, 0.00508534, 0.00469854])

In [33]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.11158367169401191, 'Harm_Evnt_ID'),
 (0.10888583737160779, 'FHE_Collsn_ID'),
 (0.06362173284550335, 'Crash_Date'),
 (0.05682915579708347, 'PED'),
 (0.05280884188182322, 'Crash_Time'),
 (0.05205235052241965, 'DVMT'),
 (0.0515813252256083, 'DTRKVMT'),
 (0.043783499799249344, 'Average_AADT'),
 (0.041581511093756296, 'Trk_Aadt_Pct'),
 (0.03869815132926837, 'Traffic_Cntl_ID'),
 (0.03263905715280849, 'RLD'),
 (0.0304416152868406, 'Crash_Speed_Limit'),
 (0.028001348854592073, 'Wthr_Cond_ID'),
 (0.02533972202089534, 'Day_of_Week'),
 (0.02492194381986569, 'Median_Width'),
 (0.024848041070889417, 'Surf_Cond_ID'),
 (0.02477878987706085, 'SPD_MAX'),
 (0.022334126135986763, 'Curve_Lngth'),
 (0.022054500380614444, 'Shldr_Width_Left'),
 (0.021999687647489777, 'Light_Cond_ID'),
 (0.021155675993880043, 'Shldr_Width_Right'),
 (0.018764674322999157, 'Func_Sys_ID'),
 (0.016350272693754563, 'LANE_WIDTH'),
 (0.015275711916765028, 'Road_Relat_ID'),
 (0.012091219750605172, 'Nbr_Of_Lane'),
 (0.011473835683

# Easy Ensemble AdaBoost Classifier

In [34]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
clf=  EasyEnsembleClassifier(n_estimators= 128, random_state=1, n_jobs= -1 )
clf.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=128, n_jobs=-1, random_state=1)

In [35]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = clf.predict(X_test)

balanced_accuracy_score(y_test, predictions)

0.5944173870385548

In [36]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5928,2642
Actual 1,1170,1142


In [37]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.65      0.54      0.73      0.59      0.35      8570
          1       0.29      0.54      0.65      0.38      0.59      0.35      2312

avg / total       0.72      0.63      0.56      0.66      0.59      0.35     10882



# Combination (Over and Under) Sampling

In [38]:
from imblearn.combine import SMOTEENN
from collections import Counter
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 12099, 1: 15355})

In [39]:
# Train the Logistic Regression model using the resampled data
from imblearn.ensemble import EasyEnsembleClassifier 
clf= EasyEnsembleClassifier(n_estimators=128, random_state=42, n_jobs= -1   )
clf.fit(X_resampled, y_resampled)

EasyEnsembleClassifier(n_estimators=128, n_jobs=-1, random_state=42)

In [40]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49977243179514924

In [41]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 326, 8244],
       [  89, 2223]])

In [42]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.04      0.96      0.07      0.19      0.03      8570
          1       0.21      0.96      0.04      0.35      0.19      0.04      2312

avg / total       0.66      0.23      0.77      0.13      0.19      0.03     10882



#  Naive Random Oversampling

In [43]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from collections import Counter
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [44]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 25710, 1: 25710})

In [45]:
# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators= 64, random_state= 1, n_jobs= -1, max_depth= 6 )
rf_model.fit(X_resampled, y_resampled)

BalancedRandomForestClassifier(max_depth=6, n_estimators=64, n_jobs=-1,
                               random_state=1)

In [46]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5819208189830947

In [47]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[6171, 2399],
       [1286, 1026]])

In [48]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.72      0.44      0.77      0.57      0.33      8570
          1       0.30      0.44      0.72      0.36      0.57      0.31      2312

avg / total       0.72      0.66      0.50      0.68      0.57      0.32     10882

