In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import datetime as dt
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from imblearn.metrics import classification_report_imbalanced
from bioinfokit import visuz

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Connect to the Postgres database.
# The connection URL embeds the database password, so it is read from the
# environment instead of being committed with the notebook.
import os

db_url = os.environ.get("CRASH_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the CRASH_DB_URL environment variable, e.g. "
        "postgresql+psycopg2://<user>:<password>@localhost:5432/crash_analysis"
    )
engine = create_engine(db_url)

In [None]:
# Load the 'Accident' table through the SQLAlchemy engine and display it.
crash_df_1 = pd.read_sql_table(table_name='Accident', con=engine)
crash_df_1

In [None]:
# Load the 'Accident_Conditions' table through the SQLAlchemy engine and display it.
crash_df_2 = pd.read_sql_table(table_name='Accident_Conditions', con=engine)
crash_df_2

In [None]:
# Load the 'Accident_Injury' table through the SQLAlchemy engine and display it.
crash_df_3 = pd.read_sql_table(table_name='Accident_Injury', con=engine)
crash_df_3

In [None]:
# Load the 'Accident_Location' table through the SQLAlchemy engine and display it.
crash_df_4 = pd.read_sql_table(table_name='Accident_Location', con=engine)
crash_df_4

In [None]:
# Load the 'Accident_Vehicle' table through the SQLAlchemy engine and display it.
crash_df_5 = pd.read_sql_table(table_name='Accident_Vehicle', con=engine)
crash_df_5

In [None]:
# Load the 'Accident_RINO' table through the SQLAlchemy engine and display it.
crash_df_6 = pd.read_sql_table(table_name='Accident_RINO', con=engine)
crash_df_6

In [None]:
# Combine the loaded tables side by side (column-wise) into one dataframe.
# NOTE(review): a column-wise concat aligns rows by index label, not by a shared
# key column - confirm every table has the same row order, otherwise rows from
# different crashes end up on the same line.
# NOTE(review): crash_df_6 is loaded but is not part of this concat - confirm intent.
crash_tables = [crash_df_1, crash_df_2, crash_df_3, crash_df_4, crash_df_5]
crash_df = pd.concat(crash_tables, axis="columns")
crash_df

In [None]:
# Show every column name of the merged dataframe as a plain Python list
crash_df.columns.to_list()

In [None]:
# Keep only the modelling columns named below; every other column is discarded.
# Although they may help predict, Cd_Degr and Curve_Type_ID are left out due to
# missing data.
# NOTE(review): Curve_Lngth was described as dropped for the same reason, but it is
# still in this keep-list - confirm which is intended.
keep_columns = frozenset([
    'Crash_Speed_Limit',
    'Crash_Time',
    'Crash_Date',
    'Wthr_Cond_ID',
    'Light_Cond_ID',
    'Surf_Cond_ID',
    'Traffic_Cntl_ID',
    'Rural_Fl',
    'Crash_Sev_ID',
    'Day_of_Week',
    'Shldr_Width_Left',
    'Shldr_Width_Right',
    'Median_Width',
    'Nbr_Of_Lane',
    'Trk_Aadt_Pct',
    'Average_AADT',
    'FHE_Collsn_ID',
    'Cmv_Involv_Fl',
    'PED',
    'RLD',
    'INT',
    'DVMT',
    'DTRKVMT',
    'LANE_WIDTH',
    'SPD_MAX',
    'Curve_Lngth',
    'Func_Sys_ID',
    'Harm_Evnt_ID',
    'Road_Relat_ID',
])
# Boolean mask over the existing columns: original column order is preserved and
# names in the keep-list that do not exist in crash_df are simply ignored.
keep_mask = [column in keep_columns for column in crash_df.columns]
crash_df2 = crash_df.loc[:, keep_mask]

# Show the dtype of each retained column
crash_df2.dtypes

In [None]:
# Count missing values in each retained column
crash_df2.isna().sum()

In [None]:
# Removed due to null values
#crash_df2['Curve_Lngth'] = crash_df2['Curve_Lngth'].replace(np.nan, 0)

In [None]:
# Re-display the per-column missing-value counts
crash_df2.isna().sum()

In [None]:
# Replace unreasonable or placeholder values with NaN so they count as missing.
# TODO: follow up on the thresholds and move this into a function.
crash_df3 = crash_df2.copy()

# Speed limits of 14 or lower are treated as invalid.
speed_too_low = crash_df3['Crash_Speed_Limit'] <= 14
crash_df3.loc[speed_too_low, 'Crash_Speed_Limit'] = np.nan

# Condition codes below 1 are treated as invalid.
for cond_col in ('Wthr_Cond_ID', 'Light_Cond_ID', 'Surf_Cond_ID'):
    crash_df3.loc[crash_df3[cond_col] < 1, cond_col] = np.nan

crash_df3.info()

In [None]:
# For now, drop every row that has at least one missing value, to see whether
# the model works with the remaining rows.
crash_df3 = crash_df3.dropna()

In [None]:
# Show the column dtypes after the NaN rows were dropped
crash_df3.dtypes

In [None]:

# Convert the crash date to a proleptic Gregorian ordinal (an integer day number)
# so it can be used as a numeric feature.
crash_dates = pd.to_datetime(crash_df3['Crash_Date'])
crash_df3['Crash_Date'] = crash_dates.apply(pd.Timestamp.toordinal)
crash_df3

In [None]:
# Check the dtype of the converted Crash_Date column
crash_df3['Crash_Date'].dtypes

In [None]:
# Reduce the crash time to its hour component (parsed with pd.to_datetime, then .dt.hour).
# NOTE(review): the column is overwritten in place, so re-running this cell operates
# on the already-converted values rather than the original times.
crash_df3['Crash_Time'] = pd.to_datetime(crash_df3['Crash_Time']).dt.hour

In [None]:

# Lookup from three-letter weekday code to its number, Sunday = 1 ... Saturday = 7.
_weekday_codes = ("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT")
days_num = {code: position for position, code in enumerate(_weekday_codes, start=1)}

In [None]:
# Replace each weekday code with its number from days_num.
# A code that is not a key of days_num raises KeyError.
weekday_codes = crash_df3["Day_of_Week"]
crash_df3["Day_of_Week"] = weekday_codes.apply(days_num.__getitem__)

In [None]:
# Continue with an independent copy of the cleaned frame and summarize its columns.
crash_df3 = crash_df3.copy()
# NOTE(review): the encoder is created but not used by any visible cell.
le = LabelEncoder()
crash_df3.info()

In [None]:
# One-hot encode the two flag columns; all other columns are carried over unchanged.
flag_columns = ["Rural_Fl", "Cmv_Involv_Fl"]
crash_df4 = pd.get_dummies(crash_df3, columns=flag_columns)
crash_df4.info()

In [None]:
# Preview the first rows of the encoded frame
crash_df4.head()

In [None]:
# Severity codes that are kept for modelling.
options = [1, 2, 3, 4, 5]

# Select the rows whose severity code is one of `options`.
# .copy() makes the filtered frame independent of the original, so the column
# added to crash_df4 afterwards is written to this frame itself and does not
# raise pandas' SettingWithCopyWarning.
crash_df4 = crash_df4[crash_df4['Crash_Sev_ID'].isin(options)].copy()
crash_df4['Crash_Sev_ID'].value_counts(sort=False)

In [None]:
# Severity codes are classified into two groups, severe and non-severe,
# based on subject matter knowledge.
def sev_groups(series):
    """Collapse one crash severity code into a binary severity class.

    Codes 1, 2 and 4 are categorized as incapacitating and return 1.
    Codes 3 and 5 are categorized as non-incapacitating and return 0.
    Any other value returns None.

    Despite its name, `series` is a single severity value: the function is
    meant to be applied element-wise to a column.
    """
    if series in (1, 2, 4):
        return 1
    if series in (3, 5):
        return 0
    return None

# Derive the binary target by applying sev_groups to every severity code,
# then show how many rows fall into each class.
crash_df4['Crash_Sev_ID_Bin'] = crash_df4['Crash_Sev_ID'].apply(sev_groups)
crash_df4['Crash_Sev_ID_Bin'].value_counts(sort=False)

In [None]:
# Target: the binary severity class.
# Features: every remaining column except the target and the original severity
# code it was derived from.
# NOTE(review): no logistic regression is fit in the visible cells; the features
# are standardized before the ensemble models are trained.
target_columns = ["Crash_Sev_ID_Bin", "Crash_Sev_ID"]
y = crash_df4["Crash_Sev_ID_Bin"]
X = crash_df4.drop(columns=target_columns)
X.info()

In [None]:
# Spearman rank correlation between all columns of crash_df4
# (this includes the severity columns, not only the features in X)
corr_mat= crash_df4.corr(method= 'spearman')

In [None]:
# Export the correlation matrix to CSV.
# A raw string keeps the Windows backslashes literal: in a normal string "\D" and
# "\c" are invalid escape sequences, which newer Python versions warn about.
# to_csv returns None when given a path, so the result is not stored.
# NOTE(review): absolute, machine-specific path - consider a configurable output folder.
corr_mat.to_csv(r"C:\Downloads\corr_matrix2.csv", header=True)

In [None]:
### Random forest training: train/test split and feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Stratified split so both parts keep the class proportions of y; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Fit the scaler on the training part only, so that no statistics from the test
# part enter the transformation.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both parts (both right-hand sides are evaluated
# before either name is rebound).
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

In [None]:
# Show the distinct classes in the test target and how often each occurs
np.unique(y_test, return_counts = True)

In [None]:
### Random Forest Classifier

In [None]:
# Imports for the balanced random forest model and its evaluation metrics
# from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report

In [None]:
# Create the balanced random forest: 128 trees with depth capped at 8,
# using all CPU cores and a fixed seed.
rf_model = BalancedRandomForestClassifier(
    n_estimators=128,
    max_depth=8,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the model on the scaled training split (fit's return value is rebound to rf_model)
rf_model = rf_model.fit(X_train, y_train)

In [None]:
# Predict the severity class for the scaled test split
predictions = rf_model.predict(X_test)

In [None]:
# Calculate the plain accuracy score on the test split.
# NOTE(review): this repeats the prediction already stored in `predictions`.

y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the test split, wrapped in a labelled DataFrame
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

In [None]:
# Print the standard classification report for the random forest predictions
# (this is classification_report, not the imbalanced variant)
print(classification_report(y_test, predictions))

In [None]:
# Fetch the feature importances from the fitted model and display them
# (unsorted here; the following cell sorts them in descending order)
importances = rf_model.feature_importances_
importances


In [None]:
# Pair each importance with its column name from X and sort, largest importance first
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
### Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier on the scaled training split
from imblearn.ensemble import EasyEnsembleClassifier

clf = EasyEnsembleClassifier(
    n_estimators=128,
    n_jobs=-1,
    random_state=1,
)
clf.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score of the Easy Ensemble classifier.
# Note: this rebinds `predictions`, replacing the earlier random forest predictions.
from sklearn.metrics import balanced_accuracy_score
predictions = clf.predict(X_test)

balanced_accuracy_score(y_test, predictions)

In [None]:
# Display the confusion matrix for the Easy Ensemble classifier
from sklearn.metrics import confusion_matrix

# Use `predictions`, which holds the Easy Ensemble output; `y_pred` still holds
# the random forest predictions and would repeat the earlier matrix.
cm = confusion_matrix(y_test, predictions)
# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Print the imbalanced classification report for the Easy Ensemble predictions
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, predictions))

In [None]:
### Combination (Over and Under) Sampling

In [None]:
from imblearn.combine import SMOTEENN
from collections import Counter

# Resample the training split only. Resampling the full X, y would put test rows
# into the training data (leakage), and X is unscaled while X_test, used for
# evaluation, has been standardized.
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Class counts after combined over- and under-sampling
Counter(y_resampled)

In [None]:
# Train an EasyEnsembleClassifier on the resampled data
from imblearn.ensemble import EasyEnsembleClassifier

clf = EasyEnsembleClassifier(
    n_estimators=128,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the classifier trained on the
# resampled data, evaluated on the test split
from sklearn.metrics import balanced_accuracy_score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the classifier trained on the resampled data.
# NOTE(review): the prediction is recomputed here although y_pred already holds it.
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the classifier trained on the resampled data
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

In [None]:
### Naive Random Oversampling

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from collections import Counter
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:

# Resample the training split with RandomOverSampler (fixed seed), then show the
# class counts of the resampled target.
# Note: this rebinds X_resampled / y_resampled from the previous section.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

In [None]:
# Create a balanced random forest (64 trees, depth capped at 6, all CPU cores,
# fixed seed) and fit it on the oversampled training data.
# Note: this rebinds rf_model, replacing the forest trained earlier.
rf_model = BalancedRandomForestClassifier(
    n_estimators=64,
    max_depth=6,
    n_jobs=-1,
    random_state=1,
)
rf_model.fit(X_resampled, y_resampled)


In [None]:
# Balanced accuracy of the forest trained on the oversampled data, evaluated on the test split
from sklearn.metrics import balanced_accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the forest trained on the oversampled data
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the forest trained on the oversampled data
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))