In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import datetime as dt
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder

In [7]:
# Load the crash/roadway input data from CSV and preview the first rows.
crash_df = pd.read_csv(filepath_or_buffer='CrashRoadwayData.csv')
crash_df.head()

Unnamed: 0,Crash_ID,Cmv_Involv_Fl,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Harm_Evnt_ID,...,HY_1,HY_2,HY_3,HY_4,Corridor_ID,Road_ID,Average_AADT,RLD,PED,INT
0,14839123,N,01/01/2016,12:23 AM,80,11,4,1,20,10,...,43413.0,43788.0,42315.0,38748.0,28.0,1188.0,42837.2,1,0,0
1,14840287,N,01/01/2016,07:40 AM,80,11,1,1,11,7,...,43413.0,43788.0,42315.0,38748.0,28.0,1192.0,42837.2,1,0,0
2,14840860,N,01/03/2016,06:48 PM,80,11,4,1,1,6,...,43413.0,43788.0,42315.0,38748.0,28.0,1194.0,42837.2,0,0,0
3,14850383,N,01/07/2016,09:05 AM,80,5,1,1,20,2,...,31038.0,30351.0,28878.0,25311.0,28.0,1207.0,29863.8,0,0,0
4,14863190,N,01/12/2016,04:01 PM,65,11,1,1,5,2,...,45033.0,45013.0,43020.0,39274.0,28.0,1179.0,43969.0,0,0,1


In [8]:
# Restrict the crash data to the columns used for modelling.
# Curve_Lngth, Cd_Degr and Curve_Type_ID might help prediction, but they were
# left out because of missing data.
columns_to_keep = [
    'Crash_Speed_Limit',
    'Crash_Time',
    'Crash_Date',
    'Wthr_Cond_ID',
    'Light_Cond_ID',
    'Surf_Cond_ID',
    'Traffic_Cntl_ID',
    'Intrsct_Relat_ID',
    'Latitude',
    'Longitude',
    'Hwy_Nbr',
    'Dfo',
    'Rural_Fl',
    'Crash_Sev_ID',
    'Day_of_Week',
    'Shldr_Width_Left',
    'Shldr_Width_Right',
    'Median_Width',
    'Nbr_Of_Lane',
    'Adt_Curnt_Amt',
    'Trk_Aadt_Pct',
    'Corridor_ID',
    'Road_ID',
    'Average_AADT',
    'RLD',
    'PED',
    'INT',
]

# Drop every column that is not on the keep-list; the surviving columns stay
# in the order they have in crash_df, not in the order listed above.
crash_df2 = crash_df.drop(columns=crash_df.columns.difference(columns_to_keep))
crash_df2.dtypes

Crash_Date            object
Crash_Time            object
Crash_Speed_Limit      int64
Wthr_Cond_ID           int64
Light_Cond_ID          int64
Surf_Cond_ID           int64
Traffic_Cntl_ID        int64
Intrsct_Relat_ID       int64
Latitude             float64
Longitude            float64
Hwy_Nbr              float64
Dfo                  float64
Rural_Fl              object
Crash_Sev_ID           int64
Day_of_Week           object
Shldr_Width_Left     float64
Shldr_Width_Right    float64
Median_Width         float64
Nbr_Of_Lane          float64
Adt_Curnt_Amt        float64
Trk_Aadt_Pct         float64
Corridor_ID          float64
Road_ID              float64
Average_AADT         float64
RLD                    int64
PED                    int64
INT                    int64
dtype: object

In [9]:
# Replace unreasonable values with NaN so they are treated as missing data.
# TODO: follow up on whether these thresholds are the right ones.
def mask_invalid(frame, invalid_rules):
    """Return a copy of ``frame`` with invalid entries replaced by NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is copied, never modified.
    invalid_rules : dict
        Maps a column name to a callable. The callable receives that column
        (a Series) and returns a boolean Series that is True wherever the
        value should be treated as missing.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` in which every flagged entry is NaN.
    """
    cleaned = frame.copy()
    for column, is_invalid in invalid_rules.items():
        # Series.mask upcasts an integer column to float when it has to insert
        # NaN. Writing NaN through `.loc[...] = np.nan` into an int64 column is
        # an upcasting setitem, which pandas has deprecated.
        cleaned[column] = cleaned[column].mask(is_invalid(cleaned[column]))
    return cleaned


crash_df3 = mask_invalid(crash_df2, {
    'Crash_Speed_Limit': lambda values: values <= 14,
    'Wthr_Cond_ID': lambda values: values < 1,
    'Light_Cond_ID': lambda values: values < 1,
    'Surf_Cond_ID': lambda values: values < 1,
})
# Store highway numbers as generic objects so they are handled as labels
# rather than as numeric quantities.
crash_df3['Hwy_Nbr'] = crash_df3['Hwy_Nbr'].astype(object)
crash_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58050 entries, 0 to 58049
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Crash_Date         58050 non-null  object 
 1   Crash_Time         58050 non-null  object 
 2   Crash_Speed_Limit  45913 non-null  float64
 3   Wthr_Cond_ID       57917 non-null  float64
 4   Light_Cond_ID      57949 non-null  float64
 5   Surf_Cond_ID       57930 non-null  float64
 6   Traffic_Cntl_ID    58050 non-null  int64  
 7   Intrsct_Relat_ID   58050 non-null  int64  
 8   Latitude           58050 non-null  float64
 9   Longitude          58050 non-null  float64
 10  Hwy_Nbr            58050 non-null  object 
 11  Dfo                58050 non-null  float64
 12  Rural_Fl           58050 non-null  object 
 13  Crash_Sev_ID       58050 non-null  int64  
 14  Day_of_Week        58050 non-null  object 
 15  Shldr_Width_Left   58050 non-null  float64
 16  Shldr_Width_Right  580

In [10]:
# First pass: keep only rows with no missing values, to check that the model
# works on the remaining rows.
crash_df3 = crash_df3.dropna(axis="index", how="any")

In [11]:
# Turn the date strings into proleptic Gregorian ordinals (integer day numbers)
# and the time strings into the hour of day, so both columns become numeric.
# NOTE(review): the formats are inferred by pandas; the preview rows look like
# MM/DD/YYYY and "HH:MM AM/PM" - consider passing explicit `format=` values
# once that is confirmed for the whole file.
# NOTE(review): this cell overwrites its own inputs, so it is not safe to
# re-run without re-running the cells above it first.
crash_df3['Crash_Date'] = pd.to_datetime(crash_df3['Crash_Date']).map(
    lambda stamp: stamp.toordinal()
)
crash_df3['Crash_Time'] = pd.to_datetime(crash_df3['Crash_Time']).dt.hour
crash_df3.head()

Unnamed: 0,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Intrsct_Relat_ID,Latitude,Longitude,...,Median_Width,Nbr_Of_Lane,Adt_Curnt_Amt,Trk_Aadt_Pct,Corridor_ID,Road_ID,Average_AADT,RLD,PED,INT
0,735964,0,80.0,11.0,4.0,1.0,20,4,30.295627,-97.57089,...,135.0,4.0,39424.0,17.9,28.0,1188.0,42837.2,1,0,0
1,735964,7,80.0,11.0,1.0,1.0,11,4,30.256238,-97.602652,...,135.0,4.0,39424.0,17.9,28.0,1192.0,42837.2,1,0,0
2,735966,18,80.0,11.0,4.0,1.0,1,4,30.22697,-97.621285,...,135.0,4.0,43413.0,17.5,28.0,1194.0,42837.2,0,0,0
3,735970,9,80.0,5.0,1.0,1.0,20,4,30.176453,-97.63656,...,135.0,4.0,31038.0,20.5,28.0,1207.0,29863.8,0,0,0
4,735975,16,65.0,11.0,1.0,1.0,5,1,30.352402,-97.592262,...,135.0,4.0,39424.0,17.9,28.0,1179.0,43969.0,0,0,1


In [12]:
# Weekday abbreviation -> numeric code, numbered from Sunday (1) to Saturday (7).
days_num = {
    day: number
    for number, day in enumerate(
        ("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"), start=1
    )
}

In [13]:
# Translate weekday abbreviations into their numeric codes; a label that is
# missing from days_num raises KeyError.
crash_df3["Day_of_Week"] = crash_df3["Day_of_Week"].map(lambda day: days_num[day])

In [14]:
# Encode each distinct highway number as an integer label.
# NOTE(review): label encoding gives the highway numbers an arbitrary order -
# confirm that is acceptable for a linear model.
le = LabelEncoder()
# assign() returns a new frame, so the frame from the previous cell is not
# modified in place.
crash_df3 = crash_df3.assign(Hwy_Nbr=le.fit_transform(crash_df3['Hwy_Nbr']))
crash_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45792 entries, 0 to 58048
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Crash_Date         45792 non-null  int64  
 1   Crash_Time         45792 non-null  int64  
 2   Crash_Speed_Limit  45792 non-null  float64
 3   Wthr_Cond_ID       45792 non-null  float64
 4   Light_Cond_ID      45792 non-null  float64
 5   Surf_Cond_ID       45792 non-null  float64
 6   Traffic_Cntl_ID    45792 non-null  int64  
 7   Intrsct_Relat_ID   45792 non-null  int64  
 8   Latitude           45792 non-null  float64
 9   Longitude          45792 non-null  float64
 10  Hwy_Nbr            45792 non-null  int32  
 11  Dfo                45792 non-null  float64
 12  Rural_Fl           45792 non-null  object 
 13  Crash_Sev_ID       45792 non-null  int64  
 14  Day_of_Week        45792 non-null  int64  
 15  Shldr_Width_Left   45792 non-null  float64
 16  Shldr_Width_Right  457

In [15]:
# One-hot encode the rural flag; only the listed column is expanded.
crash_df4 = pd.get_dummies(data=crash_df3, columns=["Rural_Fl"])
crash_df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45792 entries, 0 to 58048
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Crash_Date         45792 non-null  int64  
 1   Crash_Time         45792 non-null  int64  
 2   Crash_Speed_Limit  45792 non-null  float64
 3   Wthr_Cond_ID       45792 non-null  float64
 4   Light_Cond_ID      45792 non-null  float64
 5   Surf_Cond_ID       45792 non-null  float64
 6   Traffic_Cntl_ID    45792 non-null  int64  
 7   Intrsct_Relat_ID   45792 non-null  int64  
 8   Latitude           45792 non-null  float64
 9   Longitude          45792 non-null  float64
 10  Hwy_Nbr            45792 non-null  int32  
 11  Dfo                45792 non-null  float64
 12  Crash_Sev_ID       45792 non-null  int64  
 13  Day_of_Week        45792 non-null  int64  
 14  Shldr_Width_Left   45792 non-null  float64
 15  Shldr_Width_Right  45792 non-null  float64
 16  Median_Width       457

In [16]:
# Severity codes that are kept for modelling.
options = [1, 2, 3, 4, 5]

# Keep only the rows whose severity code is one of the accepted options, then
# show how many rows each code has.
crash_df4 = crash_df4.loc[lambda frame: frame['Crash_Sev_ID'].isin(options)]
crash_df4['Crash_Sev_ID'].value_counts(sort=False)

1     1420
2     7769
3     9254
4      393
5    26044
Name: Crash_Sev_ID, dtype: int64

In [17]:
def sev_groups(series):
    """Collapse a single crash severity code into a binary label.

    Despite the parameter name, ``series`` is one scalar code (the function is
    applied element by element). Codes 1, 2 and 4 map to 1; codes 3 and 5 map
    to 0. Any other value falls through and yields None.
    """
    if series in (1, 4, 2):
        return 1
    if series in (3, 5):
        return 0
    return None

# Derive the binary severity target from the raw severity code, then tally
# how many rows fall in each class.
crash_df4['Crash_Sev_ID_Bin'] = crash_df4['Crash_Sev_ID'].map(sev_groups)
crash_df4['Crash_Sev_ID_Bin'].value_counts(sort=False)

0    35298
1     9582
Name: Crash_Sev_ID_Bin, dtype: int64

In [18]:
# Separate the binary severity target (y) from the feature matrix (X).
# NOTE(review): the features are passed to the model unscaled even though
# their magnitudes differ widely (date ordinals, traffic counts, 0/1 flags).
# scikit-learn's LogisticRegression is regularized by default, so consider
# standardizing the features before fitting.
y = crash_df4.Crash_Sev_ID_Bin
# Crash_Sev_ID has to be removed together with the target: Crash_Sev_ID_Bin is
# computed directly from it, so leaving it in X would leak the answer into the
# features.
X = crash_df4.drop(columns=["Crash_Sev_ID", "Crash_Sev_ID_Bin"])
crash_df4.head()

Unnamed: 0,Crash_Date,Crash_Time,Crash_Speed_Limit,Wthr_Cond_ID,Light_Cond_ID,Surf_Cond_ID,Traffic_Cntl_ID,Intrsct_Relat_ID,Latitude,Longitude,...,Trk_Aadt_Pct,Corridor_ID,Road_ID,Average_AADT,RLD,PED,INT,Rural_Fl_N,Rural_Fl_Y,Crash_Sev_ID_Bin
0,735964,0,80.0,11.0,4.0,1.0,20,4,30.295627,-97.57089,...,17.9,28.0,1188.0,42837.2,1,0,0,0,1,1
1,735964,7,80.0,11.0,1.0,1.0,11,4,30.256238,-97.602652,...,17.9,28.0,1192.0,42837.2,1,0,0,0,1,0
2,735966,18,80.0,11.0,4.0,1.0,1,4,30.22697,-97.621285,...,17.5,28.0,1194.0,42837.2,0,0,0,0,1,0
3,735970,9,80.0,5.0,1.0,1.0,20,4,30.176453,-97.63656,...,20.5,28.0,1207.0,29863.8,0,0,0,1,0,0
4,735975,16,65.0,11.0,1.0,1.0,5,1,30.352402,-97.592262,...,17.9,28.0,1179.0,43969.0,0,0,1,0,1,1


In [19]:
# Export the final data set (disabled; uncomment the next line to write the CSV).
# tmp=crash_df4.to_csv(r"C:\Temp\Test9999.csv", header=True)

In [20]:
# Hold out a test set, stratified on the target so that both splits keep its
# class balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [21]:
# Configure the logistic regression classifier: L-BFGS solver, capped at 200
# iterations.
log_classifier = LogisticRegression(max_iter=200, solver="lbfgs")

In [22]:
# Fit the classifier on the training split.
log_classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200)

In [23]:
# Score the fitted classifier on the held-out test split.
y_pred = log_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f" Logistic regression model accuracy: {test_accuracy:.3f}")

 Logistic regression model accuracy: 0.787
