In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import datetime as dt
import pandas as pd

In [2]:
# Import our input dataset
crash_df = pd.read_csv('FortBendTop10CongestedCrashesAbr.csv')
crash_df.head()

Unnamed: 0,Crash_ID,Crash_Fatal_Fl,Cmv_Involv_Fl,Schl_Bus_Fl,Rr_Relat_Fl,Medical_Advisory_Fl,Amend_Supp_Fl,Active_School_Zone_Fl,Crash_Date,Crash_Time,...,Sus_Serious_Injry_Cnt,Nonincap_Injry_Cnt,Poss_Injry_Cnt,Non_Injry_Cnt,Unkn_Injry_Cnt,Tot_Injry_Cnt,Death_Cnt,MPO_ID,Investigat_Service_ID,Investigat_DA_ID
0,14199980,N,N,N,N,N,N,N,1/1/2015,9:12,...,0,0,1,5,0,1,0,15,54,24
1,14201647,N,N,N,N,N,N,N,1/1/2015,14:15,...,0,0,0,1,0,0,0,15,54,24
2,14202855,N,N,N,N,N,N,N,1/2/2015,21:32,...,0,0,0,1,1,0,0,15,54,24
3,14203076,N,N,N,N,N,N,N,1/3/2015,2:00,...,0,0,0,1,1,0,0,15,54,24
4,14206356,N,N,N,N,N,N,N,1/3/2015,21:00,...,0,0,0,1,1,0,0,15,54,24


In [3]:
# crash_df.dtypes
# print(crash_df['Crash_Time'].dtypes)
# print(crash_df['day_num'])

In [4]:
crash_df['Crash_Date'] = pd.to_datetime(crash_df['Crash_Date'])
crash_df['Crash_Time'] = pd.to_datetime(crash_df['Crash_Time']).dt.hour
crash_df.head()

Unnamed: 0,Crash_ID,Crash_Fatal_Fl,Cmv_Involv_Fl,Schl_Bus_Fl,Rr_Relat_Fl,Medical_Advisory_Fl,Amend_Supp_Fl,Active_School_Zone_Fl,Crash_Date,Crash_Time,...,Sus_Serious_Injry_Cnt,Nonincap_Injry_Cnt,Poss_Injry_Cnt,Non_Injry_Cnt,Unkn_Injry_Cnt,Tot_Injry_Cnt,Death_Cnt,MPO_ID,Investigat_Service_ID,Investigat_DA_ID
0,14199980,N,N,N,N,N,N,N,2015-01-01,9,...,0,0,1,5,0,1,0,15,54,24
1,14201647,N,N,N,N,N,N,N,2015-01-01,14,...,0,0,0,1,0,0,0,15,54,24
2,14202855,N,N,N,N,N,N,N,2015-01-02,21,...,0,0,0,1,1,0,0,15,54,24
3,14203076,N,N,N,N,N,N,N,2015-01-03,2,...,0,0,0,1,1,0,0,15,54,24
4,14206356,N,N,N,N,N,N,N,2015-01-03,21,...,0,0,0,1,1,0,0,15,54,24


In [5]:
y_n = {
   "Y": 1,
   "N": 0,
}

In [6]:
crash_df["At_Intrsct_Fl"] = crash_df["At_Intrsct_Fl"].apply(lambda x: y_n[x])

In [7]:
days_num = {
   "SUN": 1,
   "MON": 2,
   "TUE": 3,
   "WED": 4,
   "THU": 5,
   "FRI": 6,
   "SAT": 7,
}

In [8]:
crash_df["Day_of_Week"] = crash_df["Day_of_Week"].apply(lambda x: days_num[x])

In [9]:
options = [1,2,3,4,5] 
  
# selecting rows based on condition 
crash_df = crash_df[crash_df['Crash_Sev_ID'].isin(options)] 
# crash_df['Crash_Sev_ID'].value_counts(sort=False)

In [10]:
def sev_groups(series):
    if series == 1 :
        return 1
    elif series == 4 :
        return 1
    elif series == 2 :
        return 1
    elif series == 3 :
        return 0
    elif series == 5 :
        return 0    

crash_df['Crash_Sev_ID_Bin'] = crash_df['Crash_Sev_ID'].apply(sev_groups)

crash_df['Crash_Sev_ID_Bin'].value_counts(sort=False)

0    458
1     31
Name: Crash_Sev_ID_Bin, dtype: int64

In [11]:
# With a logistic regression model, there is no preprocessing or scaling required for the data. 
# Remove diabetes outcome target from features data
y = crash_df.Crash_Sev_ID_Bin
# X = crash_df.keep(['Crash_ID','Crash_Fatal_Fl','Cmv_Involv_Fl','Schl_Bus_Fl','Rr_Relat_Fl','Medical_Advisory_Fl'], axis=1)
X = crash_df.drop(crash_df.columns.difference([
'Rpt_CRIS_Cnty_ID',
'Rpt_City_ID',
'Crash_Speed_Limit',
'At_Intrsct_Fl',
'Wthr_Cond_ID',
'Light_Cond_ID',
'Entr_Road_ID',
'Road_Type_ID',
'Road_Algn_ID',
'Surf_Cond_ID',
'Traffic_Cntl_ID',
'Bridge_Detail_ID',
'Intrsct_Relat_ID',
'FHE_Collsn_ID',
'Obj_Struck_ID',
'Othr_Factr_ID',
'Road_Part_Adj_ID',
'Road_Cls_ID',
'Road_Relat_ID',
'Phys_Featr_1_ID',
'Phys_Featr_2_ID',
'Cnty_ID',
'City_ID',
'Crash_Time',    
'Latitude',
'Longitude',
'Dfo',
'Ref_Mark_Nbr',
'Ref_Mark_Displ',
'Day_of_Week',
'Hwy_Dsgn_Lane_ID',
'Hwy_Dsgn_Hrt_ID',
'Hp_Shldr_Left',
'Hp_Shldr_Right',
'Hp_Median_Width',
'Base_Type_ID',
'Nbr_Of_Lane',
'Row_Width_Usual',
'Roadbed_Width',
'Surf_Width',
'Surf_Type_ID',
'Curb_Type_Left_ID',
'Curb_Type_Right_ID',
'Shldr_Type_Left_ID',
'Shldr_Width_Left',
'Shldr_Use_Left_ID',
'Shldr_Type_Right_ID',
'Shldr_Width_Right',
'Shldr_Use_Right_ID']), axis=1)
# print(y)

In [12]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [13]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=500)

In [14]:
# Train the model
log_classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=500)

In [15]:
# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.943
