# Multinomial Logit Model

### Importing Packages

In [35]:
#Importing Packages
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

### Importing Dataset

In [36]:
#importing dataset after ETL and EDA.
df = pd.read_csv('after_cleaning.csv')
df.head(2)

Unnamed: 0,police_station,accident_date,accident_weekday,accident_time,lattitude,longitude,accused_vehicle,victim_vehicle,victim_alcohol_status,accused_alcohol_status,death_count,injured_count,month,hour,severity
0,KPHB Colony,1/1/2019,Tuesday,0:00:00,17.47278,78.38764,bike,object,No,No,1,0,1,0,Fatal Injuries
1,Pet Basheerabad,2/1/2019,Wednesday,21:15:00,17.54128,78.49104,Unknown,Unknown,No,No,0,1,1,21,Grievous Injuries


### Data Preprocessing

In [37]:
#Dropping the useless attributes from dataframe and converting the alcohol status to bool type.
df = df.drop(df[['police_station','accident_date','accident_time','hour','lattitude','longitude','victim_alcohol_status','accused_alcohol_status']],axis=1)
#df['victim_alcohol_status'] = df['victim_alcohol_status'] == "Yes"
#df['accused_alcohol_status'] = df['accused_alcohol_status'] == "Yes"
df.head(2)

Unnamed: 0,accident_weekday,accused_vehicle,victim_vehicle,death_count,injured_count,month,severity
0,Tuesday,bike,object,1,0,1,Fatal Injuries
1,Wednesday,Unknown,Unknown,0,1,1,Grievous Injuries


In [38]:
#Encoding the categorical data
#Adding all object type columns to categorical data except 'police_station', 'accident_time', 'month', 'hour' as these have no necessity of encoding.
categorical = df.dtypes == 'object'
categorical_cols = df.columns[categorical].tolist()
categorical_data = [i for i in categorical_cols if i not in ['police_station','accident_time','month','hour','victim_alcohol_status','accused_alcohol_status','severity']]

#Importing LabelEncoder to encode the categorical data.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
#Transforming the categoricals into encoded data.
df[categorical_data] = df[categorical_data].apply(lambda col: l.fit_transform(col))

In [39]:
#Splitting data into response and predictors.
Y = df['severity']
X = df.drop(['severity','month'],1)

In [40]:
import statsmodels.api as st
X = st.add_constant(X, prepend = False)

In [41]:
#Splitting the response and predictors into train and test data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [42]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1744 entries, 2208 to 1653
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accident_weekday  1744 non-null   int32  
 1   accused_vehicle   1744 non-null   int32  
 2   victim_vehicle    1744 non-null   int32  
 3   death_count       1744 non-null   int64  
 4   injured_count     1744 non-null   int64  
 5   const             1744 non-null   float64
dtypes: float64(1), int32(3), int64(2)
memory usage: 74.9 KB


### Model Fit

In [43]:
y_train.unique()

array(['No Injuries', 'Grievous Injuries', 'Simple Injuries',
       'Fatal Injuries'], dtype=object)

In [44]:
#Fitting Multinomial Logit model
import statsmodels.api as sm
logit_model=sm.MNLogit(y_train,X_train.astype('float'))
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.275281
         Iterations: 35
         Function evaluations: 38
         Gradient evaluations: 38


  warn("Maximum Likelihood optimization failed to converge. "


                         Results: MNLogit
Model:               MNLogit           Pseudo R-squared:  0.761    
Dependent Variable:  severity          AIC:               996.1785 
Date:                2020-05-11 10:53  BIC:               1094.5293
No. Observations:    1744              Log-Likelihood:    -480.09  
Df Model:            15                LL-Null:           -2010.8  
Df Residuals:        1726              LLR p-value:       0.0000   
Converged:           0.0000            Scale:             1.0000   
-------------------------------------------------------------------
   severity = 0    Coef.   Std.Err.    t    P>|t|   [0.025   0.975]
-------------------------------------------------------------------
accident_weekday    0.6859   0.1939  3.5368 0.0004   0.3058  1.0661
 accused_vehicle   -0.2658   0.1537 -1.7293 0.0838  -0.5671  0.0355
  victim_vehicle    0.1070   0.0942  1.1359 0.2560  -0.0776  0.2917
     death_count  -12.5791   2.3196 -5.4230 0.0000 -17.1253 -8.0328
   inj

In [45]:
print(result.summary())

                          MNLogit Regression Results                          
Dep. Variable:               severity   No. Observations:                 1744
Model:                        MNLogit   Df Residuals:                     1726
Method:                           MLE   Df Model:                           15
Date:                Mon, 11 May 2020   Pseudo R-squ.:                  0.7612
Time:                        10:53:13   Log-Likelihood:                -480.09
converged:                      False   LL-Null:                       -2010.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
severity=Grievous Injuries       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
accident_weekday               0.6859      0.194      3.537      0.000       0.306       1.066
accused_vehicle               -0.2658      0.154     -1.729      0.084      -0.567 

In [20]:
res_mar = result.get_margeff()
res_mar.summary()

0,1
Dep. Variable:,severity
Method:,dydx
At:,overall

severity=Fatal Injuries,dy/dx,std err,z,P>|z|,[0.025,0.975]
accident_weekday,-0.0044,0.002,-2.478,0.013,-0.008,-0.001
accused_vehicle,0.0015,0.001,1.318,0.188,-0.001,0.004
victim_vehicle,-0.0012,0.001,-1.509,0.131,-0.003,0.000
death_count,0.1187,0.029,4.077,0.000,0.062,0.176
injured_count,-0.0128,0.010,-1.281,0.200,-0.032,0.007
severity=Grievous Injuries,dy/dx,std err,z,P>|z|,[0.025,0.975]
accident_weekday,0.0072,0.003,2.275,0.023,0.001,0.013
accused_vehicle,-0.0040,0.004,-0.946,0.344,-0.012,0.004
victim_vehicle,-0.0019,0.002,-1.062,0.288,-0.005,0.002
death_count,0.1998,0.100,2.006,0.045,0.005,0.395


-------------------------------------------------------------------------------------------------------------------------------

# Multinomial Logistic Regression Model

### Importing Packages

In [10]:
# Required Python Packages
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split

#import plotly.graph_objs as go
#import chart_studio.plotly as py
#from plotly.graph_objs import *

### Importing Dataset

In [11]:
#importing the dataset after ETL and EDA.
df = pd.read_csv('after_cleaning.csv')
df.head(2)

Unnamed: 0,police_station,accident_date,accident_weekday,accident_time,lattitude,longitude,accused_vehicle,victim_vehicle,victim_alcohol_status,accused_alcohol_status,death_count,injured_count,month,hour,severity
0,KPHB Colony,1/1/2019,Tuesday,0:00:00,17.47278,78.38764,bike,object,No,No,1,0,1,0,Fatal Injuries
1,Pet Basheerabad,2/1/2019,Wednesday,21:15:00,17.54128,78.49104,Unknown,Unknown,No,No,0,1,1,21,Grievous Injuries


### Data Preprocessing

In [12]:
df = df.drop(df[['police_station','accident_date','accident_time','lattitude','longitude']],axis=1)
df['victim_alcohol_status'] = df['victim_alcohol_status'] == "Yes"
df['accused_alcohol_status'] = df['accused_alcohol_status'] == "Yes"
df.head(2)

Unnamed: 0,accident_weekday,accused_vehicle,victim_vehicle,victim_alcohol_status,accused_alcohol_status,death_count,injured_count,month,hour,severity
0,Tuesday,bike,object,False,False,1,0,1,0,Fatal Injuries
1,Wednesday,Unknown,Unknown,False,False,0,1,1,21,Grievous Injuries


In [13]:
#Encoding the categorical data
#Adding all object type columns to categorical data except 'police_station', 'accident_time', 'month', 'hour' as these have no necessity of encoding.
categorical = df.dtypes == 'object'
categorical_cols = df.columns[categorical].tolist()
categorical_data = [i for i in categorical_cols if i not in ['police_station','accident_time','month','hour','victim_alcohol_status','accused_alcohol_status']]

#Importing LabelEncoder to encode the categorical data.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
#Transforming the categoricals into encoded data.
df[categorical_data] = df[categorical_data].apply(lambda col: l.fit_transform(col))

In [14]:
Y = df['severity']
X = df.drop(['severity'],1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

### Fitting Model

In [15]:
#Fitting logistic regression from Linear_model using multinomial as multi-class and newwton-cg as Solver.
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)

#Let us understand the accuracy of Train and Test data Predictions.
print("Multinomial Logistic regression Train Accuracy = ", metrics.accuracy_score(y_train, mul_lr.predict(X_train)))
print("Multinomial Logistic regression Test Accuracy = ", metrics.accuracy_score(y_test, mul_lr.predict(X_test)))

#Let us understand about the correlation and accuracy
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
y_pred = mul_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy_score = ',accuracy_score(y_test, y_pred))

Multinomial Logistic regression Train Accuracy =  0.9157110091743119
Multinomial Logistic regression Test Accuracy =  0.8997326203208557
[[165   0   2   1]
 [  0   0   0  71]
 [  0   0 107   0]
 [  0   0   1 401]]
accuracy_score =  0.8997326203208557


In [16]:
#Finding the Prediction Probabilities
mul_lr.predict_proba(X_test)

array([[9.22701576e-03, 1.30741614e-01, 5.01714612e-03, 8.55014224e-01],
       [9.88702955e-01, 3.75447655e-03, 1.45976333e-05, 7.52797043e-03],
       [1.21675799e-02, 1.28716848e-01, 6.16192839e-03, 8.52953644e-01],
       ...,
       [1.18020118e-02, 1.03101277e-01, 7.75690422e-03, 8.77339807e-01],
       [4.61316028e-03, 7.05655818e-02, 1.01961747e-05, 9.24811062e-01],
       [1.21307378e-02, 1.36200892e-01, 4.75908487e-03, 8.46909285e-01]])