In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# Read the two CSV extracts; paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/fars_train.csv', 'data/fars_test.csv')
)

In [3]:
# Columns retained for modelling: the id, the candidate predictors and the
# target ('driver_factor'). Everything else in train_df is left out.
keep_columns = [
    'u_id', 'fatals', 'a_ped_f', 'a_roll', 'day_week',
    'a_dow_type', 'a_tod_type', 'a_region', 'a_ru', 'a_intsec', 'a_roadfc',
    'a_junc', 'a_relrd', 'age', 'pernotmvit', 'a_ped', 'a_body', 'owner',
    'deaths', 'deformed', 'driver_factor',
]

# Project the training frame onto the whitelist (raises KeyError if a column is absent).
reduced_df = train_df.loc[:, keep_columns]

In [4]:
# Separate the predictors from the target. 'u_id' is dropped along with the
# label column (presumably a row identifier with no predictive meaning — confirm).
X = reduced_df.drop(columns=['u_id','driver_factor'])
y = reduced_df['driver_factor']

# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# partition — and every metric reported further down — is reproducible after
# Restart Kernel -> Run All; without it each run scores a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Per-column count of missing values in the training split, before imputation.
X_train.isnull().sum()

fatals           0
a_ped_f          0
a_roll           0
day_week         0
a_dow_type       0
a_tod_type     392
a_region         0
a_ru           159
a_intsec        51
a_roadfc         0
a_junc          51
a_relrd          0
age              0
pernotmvit       0
a_ped            0
a_body           0
owner          198
deaths           0
deformed      2649
dtype: int64

In [6]:
# Impute missing values with the most frequent value (mode) of each column.
#
# The fill values are learned from the training split ONLY and then applied to
# both splits. Filling the held-out split with its own modes would preprocess
# the two splits differently and let held-out data influence the pipeline.
#
# TODO: 'deformed' has by far the most missing values; a mode fill is only a
# placeholder there and a model-based imputer may be worth trying.
impute_columns = ['a_tod_type', 'a_ru', 'a_intsec', 'a_junc', 'owner', 'deformed']

# value_counts() ignores NaN and sorts by descending frequency, so the first
# index entry is the mode of the observed values.
train_modes = {col: X_train[col].value_counts().index[0] for col in impute_columns}

# DataFrame.fillna with a dict fills each listed column with its own value.
# Rebinding the result (instead of `df[col].fillna(..., inplace=True)`) avoids
# chained in-place assignment, which is deprecated and has no effect under
# pandas copy-on-write.
X_train = X_train.fillna(value=train_modes)
X_test = X_test.fillna(value=train_modes)

In [7]:
# Re-check the per-column missing-value counts in the training split.
X_train.isnull().sum()

fatals        0
a_ped_f       0
a_roll        0
day_week      0
a_dow_type    0
a_tod_type    0
a_region      0
a_ru          0
a_intsec      0
a_roadfc      0
a_junc        0
a_relrd       0
age           0
pernotmvit    0
a_ped         0
a_body        0
owner         0
deaths        0
deformed      0
dtype: int64

In [8]:
# Collapse the two count columns into "Yes"/"No" flags (any value > 0 -> "Yes"),
# applying the same rule to both splits.
# NOTE(review): if every row has fatals > 0 the 'fatals' flag is constant — confirm.
for split_frame in (X_train, X_test):
    for count_col in ('deaths', 'fatals'):
        is_positive = split_frame[count_col] > 0
        split_frame[count_col] = np.where(is_positive, "Yes", "No")

In [9]:
# One-hot encode the non-numeric columns so that BOTH splits end up with the
# same columns in the same order.
#
# Calling get_dummies independently on each split is fragile: a level that is
# absent from one split yields a different set of columns, and with
# drop_first=True even a different dropped baseline level, so the model would
# silently receive misaligned features. Fixing the category levels from the
# training split makes the encoding deterministic for any frame.
encode_columns = X_train.select_dtypes(exclude=['number', 'bool']).columns
train_levels = {col: pd.Categorical(X_train[col]).categories for col in encode_columns}


def _encode_with_train_levels(frame):
    """Return `frame` one-hot encoded using the levels observed in the training split.

    Values not seen in training become missing in the categorical column and
    therefore encode as all-zero dummies for that column.
    """
    typed = frame.assign(**{
        col: pd.Categorical(frame[col], categories=levels)
        for col, levels in train_levels.items()
    })
    # For categorical dtype get_dummies emits one column per declared level,
    # so the output layout depends only on train_levels, not on the rows present.
    return pd.get_dummies(typed, drop_first=True)


X_train = _encode_with_train_levels(X_train)
X_test = _encode_with_train_levels(X_test)

assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

In [10]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [11]:
# Fit a categorical naive Bayes classifier on the encoded training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = CategoricalNB().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
model

CategoricalNB()

In [12]:
# Predicted class labels for the training split and the held-out split.
train_preds, test_preds = (model.predict(features) for features in (X_train, X_test))

In [14]:
# Per-class probability estimates for each training row (one column per class).
model.predict_proba(X=X_train)

array([[8.63174420e-01, 2.57236541e-02, 1.11101926e-01],
       [1.40772087e-06, 9.99989472e-01, 9.12032582e-06],
       [1.86419763e-04, 9.97798300e-01, 2.01528059e-03],
       ...,
       [3.66581977e-01, 3.70404317e-01, 2.63013707e-01],
       [6.68250193e-01, 1.44854332e-02, 3.17264374e-01],
       [2.14559431e-01, 1.26461059e-01, 6.58979511e-01]])

In [13]:
from sklearn.metrics import accuracy_score, classification_report

In [90]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, test_preds)

0.5694413777875911

In [91]:
# Per-class precision / recall / F1 on the held-out split; printed because the
# report is a pre-formatted multi-line string.
held_out_report = classification_report(y_test, test_preds)
print(held_out_report)

                          precision    recall  f1-score   support

   drunk_driver_involved       0.47      0.79      0.59      2337
                   other       0.85      0.55      0.67      5188
speeding_driver_involved       0.27      0.31      0.29      1533

                accuracy                           0.57      9058
               macro avg       0.53      0.55      0.51      9058
            weighted avg       0.65      0.57      0.58      9058

