In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [47]:
data = pd.read_csv('../data/motor_vehicle_collisions_crashes.csv', low_memory=False)

In [50]:
data['CRASH HOUR'] = pd.Series(datetime.strptime(time, '%H:%M').hour for time in data['CRASH TIME'])

In [68]:
columns = ['CRASH HOUR', 'BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1']
df = data[columns]

In [69]:
for col in columns:
    df = df[df[col].notna()]

df = df.loc[df['CONTRIBUTING FACTOR VEHICLE 1'] != 'UNSPECIFIED']

In [70]:
features = df.columns.tolist()
clas = 'CONTRIBUTING FACTOR VEHICLE 1'
features.remove(clas)

In [85]:
classes = df[clas].value_counts().index[:10].tolist()
df = df.loc[df[clas].isin(classes)]

In [86]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [87]:
encoder = LabelEncoder()
for feature in features:
    encoder.fit(df[feature])
    df[feature] = encoder.transform(df[feature])

target_encoded = encoder.fit_transform(df[clas])
df[clas] = target_encoded
df[clas].value_counts()

1    232790
2     78072
0     54167
5     41467
4     40700
7     30577
9     30054
6     28824
3     25495
8     20260
Name: CONTRIBUTING FACTOR VEHICLE 1, dtype: int64

In [88]:
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(df.values, i) for i in range(len(features))]
vif['Features'] = features
vif

Unnamed: 0,VIF,Features
0,3.226814,CRASH HOUR
1,2.782467,BOROUGH


In [89]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(clas, axis=1), df[clas], test_size=0.3, random_state=143)

In [90]:
cnb = CategoricalNB()
cnb.fit(X_train, y_train)

CategoricalNB()

In [91]:
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)
# how did our model perform?
count_misclassified = (y_test != y_pred_cnb).sum()

print("CategoricalNB")
print("=" * 30)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))

CategoricalNB
Misclassified samples: 104980
Accuracy: 0.40


In [92]:
print("Recall score : ", recall_score(y_test, y_pred_cnb , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_cnb , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_cnb , average='micro'))

Recall score :  0.3991598081523792
Precision score :  0.3991598081523792
F1 score :  0.3991598081523792


In [93]:
print(classification_report(y_test, y_pred_cnb))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     16213
           1       0.40      1.00      0.57     69742
           2       0.00      0.00      0.00     23466
           3       0.00      0.00      0.00      7651
           4       0.00      0.00      0.00     12210
           5       0.00      0.00      0.00     12519
           6       0.00      0.00      0.00      8639
           7       0.00      0.00      0.00      9141
           8       0.00      0.00      0.00      6109
           9       0.00      0.00      0.00      9032

    accuracy                           0.40    174722
   macro avg       0.04      0.10      0.06    174722
weighted avg       0.16      0.40      0.23    174722

