In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
# Load the records (with their topic columns) from the project's final_data folder.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like
# an index written out by an earlier to_csv call - confirm, and consider index_col=0.
topics_csv_path = './final_data/df_with_topics.csv'
df = pd.read_csv(topics_csv_path)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,death_date,death_time,death_day,inc_date,inc_time,inc_day,long_topic,best_topic_num,best_topic_name,best_topic_perc
0,2023-10-24 00:43:00,2023-10-25 00:11:00,22.0,1,Black,0,ACCIDENT,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,...,2023-10-25,00:11:00,Wednesday,2023-10-24,00:43:00,Tuesday,"[(0, 0.0100428155), (1, 0.01559641), (2, 0.948...",2,vehicle_collision,0.948506
1,2023-10-24 22:30:00,2023-10-24 21:51:00,35.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,21:51:00,Tuesday,2023-10-24,22:30:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
2,2023-10-24 21:18:00,2023-10-24 20:36:00,54.0,0,White,0,SUICIDE,GUNSHOT WOUND OF HEAD,GUNSHOT WOUND OF HEAD,no_text,...,2023-10-24,20:36:00,Tuesday,2023-10-24,21:18:00,Tuesday,"[(0, 0.90828776), (1, 0.029466497), (2, 0.0133...",0,one_gunshot_wound,0.908288
3,2023-10-24 07:48:00,2023-10-24 07:16:00,19.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,07:16:00,Tuesday,2023-10-24,07:48:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
4,2023-10-23 22:21:00,2023-10-23 21:29:00,41.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-23,21:29:00,Monday,2023-10-23,22:21:00,Monday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878


## Basic EDA and Data Preparation

In [11]:
# Drop every row that has a missing value in any column (df is modified in place).
# NOTE(review): rows are removed even when the gap is in a column the models never use;
# consider dropna(subset=[...]) limited to the modelling columns if more rows are wanted.
df.dropna(inplace=True)

In [12]:
# Rows and columns remaining after the rows with missing values were dropped.
df.shape

(36688, 36)

In [14]:
# Parse each incident timestamp string into a datetime object, keeping the row order.
incident_timestamp_format = '%Y-%m-%d %H:%M:%S'
tr = []
for raw_timestamp in df['date_of_incident']:
    tr.append(datetime.strptime(raw_timestamp, incident_timestamp_format))

In [17]:
# Add a column holding the hour of day (0-23) at which each incident happened.
# The hour is parsed straight from the timestamp strings with a vectorised pandas call,
# so this cell does not depend on an intermediate list built in another cell.
df['hour_of_incident'] = pd.to_datetime(
    df['date_of_incident'], format='%Y-%m-%d %H:%M:%S'
).dt.hour

In [19]:
# Predictor columns fed to the models, selected by label into their own frame.
feature_columns = [
    'age',
    'gender',
    'race',
    'death_day',
    'inc_day',
    'best_topic_num',
    'hour_of_incident',
]
X = df.loc[:, feature_columns]

In [22]:
# Binarise the target in place: 1 = ACCIDENT, 0 = every other manner of death.
# The already-encoded value 1 is matched as well so the cell is idempotent: comparing
# only against 'ACCIDENT' would turn every label into 0 if the cell were run a second
# time, because the column then holds integers instead of strings.
df['manner_of_death'] = np.where(df['manner_of_death'].isin(['ACCIDENT', 1]), 1, 0)

In [25]:
# Target vector: the manner_of_death column, already encoded as 1 = ACCIDENT, 0 = other.
y = df['manner_of_death']

In [27]:
# Hold out a test set. Stratifying on y keeps the class ratio the same in both parts and
# the fixed seed makes the split repeatable. No test_size is passed, so scikit-learn's
# default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=404,
)

In [34]:
# Columns that get one-hot encoded (first level of each dropped); every other column
# of the input is passed through unchanged.
categorical_columns = ['race', 'death_day', 'inc_day', 'best_topic_num', 'hour_of_incident']
one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

mct = make_column_transformer(
    (one_hot_encoder, categorical_columns),
    remainder='passthrough',
)

In [28]:
# Class balance of the target as proportions; the larger share is the accuracy a
# majority-class guess would reach.
y.value_counts(normalize=True)

1    0.671609
0    0.328391
Name: manner_of_death, dtype: float64

In [36]:
# Fit the column transformer on the training split only, then apply that same fitted
# encoding to the test split so no information from the test rows shapes the encoding.
X_train_mct = mct.fit_transform(X_train)
X_test_mct = mct.transform(X_test)

## Logistic Regression Model

In [52]:
# Base logistic regression estimator; the liblinear solver supports both the l1 and
# the l2 penalty.
lr = LogisticRegression(solver='liblinear')

In [53]:
# Exhaustive grid search for the logistic regression: both penalty types crossed with
# 100 log-spaced values of C between 10**0 and 10**1 (i.e. from 1 to 10).
regularisation_strengths = np.logspace(0, 1, 100)
pgrid = {
    'penalty': ['l1', 'l2'],
    'C': regularisation_strengths,
}

gs = GridSearchCV(estimator=lr, param_grid=pgrid, n_jobs=4)

In [54]:
# Run the grid search on the encoded training data.
gs.fit(X_train_mct,y_train)

In [55]:
# Score the search's best model on the held-out test split. No scoring argument was
# given to the search, so this is the estimator's default score (accuracy for a classifier).
gs.score(X_test_mct,y_test)

0.8365678150894025

In [56]:
# Show the logistic regression configuration selected by the grid search.
gs.best_estimator_

## Random forest

In [64]:
# Number of feature columns in the encoded training matrix.
f = X_train_mct.shape[1]

In [70]:
# Randomised hyper-parameter search for the random forest.
# The feature count is read from the encoded training matrix inside this cell, so the
# cell does not depend on a separately defined notebook variable that other cells may
# rebind.
n_encoded_features = X_train_mct.shape[1]

params = {
    # depth limits 1..49, plus None to let the trees grow without a depth limit
    'max_depth': list(range(1, 50)) + [None],
    # from a single candidate feature per split up to all encoded features
    'max_features': np.arange(1, n_encoded_features + 1),
    'min_samples_leaf': np.arange(1, 31)
}

# Both the forest and the parameter sampling are seeded (same value as the
# train/test split) so the search and its reported score are repeatable between runs.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=404
)

rs = RandomizedSearchCV(
    rf, param_distributions=params, n_iter=100, cv=5, n_jobs=4, random_state=404
)

In [71]:
# Run the randomised search on the encoded training data.
rs.fit(X_train_mct,y_train)

In [73]:
# Show the random forest configuration selected by the randomised search.
rs.best_estimator_

In [72]:
# Score the search's best forest on the held-out test split (default classifier score,
# i.e. accuracy, since no scoring argument was given to the search).
rs.score(X_test_mct,y_test)

0.8847579590056694

In [74]:
import pickle

In [77]:
# Persist the whole fitted GridSearchCV object (not only its best estimator).
# The file handle gets its own name so it does not overwrite the notebook-level
# variable f, which holds the encoded feature count.
# NOTE: only unpickle this file from a trusted location - pickle.load can run arbitrary code.
with open('./models/lr.pkl', 'wb') as lr_model_file:
    pickle.dump(gs, lr_model_file)

In [78]:
# Persist the whole fitted RandomizedSearchCV object (not only its best estimator).
# The file handle gets its own name so it does not overwrite the notebook-level
# variable f, which holds the encoded feature count.
# NOTE: only unpickle this file from a trusted location - pickle.load can run arbitrary code.
with open('./models/random_forest.pkl', 'wb') as rf_model_file:
    pickle.dump(rs, rf_model_file)