In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data/df_with_topics_testing.csv')

In [6]:
df.manner_of_death = df.manner_of_death.map({'ACCIDENT':1, 'HOMICIDE':0, 'SUICIDE': 0})

In [7]:
df.manner_of_death.value_counts()

1    25003
0    12148
Name: manner_of_death, dtype: int64

In [12]:
df.head()

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,death_date,death_time,death_day,inc_date,inc_time,inc_day,long_topic,best_topic_num,best_topic_name,best_topic_perc
0,2023-10-24 00:43:00,2023-10-25 00:11:00,22.0,1,Black,0,1,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,...,2023-10-25,00:11:00,Wednesday,2023-10-24,00:43:00,Tuesday,"[(0, 0.0100428155), (1, 0.01559641), (2, 0.948...",2,vehicle_collision,0.948506
1,2023-10-24 22:30:00,2023-10-24 21:51:00,35.0,0,Black,0,0,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,21:51:00,Tuesday,2023-10-24,22:30:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
2,2023-10-24 21:18:00,2023-10-24 20:36:00,54.0,0,White,0,0,GUNSHOT WOUND OF HEAD,GUNSHOT WOUND OF HEAD,no_text,...,2023-10-24,20:36:00,Tuesday,2023-10-24,21:18:00,Tuesday,"[(0, 0.90828776), (1, 0.029466497), (2, 0.0133...",0,one_gunshot_wound,0.908288
3,2023-10-24 07:48:00,2023-10-24 07:16:00,19.0,0,Black,0,0,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,07:16:00,Tuesday,2023-10-24,07:48:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
4,2023-10-23 22:21:00,2023-10-23 21:29:00,41.0,0,Black,0,0,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-23,21:29:00,Monday,2023-10-23,22:21:00,Monday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878


In [10]:
df.columns

Index(['date_of_incident', 'date_of_death', 'age', 'gender', 'race', 'latino',
       'manner_of_death', 'primary_cause', 'primary_cause_line_a',
       'primary_cause_line_b', 'primary_cause_line_c', 'secondary_cause',
       'gun_related', 'opioid_related', 'cold_related', 'heat_related',
       'commissioner_district', 'incident_city', 'incident_zip_code',
       'longitude', 'latitude', 'residence_city', 'residence_zip',
       'chicago_community_area', 'covid_related', 'age_range', 'death_date',
       'death_time', 'death_day', 'inc_date', 'inc_time', 'inc_day',
       'long_topic', 'best_topic_num', 'best_topic_name', 'best_topic_perc'],
      dtype='object')

In [13]:
df.dropna(inplace= True)# 36688 vs 37151 

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36688 entries, 0 to 37150
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date_of_incident        36688 non-null  object 
 1   date_of_death           36688 non-null  object 
 2   age                     36688 non-null  float64
 3   gender                  36688 non-null  int64  
 4   race                    36688 non-null  object 
 5   latino                  36688 non-null  int64  
 6   manner_of_death         36688 non-null  int64  
 7   primary_cause           36688 non-null  object 
 8   primary_cause_line_a    36688 non-null  object 
 9   primary_cause_line_b    36688 non-null  object 
 10  primary_cause_line_c    36688 non-null  object 
 11  secondary_cause         36688 non-null  object 
 12  gun_related             36688 non-null  int64  
 13  opioid_related          36688 non-null  int64  
 14  cold_related            36688 non-null

In [19]:
df.inc_time = df.inc_time.str[:2]

In [21]:
X = df.drop(columns=['manner_of_death','race','best_topic_perc','best_topic_name','best_topic_num','long_topic','inc_date'])
y = df.manner_of_death

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=512, stratify=y)

In [31]:
y_train.value_counts(normalize=True)

1    0.671609
0    0.328391
Name: manner_of_death, dtype: float64

In [41]:
y_train

28637    1
29180    1
19489    0
15489    0
20338    0
        ..
9765     1
37132    0
13043    1
8834     1
5819     1
Name: manner_of_death, Length: 27516, dtype: int64

In [34]:
y_train.shape

(27516,)

In [33]:
y_test.value_counts(normalize=True)

1    0.671609
0    0.328391
Name: manner_of_death, dtype: float64

In [35]:
y_test.shape

(9172,)

In [32]:
y.value_counts(normalize=True)

1    0.671609
0    0.328391
Name: manner_of_death, dtype: float64

In [36]:
y.shape

(36688,)

In [42]:
accidents = df[df['manner_of_death'] == 'ACCIDENT']
Xaccidents= accidents.drop(columns=['manner_of_death','race','best_topic_perc','best_topic_name','best_topic_num','long_topic','inc_date'])
yaccidents = accidents.manner_of_death
non_accidents = df[df['manner_of_death'] != 'ACCIDENT']
Xnon_accidents= non_accidents.drop(columns=['manner_of_death','race','best_topic_perc','best_topic_name','best_topic_num','long_topic','inc_date'])
ynon_accidents = non_accidents.manner_of_death

In [46]:
yaccidents

Series([], Name: manner_of_death, dtype: int64)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=512, stratify=y)