Author: Qingxin Wei

In [1]:
from google.colab import files,drive
# Mount Google Drive in the Colab runtime; every data path below lives under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import shutil
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [3]:
# Load the topic-annotated dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/df_with_topics.csv')

In [4]:
# Drop every row that has a missing value in any column; `df` is modified in place.
df.dropna(inplace=True)

In [5]:
df.shape

(36688, 36)

In [6]:
df.head()

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,death_date,death_time,death_day,inc_date,inc_time,inc_day,long_topic,best_topic_num,best_topic_name,best_topic_perc
0,2023-10-24 00:43:00,2023-10-25 00:11:00,22.0,1,Black,0,ACCIDENT,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,...,2023-10-25,00:11:00,Wednesday,2023-10-24,00:43:00,Tuesday,"[(0, 0.0100428155), (1, 0.01559641), (2, 0.948...",2,vehicle_collision,0.948506
1,2023-10-24 22:30:00,2023-10-24 21:51:00,35.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,21:51:00,Tuesday,2023-10-24,22:30:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
2,2023-10-24 21:18:00,2023-10-24 20:36:00,54.0,0,White,0,SUICIDE,GUNSHOT WOUND OF HEAD,GUNSHOT WOUND OF HEAD,no_text,...,2023-10-24,20:36:00,Tuesday,2023-10-24,21:18:00,Tuesday,"[(0, 0.90828776), (1, 0.029466497), (2, 0.0133...",0,one_gunshot_wound,0.908288
3,2023-10-24 07:48:00,2023-10-24 07:16:00,19.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-24,07:16:00,Tuesday,2023-10-24,07:48:00,Tuesday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878
4,2023-10-23 22:21:00,2023-10-23 21:29:00,41.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,2023-10-23,21:29:00,Monday,2023-10-23,22:21:00,Monday,"[(0, 0.018974014), (1, 0.91878027), (2, 0.0133...",1,gunshot_wounds_fall,0.91878


In [7]:
# Binary target: 1 for 'ACCIDENT', 0 for every other manner of death.
# The column is overwritten in place, so only encode while it still holds the
# raw string labels. Without the guard, re-running this cell would compare the
# 0/1 integers against 'ACCIDENT' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['manner_of_death']):
  df['manner_of_death'] = np.where(df['manner_of_death'] == 'ACCIDENT', 1, 0)

In [8]:
# Features: free-text primary cause of death. Target: the 0/1 accident flag.
X, y = df['primary_cause'], df['manner_of_death']

In [9]:
# Hold out a test split (scikit-learn's default split size) with a fixed seed
# so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=404,
)

In [10]:
max_length = 400
max_tokens = 20_000

# Layer that turns raw strings into integer encodings. The vocabulary is capped
# at `max_tokens` entries and every sequence is padded or truncated to
# `max_length` token ids.
text_vectorization = layers.TextVectorization(
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Learn the vocabulary from the training texts only.
text_vectorization.adapt(X_train)

In [11]:
# Encode both splits with the vocabulary learned from the training texts.
X_train_int, X_test_int = text_vectorization(X_train), text_vectorization(X_test)

In [12]:
X_train_int

<tf.Tensor: shape=(27516, 400), dtype=int64, numpy=
array([[ 72,   9,   4, ...,   0,   0,   0],
       [249,  42, 183, ...,   0,   0,   0],
       [ 14,   6,  47, ...,   0,   0,   0],
       ...,
       [  7,  19,   2, ...,   0,   0,   0],
       [  7,  10,   2, ...,   0,   0,   0],
       [  7,  10,   2, ...,   0,   0,   0]])>

In [13]:
# Parse the pre-trained GloVe file into a {token: float32 vector} lookup.
# Each line has the form "<token> <v1> <v2> ...".
embeddings_ix = {}

# The GloVe text file is UTF-8 encoded; pin the encoding so parsing does not
# depend on the platform's default locale.
with open('/content/drive/MyDrive/Colab_Notebooks/glove.6B/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
  for line in f:
    # Split off the token once; the remainder is the space-separated vector.
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, 'f', sep=' ')
    embeddings_ix[word] = coefs

In [14]:
embeddings_ix['cat'][:4]

array([ 0.23088,  0.28283,  0.6318 , -0.59411], dtype=float32)

In [15]:
embedding_dim = 100

# Invert the vectorizer's vocabulary list into a token -> integer-id lookup.
vocab = text_vectorization.get_vocabulary()
word_ix = {token: position for position, token in enumerate(vocab)}

In [16]:
# Embedding matrix: row i holds the GloVe vector of the token with id i.
# Tokens that have no GloVe vector keep an all-zero row.
embedding_mx = np.zeros((max_tokens, embedding_dim))

for word, i in word_ix.items():
  # Ids outside the matrix are skipped entirely. Previously the lookup was
  # guarded but the write was not, so an out-of-range id would have reused the
  # previous word's vector and indexed past the end of the matrix.
  if i >= max_tokens:
    continue
  embed_vector = embeddings_ix.get(word)
  if embed_vector is not None:
    embedding_mx[i, :] = embed_vector

In [17]:
# "frozen" embedding layer
# Embedding layer initialised from the GloVe matrix. Its weights are not
# trainable, and id 0 is treated as padding that downstream layers mask out.
glove_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_mx),
    mask_zero=True,
    trainable=False,
)

In [18]:
# Binary classifier: integer token ids -> GloVe embeddings -> bidirectional
# LSTM -> dropout -> a single sigmoid unit.
inputs = keras.Input(shape=(None,))
embedded = glove_layer(inputs)
x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.2)(x)
outputs = layers.Dense(units=1, activation='sigmoid')(x)
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='rmsprop',
    loss='bce',
    metrics=['acc'],
)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         2000000   
                                                                 
 bidirectional (Bidirection  (None, 64)                34048     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
___________________

In [19]:
# Train for 5 epochs.
# NOTE(review): the held-out test split is also used as validation data here,
# so the validation metrics are not an independent estimate.
history = model.fit(
    X_train_int, y_train,
    validation_data=(X_test_int, y_test),
    epochs=5
)
# Keep the original (misspelled) name as an alias so existing references still work.
hsitory = history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
import pickle

In [21]:
# with open ('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/rnn.pkl','wb') as f:
#     pickle.dump(model,f)

In [20]:
# Sigmoid outputs of the model for the test split, one value per row.
prediction = model.predict(X_test_int)



In [21]:
prediction

array([[0.9934284 ],
       [0.742874  ],
       [0.2928494 ],
       ...,
       [0.9987847 ],
       [0.01989505],
       [0.9992734 ]], dtype=float32)

In [22]:
# Flatten the (n, 1) prediction array into a plain list of scores.
ans = list(prediction[:, 0])

In [23]:
# Round each score to the nearest class label (0 or 1).
ans_1_0 = list(map(round, ans))

In [26]:
# Persist the raw test-split scores to Drive.
with open('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/ans.pkl', 'wb') as fh:
  pickle.dump(ans, fh)

In [27]:
# Persist the rounded 0/1 test-split predictions to Drive.
with open('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/ans01.pkl', 'wb') as fh:
  pickle.dump(ans_1_0, fh)

In [28]:
# 0-based positions (within the test split, not DataFrame index labels) where
# the rounded prediction disagrees with the true label.
wrong = [pos for pos, (p, a) in enumerate(zip(ans_1_0, y_test)) if p != a]

In [33]:
# Prediction on real, new (unseen) data

In [28]:
# Load the new cases used for the out-of-sample check below.
unseen = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/serv_data.csv')

In [29]:
# Keep only the case identifier and the recorded manner of death.
update = unseen[['case_number', 'MANNER_OF_DEATH']]

In [30]:
# Load the Medical Examiner case archive.
# NOTE(review): the original run emitted a warning on this line, presumably
# pandas' DtypeWarning for mixed-type columns (confirm). low_memory=False makes
# pandas infer each column's dtype from the whole file instead of chunk by chunk.
pc = pd.read_csv(
    '/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/Medical_Examiner_Case_Archive_6Nov23.csv',
    low_memory=False,
)

  pc = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/Medical_Examiner_Case_Archive_6Nov23.csv')


In [31]:
# Keep only the columns needed to join the primary-cause text onto the new cases.
pc_1 = pc[['Case Number', 'Primary Cause', 'Manner of Death']]

In [32]:
pc_1.head()

Unnamed: 0,Case Number,Primary Cause,Manner of Death
0,ME2023-06609,,
1,ME2023-06608,,
2,ME2023-06607,,
3,ME2023-06606,,
4,ME2023-06605,,


In [33]:
update.head()

Unnamed: 0,case_number,MANNER_OF_DEATH
0,ME2023-06353,ACCIDENT
1,ME2023-06352,SUICIDE
2,ME2023-06347,HOMICIDE
3,ME2023-06344,ACCIDENT
4,ME2023-06343,ACCIDENT


In [34]:
# Left-join on the case number to pull the primary-cause text from the
# Medical Examiner archive onto each new case.
new_set_of_data = update.merge(
    pc_1,
    how='left',
    left_on='case_number',
    right_on='Case Number',
)

In [36]:
# Encode the label on `unseen` as 1 for 'ACCIDENT' and 0 otherwise. `update`
# and `new_set_of_data` were built before this line and keep the string labels.
# The column is overwritten in place, so only encode while it still holds the
# raw strings; re-running the cell would otherwise turn every label into 0.
if not pd.api.types.is_numeric_dtype(unseen['MANNER_OF_DEATH']):
  unseen['MANNER_OF_DEATH'] = np.where(unseen['MANNER_OF_DEATH'] == 'ACCIDENT', 1, 0)

In [37]:
new_set_of_data.isna().sum()

case_number        0
MANNER_OF_DEATH    0
Case Number        0
Primary Cause      0
Manner of Death    0
dtype: int64

In [38]:
# Free-text primary cause for the new cases. This rebinds `X`, which previously
# held the training features.
X = new_set_of_data['Primary Cause']

In [39]:
# Preprocessing on X

In [40]:
# Encode the new texts with the vectorizer that was adapted on the training texts.
X_int = text_vectorization(X)

In [41]:
# Score the unseen cases with the trained model (sigmoid output per row).
final_preds = model.predict(X_int)



In [42]:
# Flatten the (n, 1) prediction array into a plain list of scores.
final_prediction = list(final_preds[:, 0])

In [43]:
# Round each score to the nearest class label (0 or 1).
final_0_1 = list(map(round, final_prediction))

In [44]:
# True 0/1 labels for the unseen cases. They are read from the merged frame
# (the same frame the model inputs come from) rather than from `update`, so
# labels and predictions stay row-aligned even if the merge changes the row count.
real_results = np.where(new_set_of_data['MANNER_OF_DEATH'] == 'ACCIDENT', 1, 0)

In [45]:
# 0-based row positions of the unseen cases where the rounded prediction
# disagrees with the true label.
wrong = [pos for pos, (p, a) in enumerate(zip(final_0_1, real_results)) if p != a]

In [46]:
wrong

[50, 71, 87, 90]

In [47]:
# Text, true label and model score for row 50 (hard-coded; first entry of `wrong`).
wrong_1 = [
    new_set_of_data.loc[50, 'Primary Cause'],
    new_set_of_data.loc[50, 'MANNER_OF_DEATH'],
    final_prediction[50],
]

In [49]:
# Text, true label and model score for row 71 (hard-coded; second entry of `wrong`).
wrong_2 = [
    new_set_of_data.loc[71, 'Primary Cause'],
    new_set_of_data.loc[71, 'MANNER_OF_DEATH'],
    final_prediction[71],
]

In [50]:
# Text, true label and model score for row 87 (hard-coded; third entry of `wrong`).
wrong_3 = [
    new_set_of_data.loc[87, 'Primary Cause'],
    new_set_of_data.loc[87, 'MANNER_OF_DEATH'],
    final_prediction[87],
]

In [51]:
# Text, true label and model score for row 90 (hard-coded; fourth entry of `wrong`).
wrong_4 = [
    new_set_of_data.loc[90, 'Primary Cause'],
    new_set_of_data.loc[90, 'MANNER_OF_DEATH'],
    final_prediction[90],
]

In [53]:
# One entry per misclassified unseen case, built directly from `wrong`. The
# table therefore always matches the current model's errors instead of relying
# on row numbers copied by hand from an earlier run.
data = {
    'Primary Cause': [new_set_of_data['Primary Cause'].iloc[i] for i in wrong],
    'Actual Manner of Death': [new_set_of_data['MANNER_OF_DEATH'].iloc[i] for i in wrong],
    # Raw sigmoid score of the model for each misclassified row.
    "Confidence Level of Model's Predication": [final_prediction[i] for i in wrong],
}

In [54]:
# Tabulate the misclassified unseen cases, one row per case.
final_df = pd.DataFrame(data)

In [57]:
# Save the misclassified cases to Drive. With pandas' defaults the row index is
# written as a leading unnamed column.
final_df.to_csv('/content/drive/MyDrive/Colab_Notebooks/NPL_GLOVE/final_results_on_unseen_data.csv')

In [55]:
# predictions that were wrong on actual unseen data
final_df

Unnamed: 0,Primary Cause,Actual Manner of Death,Confidence Level of Model's Predication
0,"COMBINED DRUG (TRAMADOL, GABAPENTIN, AND MORPH...",SUICIDE,0.559559
1,"COMBINED ESZOPICLONE/ZOPICLONE, ZOLPIDEM, TRAZ...",SUICIDE,0.724511
2,DROWNING. NEGLECT IN BATHTUB,HOMICIDE,0.963549
3,COMPLICATIONS OF CRANIOCEREBRAL INJURIES,HOMICIDE,0.774583


In [56]:
# baseline of unseen data
pd.Series(real_results).value_counts(normalize =True)

1    0.868132
0    0.131868
dtype: float64