In [32]:
import os
import shutil
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [33]:
# UNCOMMENT BELOW AND RUN ASAP!
# This cell can take up to 5 minutes to run, as it's downloading and unzipping
# a large file. Once run, please re-comment it so you don't accidentally run
# it twice!

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2023-11-07 19:28:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-11-07 19:28:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-11-07 19:28:07--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [34]:
embeddings_ix = {}

with open('./glove.6B.100d.txt', 'r') as f:
  for line in f:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, 'f', sep=' ')
    embeddings_ix[word] = coefs

In [35]:
df =pd.read_csv('df_with_topics.csv')

In [36]:
df.manner_of_death = df.manner_of_death.map({'ACCIDENT':1, 'HOMICIDE':0, 'SUICIDE': 0})

In [37]:
df.dropna(inplace= True)# 36688 vs 37151

In [5]:
"""X = df.drop(columns=['manner_of_death','race','best_topic_perc','best_topic_name','best_topic_num','long_topic','inc_date',
                     'primary_cause_line_a','primary_cause_line_b','primary_cause_line_c',''])
y = df.manner_of_death"""

In [38]:
df['full_text'] = df['primary_cause'] +' '+ df['secondary_cause']

In [39]:
full_text = df['full_text']

In [19]:
full_text.shape

(34275,)

In [20]:
type(full_text)

pandas.core.series.Series

In [40]:
X = full_text
y= df.manner_of_death

In [7]:
y.shape

(36688,)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 512, stratify= y)

In [52]:
max_length = 50
max_tokens = 20_000

# Create an object that can process strings into integet incodings based on our
# trainind data vocabulary.
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length
)
text_vectorization.adapt(X_train)
X_train_int = text_vectorization(X_train)
X_test_int = text_vectorization(X_test)

In [53]:
# # Create datasets that turn our text input into integer encodings
# int_mapper = lambda x, y: (text_vectorization(x), y)

# int_train_ds = X_train.map(int_mapper, num_parallel_calls=4)
# # int_val_ds = val_ds.map(int_mapper, num_parallel_calls=4)
# # int_test_ds = test_ds.map(int_mapper, num_parallel_calls=4)

In [54]:
embedding_dim = 100

vocab = text_vectorization.get_vocabulary()
word_ix = dict(enumerate(vocab))
word_ix = {word: i for i, word in word_ix.items()}

In [55]:
# Create a huge embedding matrix based on GloVe - no need to train it ourself!
# 1 row = 1 word vector
embedding_mx = np.zeros((max_tokens, embedding_dim))

for word, i in word_ix.items():
  if i < max_tokens:
    embed_vector = embeddings_ix.get(word)

  if embed_vector is not None:
    embedding_mx[i, :] = embed_vector

In [56]:
glove_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_mx),
    trainable=False,
    mask_zero=True
)

In [57]:
inputs = keras.Input(shape=(None,))
embedded = glove_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer='rmsprop', loss='bce', metrics=['acc'])
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
_________________

In [58]:
model.fit(
    X_train_int,y_train,
    validation_data=(X_test_int,y_test),
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79197ca101f0>

In [61]:
model.save("my_model.keras")

In [62]:
# Load the model
loaded_model = keras.models.load_model("my_model.keras")



In [66]:
text_vectorization("BLUNT FORCE TRAUMA")

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([ 28,  27, 120,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])>

In [69]:
# Use the loaded model for predictions
tvec = text_vectorization("BLUNT FORCE TRAUMA")

predictions = loaded_model.predict([tvec])

ValueError: ignored