Author: Susan Hopper

## LDA Topic Modeling on full primary cause, for accidental deaths only
Credit where credit is due! Based on the work of Selva Prabhakaran, https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# in CLI:
# pip install pyLDAvis

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from sklearn.model_selection import GridSearchCV

import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

import pickle

In [2]:
# Read in the cleaned data
df = pd.read_csv('../1_data/cleaned_data_31OCT.csv')

In [None]:
df.head(2)

In [None]:
df['manner_of_death'].value_counts()

In [3]:
# Make a dataframe of just 'ACCIDENT' manner of death
df_acc = df[df['manner_of_death']=='ACCIDENT']

In [None]:
# Reset the index
# df_acc.reset_index(drop=True, inplace=True)
# df_acc.head(1)

In [None]:
df_acc.columns

In [4]:
# Find accidental deaths that aren't already classified by the ME's office

unclass_acc = df_acc[(df_acc['gun_related']==0) & (df_acc['opioid_related']==0) & (df_acc['cold_related']==0) & (df_acc['heat_related']==0)]

In [None]:
# Reset the index
# unclass_acc.reset_index(drop=True, inplace=True)
# unclass_acc.head(1)

In [None]:
unclass_acc.index

In [5]:
# Define a stemming function

stop_words = stopwords.words('english')
ps = PorterStemmer()

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 2:
            result.append(ps.stem(token))
    return result

In [None]:
# Test it's functioning as expected

doc_sample = unclass_acc[unclass_acc.index == 1001].values[0][8]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

print('\nstemmed document: ')
print(preprocess(doc_sample))

In [6]:
# Process the 'primary_cause' column

data_words = unclass_acc['primary_cause'].map(preprocess).tolist()

In [7]:
# Build bigram model and function, and make bigrams

bigram = gensim.models.Phrases(data_words, min_count=1, threshold=1) 

bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

data_words_bigrams = make_bigrams(data_words)

data_words[:5]

[['multipl', 'blunt', 'forc', 'injuri', 'motor', 'vehicl', 'collis'],
 ['multipl', 'injuri', 'bicyclist', 'struck', 'motor', 'vehicl'],
 ['multipl', 'injuri', 'scooter', 'motor', 'vehicl', 'collis'],
 ['complic', 'multipl', 'injuri', 'fall', 'ladder'],
 ['multipl', 'injuri', 'motor', 'vehicl', 'strike', 'pedestrian']]

In [8]:
# Create dictionary for the model
id2word = corpora.Dictionary(data_words_bigrams)

# Create corpus for the model
texts = data_words_bigrams

# Find term document frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Check the processing with human-readable format of corpus (term, frequency in doc)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[25:30]]

In [9]:
# Build LDA model on full accident dataset

lda_model_accid = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=3,
                                               random_state=42,
                                               update_every=1,
                                               chunksize=3000,
                                               passes=20,
                                               alpha='auto',
                                               per_word_topics=True)


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_accid, corpus, id2word)

vis

In [None]:
# Compute perplexity, a measure of how good the model is. Lower is better.
print('\nPerplexity: ', lda_model_accid.log_perplexity(corpus))  

# Compute coherence score, a measure of how well the elements of the topic support each other. 
# Cohesion relates to human comprehension better than perplexity
coherence_model_lda = CoherenceModel(model=lda_model_accid, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Thanks, Tim!
lda_model_accid.get_document_topics(corpus[0])

In [None]:
# Print keywords in each topic
pprint(lda_model_accid.print_topics())
doc_lda = lda_model_accid[corpus]

In [10]:
# Define a function to find the most relevant topic
def best_topic(corpus):
    best_topic_list = []
    for i in range(0, len(corpus)):
        best_topic = (sorted(lda_model_accid.get_document_topics(corpus[i]), key=lambda x: x[1], reverse=True))[0][0]
        best_topic_list.append(best_topic)
    return best_topic_list

best_topic_column = best_topic(corpus)

In [11]:
# Define a function to get the percentage for the most relevant topic
def topic_perc(corpus):
    topic_perc_list = []
    for i in range(0, len(corpus)):
        perc_topic = (sorted(lda_model_accid.get_document_topics(corpus[i]), key=lambda x: x[1], reverse=True))[0][1]
        topic_perc_list.append(perc_topic)
    return topic_perc_list

topic_perc_column = topic_perc(corpus)

In [12]:
# Add topic columns to the smaller df
unclass_acc['long_topic'] = lda_model_accid.get_document_topics(corpus)
unclass_acc['best_topic_num'] = best_topic_column
unclass_acc['best_topic_name'] = unclass_acc['best_topic_num'].map(
                                                {0:'vehicle_collision', 
                                                 1:'fall', 
                                                 2:'choking_misc'})
unclass_acc['best_topic_perc'] = topic_perc_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclass_acc['long_topic'] = lda_model_accid.get_document_topics(corpus)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclass_acc['best_topic_num'] = best_topic_column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclass_acc['best_topic_name'] = unclass_acc['best_topic_num'].map(
A value is tryi

In [None]:
# Check how it looks
unclass_acc[['primary_cause_line_a', 'primary_cause_line_b', 'long_topic', 'best_topic_num', 'best_topic_name','best_topic_perc', ]].sample(10)

In [13]:
# Merge the topic columns to the larger accident df

df_acc_topics = pd.merge(left = df_acc, right = unclass_acc, how='left')

In [14]:
df_acc_topics.head(10)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,death_date,death_time,death_day,inc_date,inc_time,inc_day,long_topic,best_topic_num,best_topic_name,best_topic_perc
0,2023-10-24 00:43:00,2023-10-25 00:11:00,22.0,1,Black,0,ACCIDENT,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,...,2023-10-25,00:11:00,Wednesday,2023-10-24,00:43:00,Tuesday,"[(0, 0.91106987), (1, 0.06506033), (2, 0.02386...",0.0,vehicle_collision,0.91107
1,2023-10-23 20:52:00,2023-10-23 20:32:00,59.0,0,White,0,ACCIDENT,MULTIPLE INJURIES. BICYCLIST STRUCK BY MOTOR V...,MULTIPLE INJURIES,BICYCLIST STRUCK BY MOTOR VEHICLE(S),...,2023-10-23,20:32:00,Monday,2023-10-23,20:52:00,Monday,"[(0, 0.8928988), (1, 0.07840207), (2, 0.028699...",0.0,vehicle_collision,0.892898
2,2023-10-23 16:41:00,2023-10-23 15:29:00,79.0,1,White,0,ACCIDENT,MULTIPLE INJURIES. SCOOTER AND MOTOR VEHICLE C...,MULTIPLE INJURIES,SCOOTER AND MOTOR VEHICLE COLLISION,...,2023-10-23,15:29:00,Monday,2023-10-23,16:41:00,Monday,"[(0, 0.9110208), (1, 0.06510562), (2, 0.023873...",0.0,vehicle_collision,0.911021
3,,2023-10-22 11:28:00,58.0,0,White,0,ACCIDENT,COMPLICATIONS OF MULTIPLE INJURIES. FALL OFF L...,COMPLICATIONS OF MULTIPLE INJURIES,FALL OFF LADDER,...,2023-10-22,11:28:00,Sunday,,,,"[(0, 0.69110024), (1, 0.28020066), (2, 0.02869...",0.0,vehicle_collision,0.691088
4,2023-10-19 19:48:00,2023-10-21 19:37:00,72.0,1,White,1,ACCIDENT,MULTIPLE INJURIES. MOTOR VEHICLE STRIKING PEDE...,MULTIPLE INJURIES,MOTOR VEHICLE STRIKING PEDESTRIAN,...,2023-10-21,19:37:00,Saturday,2023-10-19,19:48:00,Thursday,"[(0, 0.8929103), (1, 0.078391515), (2, 0.02869...",0.0,vehicle_collision,0.892911
5,2023-10-21 08:04:00,2023-10-21 06:45:00,68.0,0,Black,0,ACCIDENT,COMPLICATIONS OF REMOTE DRUG INTOXICATION,COMPLICATIONS OF REMOTE DRUG INTOXICATION,no_text,...,2023-10-21,06:45:00,Saturday,2023-10-21,08:04:00,Saturday,"[(0, 0.28813913), (1, 0.6831598), (2, 0.028701...",1.0,fall,0.68316
6,2023-09-28 15:08:00,2023-10-09 13:48:00,63.0,0,Black,0,ACCIDENT,COMPLICATIONS OF ETHANOL AND METHADONE INTOXIC...,COMPLICATIONS OF ETHANOL AND METHADONE INTOXIC...,no_text,...,2023-10-09,13:48:00,Monday,2023-09-28,15:08:00,Thursday,,,,
7,2023-10-12 03:39:00,2023-10-21 01:31:00,88.0,1,White,0,ACCIDENT,COMPLICATIONS OF BLUNT FORCE INJURIES OF LEFT ...,COMPLICATIONS OF BLUNT FORCE INJURIES OF LEFT HIP,FALL,...,2023-10-21,01:31:00,Saturday,2023-10-12,03:39:00,Thursday,"[(0, 0.3576391), (1, 0.61849093), (2, 0.023869...",1.0,fall,0.618486
8,2023-10-20 21:00:00,2023-10-20 20:03:00,36.0,0,White,1,ACCIDENT,MULTIPLE INJURIES. MOTORCYCLE STRIKING MOTOR V...,MULTIPLE INJURIES,MOTORCYCLE STRIKING MOTOR VEHICLE,...,2023-10-20,20:03:00,Friday,2023-10-20,21:00:00,Friday,"[(0, 0.7430392), (1, 0.065204225), (2, 0.19175...",0.0,vehicle_collision,0.743039
9,2023-09-27 13:28:00,2023-10-19 10:24:00,52.0,0,White,1,ACCIDENT,COMPLICATIONS FOLLOWING THERMAL AND INHALATION...,COMPLICATIONS FOLLOWING THERMAL AND INHALATION...,RECREATIONAL VEHICLE FIRE,...,2023-10-19,10:24:00,Thursday,2023-09-27,13:28:00,Wednesday,"[(0, 0.7729235), (1, 0.20663129), (2, 0.020445...",0.0,vehicle_collision,0.772923


In [16]:
# Save the df as a csv

df_acc_topics.to_csv('../1_data/accidental_death_topics.csv', index=False)