## LDA on full 'primary_cause' column

Credits:<br>
Ria Kulshrestha, https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2 <br>
Susan Li, https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

# nltk.download('wordnet') # only need to do once


In [3]:
df = pd.read_csv('../susan/data/cleaned_data_31OCT.csv')
df.sample(5)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,residence_zip,chicago_community_area,covid_related,age_range,death_date,death_time,death_day,inc_date,inc_time,inc_day
340,2023-09-03 06:06:00,2023-09-03 05:44:00,30.0,0,White,1,HOMICIDE,GUNSHOT WOUND TO TORSO,GUNSHOT WOUND TO TORSO,no_text,...,60411,no_text,0,25-64,2023-09-03,05:44:00,Sunday,2023-09-03,06:06:00,Sunday
15322,2020-09-05 13:54:00,2020-09-05 14:19:00,42.0,0,Black,0,ACCIDENT,"COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...","COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...",no_text,...,60636,WEST ENGLEWOOD,0,25-64,2020-09-05,14:19:00,Saturday,2020-09-05,13:54:00,Saturday
4485,2022-10-25 08:25:00,2022-10-25 08:41:00,48.0,0,White,0,ACCIDENT,"COMBINED FENTANYL, PROBABLE 8-AMINOCLONAZOLAM....","COMBINED FENTANYL, PROBABLE 8-AMINOCLONAZOLAM...",…AND PROBABLE ISOTONITAZENE/PROTONITAZENE TOXI...,...,60106,no_text,0,25-64,2022-10-25,08:41:00,Tuesday,2022-10-25,08:25:00,Tuesday
19330,2019-10-23 14:21:00,2019-10-23 14:33:00,35.0,0,White,1,ACCIDENT,COCAINE AND FENTANYL TOXICITY,COCAINE AND FENTANYL TOXICITY,no_text,...,60629,GAGE PARK,0,25-64,2019-10-23,14:33:00,Wednesday,2019-10-23,14:21:00,Wednesday
13031,2021-03-07 18:48:00,2021-03-07 19:07:00,63.0,0,Black,0,ACCIDENT,"COMBINED FENTANYL, DESPROPIONYL FENTANYL (4-AN...","COMBINED FENTANYL, DESPROPIONYL FENTANYL (4-AN...",no_text,...,60406,no_text,0,25-64,2021-03-07,19:07:00,Sunday,2021-03-07,18:48:00,Sunday


In [4]:
# make a stemmer and lemmatizer
# skipping the lemmatizer b/c we don't have many verbs
# and it's messing up non-verbs like turning 'wound' into wind' and 'left' into 'leav'

# wn = WordNetLemmatizer()
ps = PorterStemmer()

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(ps.stem(token))
    return result

In [11]:
# test it's functioning as expected

doc_sample = df[df.index == 428].values[0][8]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\nstemmed document: ')
print(preprocess(doc_sample))

original document: 
['COMBINED', 'ETHANOL,', 'COCAINE,', 'FENTANYL,', 'ACETYL', 'FENTANYL,', 'DESPROPIONYL', 'FENTANYL(4-ANPP),', 'AND', 'HEROIN', 'TOXICITY']

stemmed document: 
['combin', 'ethanol', 'cocain', 'fentanyl', 'acetyl', 'fentanyl', 'despropionyl', 'fentanyl', 'anpp', 'heroin', 'toxic']


In [9]:
# process the 'primary_cause' column

processed_docs = df['primary_cause'].map(preprocess)
processed_docs[:20]

0     [multipl, blunt, forc, injuri, motor, vehicl, ...
1                             [multipl, gunshot, wound]
2                                [gunshot, wound, head]
3                             [multipl, gunshot, wound]
4                             [multipl, gunshot, wound]
5     [multipl, injuri, bicyclist, struck, motor, ve...
6                             [multipl, gunshot, wound]
7     [multipl, injuri, scooter, motor, vehicl, collis]
8                               [gunshot, wound, chest]
9                             [multipl, gunshot, wound]
10                      [multipl, injuri, jump, height]
11                               [gunshot, wound, head]
12                               [gunshot, wound, neck]
13             [complic, multipl, injuri, fall, ladder]
14                            [multipl, gunshot, wound]
15                  [blunt, forc, injuri, fall, height]
16    [multipl, injuri, motor, vehicl, strike, pedes...
17                                     [asphyxia

In [13]:
# Create the Bag of Words (bow) for our data
# first, make a dictionary with words and number of times they occur in the corpus

dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# dictionary.token2id

In [None]:
# dictionary.cfs

In [14]:
# filter out tokens that appear in too few or too many docs

# these were the example settings; 
# need to check if they work for our data   🔥🔥🔥

dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100_000)

In [15]:
# make a dictionary for each document, 
# with how many words (by number of tuples) 
# and which word (based on its number in the dictionary)
# and how many times they appear

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# same example doc as above
bow_corpus[428]

[(27, 1),
 (59, 3),
 (60, 1),
 (66, 1),
 (70, 1),
 (85, 1),
 (86, 1),
 (125, 1),
 (134, 1)]

In [16]:
# printed out nicely

bow_doc_428 = bow_corpus[428]
for i in range(len(bow_doc_428)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_428[i][0], 
                                               dictionary[bow_doc_428[i][0]], 
bow_doc_428[i][1]))

Word 27 ("ethanol") appears 1 time(s).
Word 59 ("fentanyl") appears 3 time(s).
Word 60 ("toxic") appears 1 time(s).
Word 66 ("cocain") appears 1 time(s).
Word 70 ("combin") appears 1 time(s).
Word 85 ("anpp") appears 1 time(s).
Word 86 ("despropionyl") appears 1 time(s).
Word 125 ("heroin") appears 1 time(s).
Word 134 ("acetyl") appears 1 time(s).


In [17]:
# number in dictionary, word, times it appears on document
bow_doc_428[0][0], dictionary[bow_doc_428[0][0]], bow_doc_428[0][1]

(27, 'ethanol', 1)

In [20]:
# LDA run on bag of words

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, 
                                       id2word=dictionary, passes=2, workers=2)


# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.183*"head" + 0.148*"injuri" + 0.145*"fall" + 0.111*"close" + 0.068*"complic" + 0.064*"blunt" + 0.042*"forc" + 0.039*"torso" + 0.038*"trauma" + 0.032*"unwit"
Topic: 1 
Words: 0.153*"fentanyl" + 0.091*"toxic" + 0.069*"combin" + 0.068*"para" + 0.060*"despropionyl" + 0.051*"fluorofentanyl" + 0.050*"drug" + 0.043*"anpp" + 0.041*"cocain" + 0.040*"probabl"
Topic: 2 
Words: 0.343*"wound" + 0.323*"gunshot" + 0.165*"multipl" + 0.087*"head" + 0.033*"chest" + 0.014*"stab" + 0.009*"neck" + 0.007*"intraor" + 0.006*"incis" + 0.005*"complic"
Topic: 3 
Words: 0.221*"fall" + 0.203*"complic" + 0.132*"fractur" + 0.066*"drown" + 0.058*"left" + 0.057*"right" + 0.042*"femur" + 0.021*"wit" + 0.012*"hemorrhag" + 0.011*"unwit"
Topic: 4 
Words: 0.145*"intox" + 0.120*"acut" + 0.097*"toxic" + 0.071*"alcohol" + 0.068*"complic" + 0.066*"cocain" + 0.060*"ethanol" + 0.042*"food" + 0.042*"choke" + 0.035*"bolu"
Topic: 5 
Words: 0.198*"toxic" + 0.119*"cocain" + 0.072*"inhal" + 0.070*"opiat" + 0.052*"in

In [25]:
# check where sample doc would be classified in bag of words model
print(processed_docs[30500])

for index, score in sorted(lda_model[bow_corpus[30500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['gunshot', 'wound', 'head']

Score: 0.7624998092651367	 
Topic: 0.343*"wound" + 0.323*"gunshot" + 0.165*"multipl" + 0.087*"head" + 0.033*"chest" + 0.014*"stab" + 0.009*"neck" + 0.007*"intraor" + 0.006*"incis" + 0.005*"complic"

Score: 0.01250001136213541	 
Topic: 0.183*"head" + 0.148*"injuri" + 0.145*"fall" + 0.111*"close" + 0.068*"complic" + 0.064*"blunt" + 0.042*"forc" + 0.039*"torso" + 0.038*"trauma" + 0.032*"unwit"

Score: 0.012500008568167686	 
Topic: 0.153*"fentanyl" + 0.091*"toxic" + 0.069*"combin" + 0.068*"para" + 0.060*"despropionyl" + 0.051*"fluorofentanyl" + 0.050*"drug" + 0.043*"anpp" + 0.041*"cocain" + 0.040*"probabl"

Score: 0.012500008568167686	 
Topic: 0.221*"fall" + 0.203*"complic" + 0.132*"fractur" + 0.066*"drown" + 0.058*"left" + 0.057*"right" + 0.042*"femur" + 0.021*"wit" + 0.012*"hemorrhag" + 0.011*"unwit"

Score: 0.012500008568167686	 
Topic: 0.145*"intox" + 0.120*"acut" + 0.097*"toxic" + 0.071*"alcohol" + 0.068*"complic" + 0.066*"cocain" + 0.060*"ethanol" + 0.04

In [26]:
# create tf-idf model and transform the corpus

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [28]:
# LDA run on tfidf

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, 
                                             id2word=dictionary, passes=2, workers=4)

# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.138*"height" + 0.077*"fall" + 0.063*"injuri" + 0.049*"jump" + 0.046*"multipl" + 0.038*"home" + 0.037*"wit" + 0.033*"complic" + 0.024*"delay" + 0.021*"spinal"
Topic: 1 Word: 0.091*"motor" + 0.090*"vehicl" + 0.090*"blunt" + 0.084*"forc" + 0.083*"injuri" + 0.082*"collis" + 0.054*"multipl" + 0.042*"hemorrhag" + 0.034*"fall" + 0.030*"crash"
Topic: 2 Word: 0.149*"hang" + 0.115*"asphyxi" + 0.114*"asphyxia" + 0.076*"stair" + 0.069*"craniocerebr" + 0.066*"fall" + 0.047*"injuri" + 0.027*"complic" + 0.019*"hydromorphon" + 0.017*"oxycodon"
Topic: 3 Word: 0.070*"hematoma" + 0.062*"subdur" + 0.056*"diphenhydramin" + 0.052*"fentanyl" + 0.043*"toxic" + 0.043*"combin" + 0.041*"drug" + 0.036*"hydrocodon" + 0.034*"tramadol" + 0.031*"fall"
Topic: 4 Word: 0.146*"intox" + 0.118*"complic" + 0.064*"fall" + 0.062*"opiat" + 0.040*"heroin" + 0.037*"opioid" + 0.035*"benzodiazepin" + 0.035*"cocain" + 0.035*"choke" + 0.033*"food"
Topic: 5 Word: 0.303*"gunshot" + 0.294*"wound" + 0.193*"multipl" + 0.

In [29]:
# check where sample doc would be classified in tfidf model
print(processed_docs[428])

for index, score in sorted(lda_model_tfidf[bow_corpus[428]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

['combin', 'ethanol', 'cocain', 'fentanyl', 'acetyl', 'fentanyl', 'despropionyl', 'fentanyl', 'anpp', 'heroin', 'toxic']

Score: 0.9208332300186157	 
Topic: 0.211*"fentanyl" + 0.091*"toxic" + 0.087*"despropionyl" + 0.082*"combin" + 0.076*"anpp" + 0.073*"drug" + 0.067*"acetyl" + 0.057*"heroin" + 0.051*"cocain" + 0.039*"ethanol"
