## LDA on 'primary_cause_line_a' column

Credits:<br>
Ria Kulshrestha, https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2 <br>
Susan Li, https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /Users/sh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
full_df = pd.read_csv('../susan/data/cleaned_data_31OCT.csv')
full_df.sample(5)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,residence_zip,chicago_community_area,covid_related,age_range,death_date,death_time,death_day,inc_date,inc_time,inc_day
13001,2021-03-10 07:35:00,2021-03-10 08:02:00,33.0,0,White,0,SUICIDE,GUNSHOT WOUND OF THE HEAD,GUNSHOT WOUND OF THE HEAD,no_text,...,60655,MOUNT GREENWOOD,0,25-64,2021-03-10,08:02:00,Wednesday,2021-03-10,07:35:00,Wednesday
18131,2009-08-03 00:11:00,2020-01-21 23:15:00,62.0,0,Black,0,HOMICIDE,COMPLICATIONS OF GUNSHOT WOUND OF TORSO,COMPLICATIONS OF GUNSHOT WOUND OF TORSO,no_text,...,60627,AUBURN GRESHAM,0,25-64,2020-01-21,23:15:00,Tuesday,2009-08-03,00:11:00,Monday
31605,2016-06-14 03:41:00,2016-06-14 05:45:00,35.0,0,Black,0,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,60651,AUSTIN,0,25-64,2016-06-14,05:45:00,Tuesday,2016-06-14,03:41:00,Tuesday
12099,2021-05-16 10:55:00,2021-05-16 11:09:00,62.0,0,White,0,SUICIDE,HANGING,HANGING,no_text,...,60465,no_text,0,25-64,2021-05-16,11:09:00,Sunday,2021-05-16,10:55:00,Sunday
987,2023-07-03 03:18:00,2023-07-03 02:03:00,15.0,1,White,1,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,...,60623,SOUTH LAWNDALE,0,15-24,2023-07-03,02:03:00,Monday,2023-07-03,03:18:00,Monday


In [48]:
textdf = full_df[['manner_of_death', 'primary_cause', 'primary_cause_line_a', 
                  'primary_cause_line_b', 'primary_cause_line_c', 'secondary_cause']]
textdf.head(15)

Unnamed: 0,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause
0,ACCIDENT,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,no_text,no_text
1,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text
2,SUICIDE,GUNSHOT WOUND OF HEAD,GUNSHOT WOUND OF HEAD,no_text,no_text,no_text
3,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text
4,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text
5,ACCIDENT,MULTIPLE INJURIES. BICYCLIST STRUCK BY MOTOR V...,MULTIPLE INJURIES,BICYCLIST STRUCK BY MOTOR VEHICLE(S),no_text,no_text
6,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text
7,ACCIDENT,MULTIPLE INJURIES. SCOOTER AND MOTOR VEHICLE C...,MULTIPLE INJURIES,SCOOTER AND MOTOR VEHICLE COLLISION,no_text,no_text
8,SUICIDE,GUNSHOT WOUND TO CHEST,GUNSHOT WOUND TO CHEST,no_text,no_text,no_text
9,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text


In [49]:
textdf['primary_cause_line_a'][8]

'GUNSHOT WOUND TO CHEST'

In [4]:
# make a stemmer and lemmatizer

wn = WordNetLemmatizer()
stemmer = PorterStemmer()

def lem_stem(text):
    return stemmer.stem(wn.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lem_stem(token))
    return result

In [50]:
# test it's functioning as expected

doc_sample = textdf[textdf.index == 8].values[0][1]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['GUNSHOT', 'WOUND', 'TO', 'CHEST']


 tokenized and lemmatized document: 
['gunshot', 'wind', 'chest']


In [7]:
# process the 'primary_cause_line_a' column

processed_docs = textdf['primary_cause_line_a'].map(preprocess)
processed_docs[:20]

0     [multipl, blunt, forc, injuri]
1          [multipl, gunshot, wound]
2              [gunshot, wind, head]
3          [multipl, gunshot, wound]
4          [multipl, gunshot, wound]
5                  [multipl, injuri]
6          [multipl, gunshot, wound]
7                  [multipl, injuri]
8             [gunshot, wind, chest]
9          [multipl, gunshot, wound]
10                 [multipl, injuri]
11             [gunshot, wind, head]
12             [gunshot, wind, neck]
13        [complic, multipl, injuri]
14         [multipl, gunshot, wound]
15             [blunt, forc, injuri]
16                 [multipl, injuri]
17                        [asphyxia]
18             [gunshot, wind, head]
19         [multipl, gunshot, wound]
Name: primary_cause_line_a, dtype: object

## 🔥🔥🔥🔥🔥
How do we customize the part-of-speech for 'wound' so that it's always a noun and doesn't sometimes get lemmatized to 'wind'; also check if 'left' is becoming 'leav' 


In [8]:
# Create the Bag of Words (bow) for our data
# first, make a dictionary with words and number of times they occur in the corpus

dictionary = gensim.corpora.Dictionary(processed_docs)

In [9]:
# filter out tokens that appear in too few or too many docs

# these were the example settings; 
# need to check if they work for our data   🔥🔥🔥

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [11]:
# make a dictionary for each document, 
# with how many words (by number of tuples) 
# and which word (based on its number in the dictionary)
# and how many times they appear

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# same example doc as above
# bow_corpus[428]

In [51]:
# printed out nicely

bow_doc_8 = bow_corpus[8]
for i in range(len(bow_doc_8)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_8[i][0], 
                                               dictionary[bow_doc_8[i][0]], 
bow_doc_8[i][1]))

Word 4 ("gunshot") appears 1 time(s).
Word 7 ("wind") appears 1 time(s).
Word 8 ("chest") appears 1 time(s).


In [52]:
# number in dictionary, word, times it appears on document
bow_doc_8[0][0], dictionary[bow_doc_8[0][0]], bow_doc_8[0][1]

(4, 'gunshot', 1)

In [30]:
# create tf-idf model and transform the corpus

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [53]:
# LDA run on bag of words

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, 
                                       id2word=dictionary, passes=2, workers=2)


# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.271*"wind" + 0.256*"gunshot" + 0.134*"head" + 0.052*"chest" + 0.050*"diseas" + 0.046*"cardiovascular" + 0.042*"hypertens" + 0.025*"drown" + 0.019*"torso" + 0.015*"neck"
Topic: 1 
Words: 0.251*"injuri" + 0.165*"forc" + 0.155*"blunt" + 0.112*"multipl" + 0.098*"hang" + 0.040*"complic" + 0.032*"thermal" + 0.026*"craniocerebr" + 0.023*"inhal" + 0.018*"sharp"
Topic: 2 
Words: 0.275*"complic" + 0.186*"injuri" + 0.119*"head" + 0.108*"close" + 0.059*"subdur" + 0.057*"fall" + 0.034*"cervic" + 0.033*"hematoma" + 0.014*"incis" + 0.014*"spine"
Topic: 3 
Words: 0.264*"asphyxia" + 0.153*"head" + 0.106*"blunt" + 0.086*"trauma" + 0.060*"forc" + 0.044*"hypothermia" + 0.035*"pulmonari" + 0.023*"coronari" + 0.019*"thromboembol" + 0.018*"choke"
Topic: 4 
Words: 0.190*"fentanyl" + 0.133*"toxic" + 0.118*"combin" + 0.093*"drug" + 0.067*"heroin" + 0.048*"despropionyl" + 0.034*"acetyl" + 0.034*"anpp" + 0.033*"alprazolam" + 0.026*"ethanol"
Topic: 5 
Words: 0.242*"toxic" + 0.101*"cocain" + 0.09

In [54]:
# check where sample doc would be classified in bag of words model
print(processed_docs[8])

for index, score in sorted(lda_model[bow_corpus[8]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['gunshot', 'wind', 'chest']

Score: 0.7749970555305481	 
Topic: 0.271*"wind" + 0.256*"gunshot" + 0.134*"head" + 0.052*"chest" + 0.050*"diseas" + 0.046*"cardiovascular" + 0.042*"hypertens" + 0.025*"drown" + 0.019*"torso" + 0.015*"neck"

Score: 0.025002652779221535	 
Topic: 0.393*"multipl" + 0.222*"wound" + 0.210*"gunshot" + 0.151*"injuri" + 0.009*"stab" + 0.004*"intracerebr" + 0.002*"accid" + 0.002*"hemopericardium" + 0.001*"hemorrhag" + 0.001*"cerebrovascular"

Score: 0.025000059977173805	 
Topic: 0.231*"fractur" + 0.165*"complic" + 0.143*"hemorrhag" + 0.100*"right" + 0.095*"leav" + 0.079*"femur" + 0.019*"intracrani" + 0.017*"exposur" + 0.016*"difluoroethan" + 0.015*"cold"

Score: 0.02500005252659321	 
Topic: 0.264*"asphyxia" + 0.153*"head" + 0.106*"blunt" + 0.086*"trauma" + 0.060*"forc" + 0.044*"hypothermia" + 0.035*"pulmonari" + 0.023*"coronari" + 0.019*"thromboembol" + 0.018*"choke"

Score: 0.025000037625432014	 
Topic: 0.275*"complic" + 0.186*"injuri" + 0.119*"head" + 0.108*"close

In [55]:
# LDA run on tfidf

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, 
                                             id2word=dictionary, passes=2, workers=4)

# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.142*"fentanyl" + 0.130*"heroin" + 0.130*"toxic" + 0.094*"cocain" + 0.078*"combin" + 0.074*"ethanol" + 0.065*"drug" + 0.050*"despropionyl" + 0.034*"anpp" + 0.029*"intox"
Topic: 1 Word: 0.220*"chest" + 0.126*"wind" + 0.090*"gunshot" + 0.076*"thermal" + 0.050*"stab" + 0.043*"injuri" + 0.036*"inhal" + 0.030*"multipl" + 0.026*"wound" + 0.023*"fentanyl"
Topic: 2 Word: 0.200*"blunt" + 0.186*"forc" + 0.106*"injuri" + 0.079*"fall" + 0.069*"complic" + 0.055*"multipl" + 0.050*"head" + 0.032*"trauma" + 0.022*"remot" + 0.015*"neck"
Topic: 3 Word: 0.133*"fractur" + 0.106*"subdur" + 0.093*"hemorrhag" + 0.074*"complic" + 0.065*"hematoma" + 0.058*"leav" + 0.058*"right" + 0.051*"femur" + 0.032*"hypothermia" + 0.022*"acetaminophen"
Topic: 4 Word: 0.332*"injuri" + 0.292*"multipl" + 0.106*"complic" + 0.040*"opiat" + 0.031*"cocain" + 0.020*"intox" + 0.020*"toxic" + 0.014*"gunshot" + 0.012*"fentanyl" + 0.012*"benzodiazepin"
Topic: 5 Word: 0.291*"wind" + 0.218*"head" + 0.200*"gunshot" + 0.078

In [56]:
# check where sample doc would be classified in tfidf model
print(processed_docs[8])

for index, score in sorted(lda_model_tfidf[bow_corpus[8]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

['gunshot', 'wind', 'chest']

Score: 0.7749731540679932	 
Topic: 0.220*"chest" + 0.126*"wind" + 0.090*"gunshot" + 0.076*"thermal" + 0.050*"stab" + 0.043*"injuri" + 0.036*"inhal" + 0.030*"multipl" + 0.026*"wound" + 0.023*"fentanyl"

Score: 0.025018295273184776	 
Topic: 0.291*"wind" + 0.218*"head" + 0.200*"gunshot" + 0.078*"asphyxi" + 0.030*"neck" + 0.026*"abdomen" + 0.020*"intraor" + 0.009*"intracerebr" + 0.007*"shotgun" + 0.007*"complic"

Score: 0.025006569921970367	 
Topic: 0.290*"wound" + 0.205*"gunshot" + 0.192*"multipl" + 0.101*"hang" + 0.043*"diseas" + 0.042*"cardiovascular" + 0.039*"hypertens" + 0.024*"craniocerebr" + 0.007*"atherosclerot" + 0.006*"injuri"

Score: 0.025000691413879395	 
Topic: 0.332*"injuri" + 0.292*"multipl" + 0.106*"complic" + 0.040*"opiat" + 0.031*"cocain" + 0.020*"intox" + 0.020*"toxic" + 0.014*"gunshot" + 0.012*"fentanyl" + 0.012*"benzodiazepin"

Score: 0.025000588968396187	 
Topic: 0.200*"blunt" + 0.186*"forc" + 0.106*"injuri" + 0.079*"fall" + 0.069*"compli