## LDA on full 'primary_cause' column

Credits:<br>
Ria Kulshrestha, https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2 <br>
Susan Li, https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.porter import *
import nltk
# nltk.download('wordnet') # only need to do once?


[nltk_data] Downloading package wordnet to /Users/sh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
full_df = pd.read_csv('../susan/data/cleaned_data_31OCT.csv')
full_df.sample(5)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,...,residence_zip,chicago_community_area,covid_related,age_range,death_date,death_time,death_day,inc_date,inc_time,inc_day
24596,2018-05-16 18:08:00,2018-05-16 18:18:00,29.0,0,White,0,ACCIDENT,"COMBINED DRUG (ETHANOL, BUTALBITAL, DIAZEPAM, ...","COMBINED DRUG (ETHANOL, BUTALBITAL, DIAZEPAM, ...",no_text,...,60164,DUNNING,0,25-64,2018-05-16,18:18:00,Wednesday,2018-05-16,18:08:00,Wednesday
19518,2019-09-17 00:00:00,2019-10-04 18:10:00,83.0,0,Black,0,ACCIDENT,COMPLICATIONS OF CLOSED HEAD INJURY. FALL,COMPLICATIONS OF CLOSED HEAD INJURY,FALL,...,60155,no_text,0,65+,2019-10-04,18:10:00,Friday,2019-09-17,00:00:00,Tuesday
33861,2015-09-29 21:43:00,2015-10-02 15:40:00,51.0,1,Black,0,ACCIDENT,COMPLICATIONS OF OPIATE TOXICITY,COMPLICATIONS OF OPIATE TOXICITY,no_text,...,60624,WEST GARFIELD PARK,0,25-64,2015-10-02,15:40:00,Friday,2015-09-29,21:43:00,Tuesday
34312,2015-08-14 14:31:00,2015-08-14 15:20:00,45.0,0,White,1,SUICIDE,GUNSHOT WOUND TO NECK,GUNSHOT WOUND TO NECK,no_text,...,60487,LOOP,0,25-64,2015-08-14,15:20:00,Friday,2015-08-14,14:31:00,Friday
17064,2020-05-07 15:24:00,2020-05-07 15:55:00,68.0,0,White,0,ACCIDENT,"COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...","COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...",no_text,...,no_text,BRIGHTON PARK,0,65+,2020-05-07,15:55:00,Thursday,2020-05-07,15:24:00,Thursday


In [3]:
# this step turns out to be unnecessary, because we select the column we want to work with later

textdf = full_df[['manner_of_death', 'primary_cause', 'primary_cause_line_a', 
                  'primary_cause_line_b', 'primary_cause_line_c', 'secondary_cause']]
textdf.sample(5)

Unnamed: 0,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause
22328,ACCIDENT,COMPLICATIONS OF CLOSED HEAD INJURY(IES). FALL(S),COMPLICATIONS OF CLOSED HEAD INJURY(IES),FALL(S),no_text,no_text
459,SUICIDE,HANGING,HANGING,no_text,no_text,no_text
28873,ACCIDENT,COCAINE AND METHADONE TOXICITY,COCAINE AND METHADONE TOXICITY,no_text,no_text,BRONCHIAL ASTHMA
31674,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,no_text,no_text,no_text
32855,HOMICIDE,GUNSHOT WOUND TO BACK,GUNSHOT WOUND TO BACK,no_text,no_text,no_text


In [4]:
# make a stemmer and lemmatizer

wn = WordNetLemmatizer()
stemmer = PorterStemmer()

def lem_stem(text):
    return stemmer.stem(wn.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lem_stem(token))
    return result

In [5]:
# test it's functioning as expected

doc_sample = textdf[textdf.index == 428].values[0][1]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['COMBINED', 'ETHANOL,', 'COCAINE,', 'FENTANYL,', 'ACETYL', 'FENTANYL,', 'DESPROPIONYL', 'FENTANYL(4-ANPP),', 'AND', 'HEROIN', 'TOXICITY']


 tokenized and lemmatized document: 
['combin', 'ethanol', 'cocain', 'fentanyl', 'acetyl', 'fentanyl', 'despropionyl', 'fentanyl', 'anpp', 'heroin', 'toxic']


In [6]:
# process the 'primary_cause' column

processed_docs = textdf['primary_cause'].map(preprocess)
processed_docs[:10]

0    [multipl, blunt, forc, injuri, motor, vehicl, ...
1                            [multipl, gunshot, wound]
2                                [gunshot, wind, head]
3                            [multipl, gunshot, wound]
4                            [multipl, gunshot, wound]
5    [multipl, injuri, bicyclist, strike, motor, ve...
6                            [multipl, gunshot, wound]
7    [multipl, injuri, scooter, motor, vehicl, collis]
8                               [gunshot, wind, chest]
9                            [multipl, gunshot, wound]
Name: primary_cause, dtype: object

## 🔥🔥🔥🔥🔥
How do we customize the part-of-speech for 'wound' so that it's always a noun and doesn't sometimes get lemmatized to 'wind'; also check if 'left' is becoming 'leav' 


In [7]:
# Create the Bag of Words (bow) for our data
# first, make a dictionary with words and number of times they occur in the corpus

dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# dictionary.token2id

In [None]:
# dictionary.cfs

In [8]:
# filter out tokens that appear in too few or too many docs

# these were the example settings; 
# need to check if they work for our data   🔥🔥🔥

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
# make a dictionary for each document, 
# with how many words (by number of tuples) 
# and which word (based on its number in the dictionary)
# and how many times they appear

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# same example doc as above
bow_corpus[428]

[(26, 1),
 (56, 3),
 (57, 1),
 (63, 1),
 (67, 1),
 (82, 1),
 (83, 1),
 (116, 1),
 (124, 1)]

In [11]:
# printed out nicely

bow_doc_428 = bow_corpus[428]
for i in range(len(bow_doc_428)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_428[i][0], 
                                               dictionary[bow_doc_428[i][0]], 
bow_doc_428[i][1]))

Word 26 ("ethanol") appears 1 time(s).
Word 56 ("fentanyl") appears 3 time(s).
Word 57 ("toxic") appears 1 time(s).
Word 63 ("cocain") appears 1 time(s).
Word 67 ("combin") appears 1 time(s).
Word 82 ("anpp") appears 1 time(s).
Word 83 ("despropionyl") appears 1 time(s).
Word 116 ("heroin") appears 1 time(s).
Word 124 ("acetyl") appears 1 time(s).


In [12]:
# number in dictionary, word, times it appears on document
bow_doc_428[0][0], dictionary[bow_doc_428[0][0]], bow_doc_428[0][1]

(26, 'ethanol', 1)

In [13]:
# create tf-idf model and transform the corpus

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [14]:
# LDA run on bag of words

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, 
                                       id2word=dictionary, passes=2, workers=2)


# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.206*"multipl" + 0.170*"wound" + 0.165*"gunshot" + 0.071*"hang" + 0.053*"injuri" + 0.049*"fall" + 0.039*"diseas" + 0.036*"cardiovascular" + 0.033*"hypertens" + 0.026*"head"
Topic: 1 
Words: 0.190*"fentanyl" + 0.182*"toxic" + 0.115*"heroin" + 0.104*"combin" + 0.075*"drug" + 0.069*"cocain" + 0.055*"ethanol" + 0.052*"despropionyl" + 0.037*"anpp" + 0.018*"acetyl"
Topic: 2 
Words: 0.129*"asphyxia" + 0.116*"drown" + 0.092*"food" + 0.092*"choke" + 0.077*"bolu" + 0.045*"incis" + 0.044*"stab" + 0.039*"asphyxi" + 0.036*"wound" + 0.032*"multipl"
Topic: 3 
Words: 0.188*"toxic" + 0.121*"combin" + 0.106*"drug" + 0.061*"alprazolam" + 0.042*"hydrocodon" + 0.038*"methadon" + 0.030*"ethanol" + 0.028*"clonazepam" + 0.027*"fentanyl" + 0.025*"diphenhydramin"
Topic: 4 
Words: 0.151*"injuri" + 0.087*"head" + 0.067*"forc" + 0.061*"blunt" + 0.060*"fall" + 0.045*"multipl" + 0.043*"close" + 0.038*"inhal" + 0.029*"monoxid" + 0.029*"carbon"
Topic: 5 
Words: 0.184*"fall" + 0.169*"complic" + 0.082*

In [17]:
# check where sample doc would be classified in bag of words model
print(processed_docs[428])

for index, score in sorted(lda_model[bow_corpus[428]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['combin', 'ethanol', 'cocain', 'fentanyl', 'acetyl', 'fentanyl', 'despropionyl', 'fentanyl', 'anpp', 'heroin', 'toxic']

Score: 0.9249980449676514	 
Topic: 0.190*"fentanyl" + 0.182*"toxic" + 0.115*"heroin" + 0.104*"combin" + 0.075*"drug" + 0.069*"cocain" + 0.055*"ethanol" + 0.052*"despropionyl" + 0.037*"anpp" + 0.018*"acetyl"


In [18]:
# LDA run on tfidf

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, 
                                             id2word=dictionary, passes=2, workers=4)

# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.082*"complic" + 0.064*"fractur" + 0.061*"fall" + 0.061*"cocain" + 0.056*"opiat" + 0.050*"hemorrhag" + 0.043*"toxic" + 0.032*"right" + 0.031*"stab" + 0.031*"leav"
Topic: 1 Word: 0.140*"fall" + 0.085*"complic" + 0.084*"close" + 0.078*"injuri" + 0.073*"acut" + 0.056*"head" + 0.041*"asphyxi" + 0.037*"torso" + 0.037*"stair" + 0.029*"multipl"
Topic: 2 Word: 0.214*"heroin" + 0.128*"toxic" + 0.060*"fentanyl" + 0.041*"cocain" + 0.036*"combin" + 0.028*"drug" + 0.021*"probabl" + 0.017*"despropionyl" + 0.016*"ethanol" + 0.014*"smoke"
Topic: 3 Word: 0.052*"abdomen" + 0.041*"fentanyl" + 0.038*"diphenhydramin" + 0.038*"blunt" + 0.037*"wind" + 0.032*"head" + 0.030*"forc" + 0.029*"shotgun" + 0.027*"trauma" + 0.026*"toxic"
Topic: 4 Word: 0.300*"wind" + 0.207*"gunshot" + 0.198*"head" + 0.078*"chest" + 0.024*"neck" + 0.022*"intraor" + 0.020*"methadon" + 0.010*"complic" + 0.010*"toxic" + 0.009*"fentanyl"
Topic: 5 Word: 0.096*"injuri" + 0.085*"vehicl" + 0.085*"motor" + 0.066*"multipl" + 0.0

In [19]:
# check where sample doc would be classified in tfidf model
print(processed_docs[428])

for index, score in sorted(lda_model_tfidf[bow_corpus[428]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

['combin', 'ethanol', 'cocain', 'fentanyl', 'acetyl', 'fentanyl', 'despropionyl', 'fentanyl', 'anpp', 'heroin', 'toxic']

Score: 0.9249950647354126	 
Topic: 0.175*"fentanyl" + 0.093*"toxic" + 0.086*"combin" + 0.076*"despropionyl" + 0.075*"drug" + 0.071*"cocain" + 0.065*"ethanol" + 0.059*"anpp" + 0.057*"heroin" + 0.037*"acetyl"
