## topic modeling exploration

Credits:<br>
Ria Kulshrestha, https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2 <br>
Susan Li, https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [6]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import gensim
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.porter import *
# nltk.download('wordnet') # only need to do once?


In [3]:
df = pd.read_csv('../susan/data/cleaned_data_31OCT.csv')
# df.sample(5)

In [16]:
# Show all 500 sampled rows for this one display only. The option is scoped
# with option_context so later cells keep pandas' default row truncation
# instead of inheriting a global "never truncate" setting.
# https://stackoverflow.com/questions/1987694/how-do-i-print-the-full-numpy-array-without-truncation
with pd.option_context('display.max_rows', None):
    display(df['primary_cause_line_b'].sample(500))


15391                                              no_text
11450                                              no_text
35363                    MOTOR VEHICLE STRIKING PEDESTRIAN
12044                                              no_text
25268                                              no_text
1948                                               HANGING
31109                                              no_text
36215                                                 FALL
11946                                                 FALL
16685                  ...PROBABLE FLUALPRAZOLAM) TOXICITY
10532                                              no_text
3998                                               no_text
33844                       CHRONIC ETHANOL AND DRUG ABUSE
1068                                         PROBABLE FALL
15131                                              no_text
2693                                               no_text
11431                             UNSAFE SLEEP ENVIRONME

In [None]:

# Print NumPy arrays in full. `threshold` is the element count above which
# arrays are summarized, so it must be a large int; the previous value
# `False` is numerically 0, which asks for the opposite.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Keep only the free-text cause-of-death columns.
# Only 'primary_cause' is actually used further down, but later cells read it
# through `textdf`, so this cell has to run.
# `df` is the frame loaded from the CSV earlier in the notebook; the previous
# name `full_df` is never defined and raised NameError on a fresh kernel.

text_columns = ['manner_of_death', 'primary_cause', 'primary_cause_line_a',
                'primary_cause_line_b', 'primary_cause_line_c', 'secondary_cause']
textdf = df[text_columns]
textdf.sample(5)

In [None]:
# shared NLTK helpers: one Porter stemmer and one WordNet lemmatizer

stemmer = PorterStemmer()
wn = WordNetLemmatizer()

def lem_stem(text):
    """Lemmatize `text` as a verb (pos='v'), then Porter-stem the lemma."""
    lemma = wn.lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize `text` and return its kept tokens, lemmatized and stemmed.

    A token is kept when it is longer than 3 characters and is not one of
    gensim's STOPWORDS; each kept token is passed through `lem_stem`.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lem_stem(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [None]:
# Sanity-check the preprocessing on one example row (index label 428).
# The text is looked up by column name rather than by its position in
# `.values`, so the check keeps working if the column order changes.

doc_sample = textdf.loc[428, 'primary_cause']
print('original document: ')
print(doc_sample.split(' '))
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# run every 'primary_cause' entry through preprocess(), then peek at the first ten

processed_docs = textdf['primary_cause'].map(preprocess)
processed_docs.head(10)

## 🔥🔥🔥🔥🔥
How do we customize the part of speech for 'wound' so that it is always treated as a noun and is never lemmatized to 'wind'? Also check whether 'left' is becoming 'leav'.


In [None]:
# First step towards the Bag of Words (bow): build a gensim Dictionary
# (the vocabulary) from the tokenized documents.

dictionary = corpora.Dictionary(processed_docs)

In [None]:
# dictionary.token2id

In [None]:
# dictionary.cfs

In [None]:
# Prune the vocabulary in place: drop tokens found in too few or too many docs.

# These are the tutorial's example settings;
# need to check if they work for our data   🔥🔥🔥

dictionary.filter_extremes(
    no_below=15,     # minimum number of documents a token must appear in
    no_above=0.5,    # maximum fraction of documents a token may appear in
    keep_n=100000,   # cap on the number of tokens kept after filtering
)

In [None]:
# Turn every tokenized document into its bag of words: a list of
# (word's number in the dictionary, times it appears) tuples,
# one tuple per distinct word kept in the dictionary.

bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# same example doc as above
bow_corpus[428]

In [None]:
# the sample doc's bag of words, one readable line per word

bow_doc_428 = bow_corpus[428]
for token_id, count in bow_doc_428:
    print("Word {} (\"{}\") appears {} time(s).".format(token_id,
                                                       dictionary[token_id],
                                                       count))

In [None]:
# first entry of the sample doc's bag of words:
# (number in dictionary, word, times it appears in the document)
first_entry = bow_doc_428[0]
first_entry[0], dictionary[first_entry[0]], first_entry[1]

In [None]:
# fit a tf-idf model on the bag-of-words corpus,
# then apply it to that same corpus to get the tf-idf weighted version

tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# LDA run on bag of words

# random_state pins the model's random initialisation so that re-running the
# notebook does not shuffle the topics. Multicore training may still cause
# small run-to-run differences.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10,
                                       id2word=dictionary, passes=2, workers=2,
                                       random_state=42)


# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# topic scores of the sample doc under the bag-of-words model, highest score first
print(processed_docs[428])

ranked_topics = sorted(lda_model[bow_corpus[428]], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

In [None]:
# LDA run on tfidf

# random_state pins the model's random initialisation so that re-running the
# notebook does not shuffle the topics. Multicore training may still cause
# small run-to-run differences.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10,
                                             id2word=dictionary, passes=2, workers=4,
                                             random_state=42)

# check the words in each topic and their weights to see if they make sense

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
# check where sample doc would be classified in tfidf model
print(processed_docs[428])

# This model was trained on tf-idf weights, so the sample doc is converted to
# its tf-idf vector first. Passing the raw bag-of-words counts would score the
# doc in a different feature space from the one the model was fitted on.
sample_tfidf = tfidf[bow_corpus[428]]

for index, score in sorted(lda_model_tfidf[sample_tfidf], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))