Author: Susan Hopper

## Refined LDA Topic Modeling on Full Primary Cause
Credit where credit is due! Based on the work of Selva Prabhakaran, https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# In CLI:
# pip install pyLDAvis

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

import pickle

In [2]:
# Load the cleaned dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='../1_data/cleaned_data_31OCT.csv')

In [None]:
# Preview the first five rows
df.head(n=5)

In [3]:
# Define a function to stem the words
# I'm not lemmatizing b/c we don't have many verbs
# and when I lemmatized before it messed up non-verbs commonly found in our dataset,
# like turning 'wound' into 'wind' and 'left' into 'leav'

stop_words = stopwords.words('english')
# Snapshot used for O(1) membership tests inside preprocess();
# `stop_words` itself stays a list. Re-run this cell if `stop_words` is edited.
_stop_word_set = frozenset(stop_words)
ps = PorterStemmer()


def preprocess(text):
    """Tokenize *text*, drop stop words and short tokens, and stem the rest.

    Parameters
    ----------
    text : str
        Raw text to process; it is tokenized with
        ``gensim.utils.simple_preprocess``.

    Returns
    -------
    list of str
        Porter-stemmed tokens in their original order. Tokens that appear in
        the English stop-word list, or that have two or fewer characters,
        are left out.
    """
    return [
        ps.stem(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in _stop_word_set
    ]

In [None]:
# Sanity check: compare one raw record with its stemmed form

doc_sample = df[df.index == 10_001].values[0][8]

print('original document: ')
# Splitting on single spaces already yields the list of raw words
words = doc_sample.split(' ')
print(words)

print('\nstemmed document: ')
print(preprocess(doc_sample))

In [4]:
# Run preprocess() over every entry of the 'primary_cause' column

data_words = [preprocess(cause) for cause in df['primary_cause']]

In [5]:
# Train a bigram (Phrases) model on the stemmed tokens and wrap it in a Phraser
# Trigrams were also tried, but bigrams worked better

bigram = gensim.models.Phrases(
    sentences=data_words,
    min_count=1,
    threshold=1,
)

bigram_mod = gensim.models.phrases.Phraser(phrases_model=bigram)

def make_bigrams(texts):
    """Return a list with each document of *texts* passed through ``bigram_mod``."""
    return [bigram_mod[tokens] for tokens in texts]

# Apply the bigram model to every document
data_words_bigrams = make_bigrams(texts=data_words)

# Show the first ten token lists of data_words (the input to make_bigrams)
data_words[0:10]

[['multipl', 'blunt', 'forc', 'injuri', 'motor', 'vehicl', 'collis'],
 ['multipl', 'gunshot', 'wound'],
 ['gunshot', 'wound', 'head'],
 ['multipl', 'gunshot', 'wound'],
 ['multipl', 'gunshot', 'wound'],
 ['multipl', 'injuri', 'bicyclist', 'struck', 'motor', 'vehicl'],
 ['multipl', 'gunshot', 'wound'],
 ['multipl', 'injuri', 'scooter', 'motor', 'vehicl', 'collis'],
 ['gunshot', 'wound', 'chest'],
 ['multipl', 'gunshot', 'wound']]

In [6]:
# Build the token <-> id dictionary from the bigrammed documents
id2word = corpora.Dictionary(documents=data_words_bigrams)

# The bigrammed documents are the texts fed to the model
texts = data_words_bigrams

# Convert every document to bag-of-words form with the dictionary
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Show corpus entries 25-29 in human-readable (term, frequency in doc) form

[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[25:30]
]

In [7]:
# Fit the LDA topic model
# num_topics, chunksize and passes were tuned by experiment;
# this configuration gave the best coherence score

lda_model2 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=42,
    update_every=1,
    chunksize=3000,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)


In [None]:
# Apply the fitted model to the whole corpus
doc_lda = lda_model2[corpus]

# Show the keywords that make up each topic
pprint(lda_model2.print_topics())

In [8]:
# Prepare and display the pyLDAvis view of the topics
# NOTE(review): newer pyLDAvis releases ship this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model2,
    corpus=corpus,
    dictionary=id2word,
)

vis

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself: a higher bound (closer to zero) is better. Perplexity is
# 2 ** (-bound), and for perplexity lower is better.
per_word_bound = lda_model2.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute coherence score, a measure of how well the elements of the topic support each other.
# Coherence relates to human comprehension better than perplexity
coherence_model_lda = CoherenceModel(model=lda_model2, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Thanks, Tim!
# Topic output for the first document of the corpus
lda_model2.get_document_topics(bow=corpus[0])

In [None]:
# Define a function to find the most relevant topic
def best_topic(corpus, model=None):
    """Return the id of the highest-scoring topic for every document.

    Parameters
    ----------
    corpus : iterable
        Documents in the form accepted by ``model.get_document_topics``.
        Any iterable works; it does not need to support indexing.
    model : optional
        Object exposing ``get_document_topics(bow)`` that returns
        ``(topic_id, score)`` pairs. Defaults to the notebook-level
        ``lda_model2``.

    Returns
    -------
    list
        One topic id per document, in corpus order.

    Raises
    ------
    ValueError
        If the model returns no topics at all for a document.
    """
    if model is None:
        model = lda_model2
    # max() keeps the first of several equally-scored topics, which is the
    # same element a stable descending sort would put first.
    return [
        max(model.get_document_topics(bow), key=lambda pair: pair[1])[0]
        for bow in corpus
    ]

# Most relevant topic id for every document in the corpus
best_topic_column = best_topic(corpus=corpus)

In [None]:
# Define a function to get the percentage for the most relevant topic
def topic_perc(corpus, model=None):
    """Return the score of the highest-scoring topic for every document.

    Parameters
    ----------
    corpus : iterable
        Documents in the form accepted by ``model.get_document_topics``.
        Any iterable works; it does not need to support indexing.
    model : optional
        Object exposing ``get_document_topics(bow)`` that returns
        ``(topic_id, score)`` pairs. Defaults to the notebook-level
        ``lda_model2``.

    Returns
    -------
    list
        One score per document, in corpus order.

    Raises
    ------
    ValueError
        If the model returns no topics at all for a document.
    """
    if model is None:
        model = lda_model2
    # max() picks the top-scoring pair directly, so no full sort is needed.
    return [
        max(model.get_document_topics(bow), key=lambda pair: pair[1])[1]
        for bow in corpus
    ]

# Score of the most relevant topic for every document in the corpus
topic_perc_column = topic_perc(corpus=corpus)

In [None]:
# Add topic columns to the df
# long_topic: output of lda_model2.get_document_topics() for the whole corpus.
# NOTE(review): this is a separate get_document_topics() call from the ones
# that produced best_topic_column and topic_perc_column — confirm the three
# columns stay consistent with each other.
df['long_topic'] = lda_model2.get_document_topics(corpus)
# best_topic_num: id of the top-scoring topic for each document
df['best_topic_num'] = best_topic_column
# best_topic_name: hand-assigned label for each of the five topic ids
df['best_topic_name'] = df['best_topic_num'].map(
                                                {0:'one_gunshot_wound', 
                                                 1:'gunshot_wounds_fall', 
                                                 2:'vehicle_collision', 
                                                 3:'drug_overdose', 
                                                 4:'miscellaneous'})
# best_topic_perc: score of that top topic for each document
df['best_topic_perc'] = topic_perc_column

In [None]:
# Spot-check ten random rows of the cause and topic columns
df[
    [
        'primary_cause_line_a',
        'primary_cause_line_b',
        'long_topic',
        'best_topic_num',
        'best_topic_name',
        'best_topic_perc',
    ]
].sample(n=10)

In [None]:
# Write the DataFrame to CSV, leaving out the row index

df.to_csv(path_or_buf='../1_data/df_with_topics.csv', index=False)

In [None]:
# Read the saved CSV back in and look at its last five rows
df2 = pd.read_csv(filepath_or_buffer='../1_data/df_with_topics.csv')
df2.tail(n=5)

In [8]:
# Serialize the fitted LDA model to disk with pickle

with open(file='../5_pickled_models/topic_modeling.pkl', mode='wb') as f:
    pickle.dump(obj=lda_model2, file=f)