## Refined LDA topic modeling on the full primary cause
Credit where credit is due: Selva Prabhakaran, https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# in CLI:
# pip install spacy
# pip install pyLDAvis
# python3 -m spacy download 'en_core_web_sm'

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer


import warnings
# Ignore every DeprecationWarning from here on.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [2]:
# Read the cleaned CSV export into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='../susan/data/cleaned_data_31OCT.csv')

In [None]:
# Stem the words instead of lemmatizing them: the text has few verbs, and an
# earlier lemmatization attempt mangled non-verbs that are common in this
# dataset (e.g. 'wound' became 'wind' and 'left' became 'leav').

stop_words = stopwords.words('english')
# Frozen copy used for O(1) membership tests; `stop_words` itself stays a list.
_STOP_WORD_SET = frozenset(stop_words)
ps = PorterStemmer()


def preprocess(text):
    """Tokenize *text* and return its stemmed, stop-word-free tokens.

    Args:
        text: Raw document text. A missing value (``None`` or a float NaN)
            is treated as an empty document.

    Returns:
        A list of Porter-stemmed tokens, keeping only tokens that are longer
        than two characters and are not English stop words.
    """
    # `text != text` is true only for NaN; returning [] keeps one entry per
    # input row instead of crashing inside the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return [
        ps.stem(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in _STOP_WORD_SET and len(token) > 2
    ]

In [None]:
# test it's functioning as expected

# Take the row whose index label is 10_001 and read the value at column
# position 8.
doc_sample = df[df.index == 10_001].values[0][8]

# Raw text, split on single spaces.
print('original document: ')
words = list(doc_sample.split(' '))
print(words)

# The same text after preprocess().
print('\nstemmed document: ')
print(preprocess(doc_sample))

In [None]:
# Run preprocess() over every 'primary_cause' entry: one token list per row.
data_words = [preprocess(cause) for cause in df['primary_cause']]

#print(data_words[:15])

In [None]:
# Phrase detection. Bigrams worked better, so trigrams were not used in the
# end; both models are still built here.
bigram = gensim.models.Phrases(sentences=data_words, min_count=1, threshold=1)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram model is trained on the output of the bigram model.
trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=1)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
# Build bigram and trigram functions

def make_bigrams(texts):
    """Pass every tokenized document in *texts* through ``bigram_mod``.

    Returns a list holding one transformed document per input document.
    """
    return [bigram_mod[tokens] for tokens in texts]

def make_trigrams(texts):
    """Pass every document in *texts* through ``bigram_mod`` then ``trigram_mod``.

    Returns a list holding one transformed document per input document.
    """
    trigram_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        trigram_docs.append(trigram_mod[with_bigrams])
    return trigram_docs

In [None]:
# Apply the phrase models to the stemmed tokens (bigrams may be more useful).
data_words_bigrams = [bigram_mod[doc] for doc in data_words]
data_words_trigrams = [trigram_mod[bigram_mod[doc]] for doc in data_words]

# Load spaCy's 'en_core_web_sm' pipeline with the parser and NER components
# disabled (for efficiency).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# The bigram-processed documents are the texts the corpus is built from.
texts = data_words_bigrams

# Dictionary built over those texts.
id2word = corpora.Dictionary(texts)

# Term-document frequency: one bag-of-words per document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Spot-check documents 25-29 in readable form: (term, frequency in doc).
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[25:30]
]

In [None]:
# # Build the LDA model 1

# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=4,
#                                            random_state=42,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)


In [None]:
# # Print keywords in each topic
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [None]:
# # Compute perplexity, a measure of how good the model is. The lower the better.

# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# # Compute coherence score, a measure of how well the elements of the topic support each other

# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

In [None]:
# Build LDA model 2: five topics, 20 passes, chunks of 3000 documents.
lda_model2 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=42,
    update_every=1,
    chunksize=3000,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)


In [None]:
# Apply the model to the whole corpus.
doc_lda = lda_model2[corpus]

# Print the keywords of each topic.
pprint(lda_model2.print_topics())

In [None]:
# Visualize the topics inside the notebook.

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model2,
    corpus=corpus,
    dictionary=id2word,
)

vis

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. A higher bound (closer to zero) is better; the perplexity derived
# from it, 2 ** (-bound), is the value for which lower is better.

per_word_bound = lda_model2.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute coherence score, a measure of how well the elements of the topic support each other

coherence_model_lda = CoherenceModel(model=lda_model2, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Topic distribution of the first document in the corpus.
lda_model2.get_document_topics(bow=corpus[0])

In [None]:
# Show the first row of the DataFrame.
df.head(n=1)

In [None]:
def best_topic_finder(corpus, model=None):
    """Return the highest-probability topic of every document in *corpus*.

    Args:
        corpus: Iterable of bag-of-words documents.
        model: Object exposing ``get_document_topics(bow)`` that returns
            ``(topic_id, probability)`` pairs. Defaults to the notebook-level
            ``lda_model2``, looked up when the function is called.

    Returns:
        A list with one ``(topic_id, probability)`` tuple per document, in
        corpus order. Ties go to the pair the model lists first.
    """
    if model is None:
        model = lda_model2
    # Ask the model once per document so the topic id and its probability are
    # taken from the same result; max() avoids sorting the whole list.
    return [
        tuple(max(model.get_document_topics(bow), key=lambda pair: pair[1]))
        for bow in corpus
    ]


In [None]:
def best_topic(corpus, model=None):
    """Return the id of the highest-probability topic for every document.

    Args:
        corpus: Iterable of bag-of-words documents.
        model: Object exposing ``get_document_topics(bow)`` that returns
            ``(topic_id, probability)`` pairs. Defaults to the notebook-level
            ``lda_model2``, looked up when the function is called.

    Returns:
        A list of topic ids, one per document, in corpus order. Ties go to
        the pair the model lists first.
    """
    if model is None:
        model = lda_model2
    # max() picks the top pair directly; no need to sort the whole list.
    return [
        max(model.get_document_topics(bow), key=lambda pair: pair[1])[0]
        for bow in corpus
    ]

# Dominant topic id for every document in the corpus.
best_topic_column = best_topic(corpus=corpus)

In [None]:
def topic_perc(corpus, model=None):
    """Return the probability of the dominant topic for every document.

    Args:
        corpus: Iterable of bag-of-words documents.
        model: Object exposing ``get_document_topics(bow)`` that returns
            ``(topic_id, probability)`` pairs. Defaults to the notebook-level
            ``lda_model2``, looked up when the function is called.

    Returns:
        A list with the highest topic probability of each document, in
        corpus order.
    """
    if model is None:
        model = lda_model2
    # max() picks the top pair directly; no need to sort the whole list.
    return [
        max(model.get_document_topics(bow), key=lambda pair: pair[1])[1]
        for bow in corpus
    ]

# Probability of the dominant topic for every document in the corpus.
topic_perc_column = topic_perc(corpus=corpus)

In [None]:
# Store the model's topic output for the whole corpus as a new column.
df['long_topic'] = lda_model2.get_document_topics(bow=corpus)
df.head(n=1)

In [None]:
# Add the per-document dominant topic ids (from best_topic) as a column.
df['best_topic_num'] = best_topic_column

In [None]:
# Translate each dominant topic id into a descriptive label.
df['best_topic_name'] = df['best_topic_num'].map({
    0: 'gunshot_wound_suicide',
    1: 'gunshot_wounds_homicide',
    2: 'vehicle_collision',
    3: 'drug_overdose',
    4: 'miscellaneous',
})

In [None]:
# Add the probability of each document's dominant topic (from topic_perc).
df['best_topic_perc'] = topic_perc_column

In [None]:
# List the DataFrame's column labels, including the topic columns added above.
df.columns

In [None]:
# Sample ten rows to compare the raw cause text with the assigned topics.
# NOTE(review): the dominant-topic id column created in this notebook is
# 'best_topic_num'; selecting 'best_topic' raises KeyError unless the CSV
# already ships a column with that name - confirm against the data.
df[['primary_cause_line_a', 'primary_cause_line_b', 'long_topic', 'best_topic_num', 'best_topic_name', 'best_topic_perc']].sample(10)

In [None]:
# Credit: https://stackoverflow.com/questions/70295773/extract-topic-scores-for-documents-lda-gensim-python

# Find dominant topic in each doc

##dominant topic for each document
def format_topics_sentences(ldamodel=None,
                            corpus=None,
                            texts=None,
                            n=1):
    """Tabulate the ``n`` highest-probability topics of every document.

    Args:
        ldamodel: Trained model supporting ``ldamodel[corpus]`` and
            ``ldamodel.show_topic(topic_id)``. Defaults to the notebook-level
            ``lda_model2`` (``lda_model`` is never defined: its cell is
            commented out).
        corpus: Bag-of-words corpus to score. Defaults to the notebook-level
            ``corpus``.
        texts: Tokenized documents aligned with ``corpus``. Defaults to the
            notebook-level ``data_words``.
        n: Number of top topics to keep per document; values below 1 keep none.

    Returns:
        A DataFrame with columns ``Document``, ``Dominant_Topic``,
        ``Perc_Contribution``, ``Topic_Keywords`` and ``original_texts``,
        holding one row per (document, kept topic). It is empty, but keeps
        its columns, when nothing is kept.
    """
    # Defaults are resolved at call time so a re-run cell (e.g. a retrained
    # model) is picked up without redefining this function.
    if ldamodel is None:
        ldamodel = lda_model2
    if corpus is None:
        # The parameter shadows the notebook-level name, hence globals().
        corpus = globals()["corpus"]
    if texts is None:
        texts = data_words

    keep = max(n, 0)
    keywords_by_topic = {}
    rows = []
    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        # A model built with per_word_topics=True yields a tuple whose first
        # item is the (topic_id, probability) list; unwrap it.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        for topic_num, prop_topic in ranked[:keep]:
            topic_num = int(topic_num)
            # Keyword strings depend only on the topic, so build each once.
            if topic_num not in keywords_by_topic:
                keywords_by_topic[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
            )

    # Build the frame in one go: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        rows,
        columns=["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"],
    )

    # Add original text to the end of the output
    sent_topics_df["original_texts"] = [texts[row[0]] for row in rows]
    return sent_topics_df

# `lda_model` is never defined (its cell is commented out); the trained model
# in this notebook is `lda_model2`.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model2, corpus=corpus, texts=data_words, n=1)

# Format
# The result already has five columns and "Document" holds the document
# number, so relabel a copy directly. reset_index() would add a sixth column
# and make the five-name assignment below raise a length-mismatch error.
df_dominant_topic = df_topic_sents_keywords.copy()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Materialize the model's output for every document. `lda_model` is never
# defined (its cell is commented out), so use the trained `lda_model2`.
list(lda_model2[corpus])

In [None]:
from gensim.test.utils import common_texts, common_corpus, common_dictionary
from gensim.models import LdaModel

# Train a quick LDA model on gensim's bundled common_corpus / common_dictionary.
optimal_model = LdaModel(
    corpus=common_corpus,
    id2word=common_dictionary,
    num_topics=10,
)

In [None]:
# Display gensim's bundled sample texts imported above.
common_texts

In [None]:
# Display the first ten bigram-processed documents for comparison.
data_words_bigrams[:10]

In [None]:
# Credit: https://stackoverflow.com/questions/70295773/extract-topic-scores-for-documents-lda-gensim-python

##dominant topic for each document
def format_topics_sentences(ldamodel=None,
                            corpus=None,
                            texts=None,
                            n=1):
    """Tabulate the ``n`` highest-probability topics of every document.

    Args:
        ldamodel: Trained model supporting ``ldamodel[corpus]`` and
            ``ldamodel.show_topic(topic_id)``. Defaults to the notebook-level
            ``optimal_model``.
        corpus: Bag-of-words corpus to score. Defaults to ``common_corpus``.
        texts: Tokenized documents aligned with ``corpus``. Defaults to
            ``common_texts``.
        n: Number of top topics to keep per document; values below 1 keep none.

    Returns:
        A DataFrame with columns ``Document``, ``Dominant_Topic``,
        ``Perc_Contribution``, ``Topic_Keywords`` and ``original_texts``,
        holding one row per (document, kept topic). It is empty, but keeps
        its columns, when nothing is kept.
    """
    # Defaults are resolved at call time so a re-run cell (e.g. a retrained
    # model) is picked up without redefining this function.
    if ldamodel is None:
        ldamodel = optimal_model
    if corpus is None:
        corpus = common_corpus
    if texts is None:
        texts = common_texts

    keep = max(n, 0)
    keywords_by_topic = {}
    rows = []
    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        # A model built with per_word_topics=True yields a tuple whose first
        # item is the (topic_id, probability) list; unwrap it.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        for topic_num, prop_topic in ranked[:keep]:
            topic_num = int(topic_num)
            # Keyword strings depend only on the topic, so build each once.
            if topic_num not in keywords_by_topic:
                keywords_by_topic[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
            )

    # Build the frame once instead of concatenating a one-row frame per
    # topic, which copies the whole frame on every iteration.
    sent_topics_df = pd.DataFrame(
        rows,
        columns=["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"],
    )

    # Add original text to the end of the output
    sent_topics_df["original_texts"] = [texts[row[0]] for row in rows]
    return sent_topics_df


In [None]:
# Try the helper on gensim's bundled sample data, keeping the single top
# topic per document.
format_topics_sentences(
    ldamodel=optimal_model,
    corpus=common_corpus,
    texts=common_texts,
    n=1,
)

In [None]:
##dominant topic for each document
def format_topics_sentences(ldamodel=None,
                            corpus=None,
                            texts=None,
                            n=1):
    """Tabulate the ``n`` highest-probability topics of every document.

    Args:
        ldamodel: Trained model supporting ``ldamodel[corpus]`` and
            ``ldamodel.show_topic(topic_id)``. Defaults to the notebook-level
            ``lda_model2`` (``lda_model`` is never defined: its cell is
            commented out).
        corpus: Bag-of-words corpus to score. Defaults to the notebook-level
            ``corpus``.
        texts: Tokenized documents aligned with ``corpus``. Defaults to the
            notebook-level ``data_words_bigrams``.
        n: Number of top topics to keep per document; values below 1 keep none.

    Returns:
        A DataFrame with columns ``Document``, ``Dominant_Topic``,
        ``Perc_Contribution``, ``Topic_Keywords`` and ``original_texts``,
        holding one row per (document, kept topic). It is empty, but keeps
        its columns, when nothing is kept.
    """
    # Defaults are resolved at call time so a re-run cell (e.g. a retrained
    # model) is picked up without redefining this function.
    if ldamodel is None:
        ldamodel = lda_model2
    if corpus is None:
        # The parameter shadows the notebook-level name, hence globals().
        corpus = globals()["corpus"]
    if texts is None:
        texts = data_words_bigrams

    keep = max(n, 0)
    keywords_by_topic = {}
    rows = []
    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        # A model built with per_word_topics=True yields a tuple whose first
        # item is the (topic_id, probability) list; unwrap it.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        for topic_num, prop_topic in ranked[:keep]:
            topic_num = int(topic_num)
            # Keyword strings depend only on the topic, so build each once.
            if topic_num not in keywords_by_topic:
                keywords_by_topic[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
            )

    # Build the frame once instead of concatenating a one-row frame per
    # topic, which copies the whole frame on every iteration.
    sent_topics_df = pd.DataFrame(
        rows,
        columns=["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"],
    )

    # Add original text to the end of the output
    sent_topics_df["original_texts"] = [texts[row[0]] for row in rows]
    return sent_topics_df


In [None]:
# `lda_model` is never defined (its cell is commented out); score the corpus
# with the trained `lda_model2` instead.
format_topics_sentences(ldamodel=lda_model2, corpus=corpus, texts=data_words_bigrams, n=1)