## refined LDA Topic Modeling on full primary cause
Credit: Selva Prabhakaran, https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# in CLI:
# pip install spacy
# pip install pyLDAvis
# python3 -m spacy download 'en_core_web_sm'

In [13]:
# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

# import re
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# import warnings
# warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Load the cleaned dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('../susan/data/cleaned_data_31OCT.csv')

In [3]:
# make a stemmer

# NLTK's English stop words, held in a set so that the per-token membership
# test in preprocess() is a constant-time lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every preprocess() call.
ps = PorterStemmer()

def preprocess(text):
    """Tokenize ``text`` and return the list of its stemmed tokens.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than two characters and is not in ``stop_words``;
    each kept token is stemmed with ``ps`` before being returned.
    """
    return [
        ps.stem(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 2 and tok not in stop_words
    ]

In [4]:
# test it's functioning as expected

# Pick one raw value to eyeball: the row whose index label is 4000, column position 8.
doc_sample = df[df.index == 4000].values[0][8]

# Raw text, split on single spaces.
words = doc_sample.split(' ')
print('original document: ')
print(words)

# The same text after preprocess() (stop-word removal, length filter, stemming).
print('\nstemmed document: ')
print(preprocess(doc_sample))

original document: 
['GUNSHOT', 'WOUND', 'OF', 'THE', 'HEAD']

stemmed document: 
['gunshot', 'wound', 'head']


In [6]:
# Run preprocess() over every value of the 'primary_cause' column,
# giving one list of stemmed tokens per row.
data_words = [preprocess(cause) for cause in df['primary_cause']]

# Peek at the first dozen token lists.
print(data_words[:12])

[['multipl', 'blunt', 'forc', 'injuri', 'motor', 'vehicl', 'collis'], ['multipl', 'gunshot', 'wound'], ['gunshot', 'wound', 'head'], ['multipl', 'gunshot', 'wound'], ['multipl', 'gunshot', 'wound'], ['multipl', 'injuri', 'bicyclist', 'struck', 'motor', 'vehicl'], ['multipl', 'gunshot', 'wound'], ['multipl', 'injuri', 'scooter', 'motor', 'vehicl', 'collis'], ['gunshot', 'wound', 'chest'], ['multipl', 'gunshot', 'wound'], ['multipl', 'injuri', 'jump', 'height'], ['gunshot', 'wound', 'head']]


In [7]:
# Phrase detection models.
# The bigram model is trained on the token lists; the trigram model is trained
# on the bigram-transformed token lists. A higher threshold gives fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=1)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], threshold=1)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [8]:
# Build bigram and trigram functions

def make_bigrams(texts):
    """Return a list with every document of ``texts`` passed through ``bigram_mod``."""
    return [bigram_mod[tokens] for tokens in texts]

def make_trigrams(texts):
    """Return a list with every document of ``texts`` passed through ``bigram_mod`` and then ``trigram_mod``."""
    return [trigram_mod[bigram_mod[tokens]] for tokens in texts]

In [9]:
# Apply the phrase models to the token lists
data_words_bigrams = make_bigrams(data_words)
data_words_trigrams = make_trigrams(data_words)  # bigrams may be more useful?

# Load the small English spaCy pipeline with the parser and NER components disabled
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [10]:
# Documents the dictionary and the corpus are built from
texts = data_words_trigrams

# Dictionary mapping between tokens and integer ids
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

In [11]:
# Human-readable view of corpus entries 30-39: token strings in place of ids
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[30:40]]

[[('multipl', 1), ('injuri_train_strike', 1), ('pedestrian', 1)],
 [('blunt_forc_injuri', 1), ('fall', 1), ('head', 1)],
 [('multipl_gunshot_wound', 1)],
 [('gunshot_wound_head', 1)],
 [('blunt_forc_injuri', 1), ('fall', 1), ('head', 1)],
 [('gunshot_wound_chest', 1)],
 [('blunt_forc_injuri', 1),
  ('auger', 1),
  ('cement', 1),
  ('entangl', 1),
  ('lower_extrem', 1)],
 [('blunt_forc_injuri', 1), ('fall', 1), ('head', 1), ('dump_truck', 1)],
 [('choke_food_bolu', 1)],
 [('gunshot_wound_chest', 1)]]

In [23]:
# Train the LDA model on the bag-of-words corpus (fixed random_state for repeatable runs)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)


In [14]:
# Print keywords in each topic
pprint(lda_model.print_topics())
# Model applied to the whole corpus; doc_lda is not referenced again in the visible cells.
doc_lda = lda_model[corpus]

[(0,
  '0.076*"multipl" + 0.073*"hang" + 0.069*"close_head_injuri" + 0.069*"acut" + '
  '0.028*"asphyxia_hang" + 0.026*"unwit_fall" + 0.017*"injuri_fall_height" + '
  '0.016*"neck" + 0.014*"gunshot_wound_torso" + 0.013*"fractur_fall"'),
 (1,
  '0.277*"toxic" + 0.164*"heroin" + 0.063*"combin_drug" + 0.063*"cocain" + '
  '0.053*"ethanol" + 0.049*"fentanyl" + '
  '0.031*"hypertens_cardiovascular_diseas" + 0.022*"asphyxi_hang" + '
  '0.022*"combin" + 0.018*"methadon"'),
 (2,
  '0.225*"multipl_gunshot_wound" + 0.136*"gunshot_wound_head" + 0.096*"intox" '
  '+ 0.072*"multipl_injuri_motor" + 0.051*"alcohol" + 0.040*"complic" + '
  '0.037*"opiat" + 0.037*"vehicl_collis" + 0.037*"vehicl_strike_pedestrian" + '
  '0.027*"acut_ethanol"'),
 (3,
  '0.093*"multipl" + 0.091*"fall" + 0.088*"complic" + 0.060*"injuri" + '
  '0.031*"gunshot_wound_chest" + 0.030*"blunt_forc_injuri" + 0.028*"head" + '
  '0.019*"carbon_monoxid" + 0.017*"pedestrian" + 0.017*"gunshot_wound_back"')]


In [24]:
# Report gensim's log_perplexity value for the training corpus.
# NOTE(review): the gensim docs describe this return value as a per-word likelihood
# bound rather than the perplexity itself, so "lower is better" may not apply to the
# number printed here — confirm before comparing models on it.

print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute the c_v coherence score of the model against the trigram token lists

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -4.795527700044841

Coherence Score:  0.6295936623765793


In [25]:
# Visualize the topics

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression so the notebook displays the prepared visualization
vis

In [None]:
# Credit: https://stackoverflow.com/questions/70295773/extract-topic-scores-for-documents-lda-gensim-python

# Find dominant topic in each doc

##dominant topic for each document
def format_topics_sentences(ldamodel=lda_model,
                            corpus=corpus,
                            texts=data_words,
                            n=1):
    """
    Extract the ``n`` most probable topics for every document in ``corpus``
    using an existing LDA model.

    Parameters
    ----------
    ldamodel
        Trained model; must support ``ldamodel[corpus]`` and
        ``ldamodel.show_topic(topic_id)``. Defaults to the notebook-level
        ``lda_model`` (bound when this function is defined).
    corpus
        Bag-of-words corpus to score. Defaults to the notebook-level ``corpus``.
    texts
        Sequence indexable by document position; ``texts[i]`` is attached to
        every row produced for document ``i``. Defaults to ``data_words``.
    n
        Maximum number of topics to keep per document (highest probability
        first). Values below 1 produce no rows.

    Returns
    -------
    pandas.DataFrame
        One row per kept (document, topic) pair with the columns
        ``Document``, ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords`` and ``original_texts``. An empty frame with these
        columns is returned when nothing is kept.
    """
    columns = ["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    # A model trained with per_word_topics=True returns, for each document, a
    # (topic distribution, word topics, phi values) tuple instead of the bare
    # topic distribution; only the first element is wanted here.
    per_word = getattr(ldamodel, "per_word_topics", False)
    keyword_cache = {}  # topic id -> keyword string, so show_topic runs once per topic
    rows = []

    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        if per_word:
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        # max(n, 0) keeps a negative n from turning into an "all but the last" slice
        for topic_num, prop_topic in ranked[:max(n, 0)]:
            topic_num = int(topic_num)
            if topic_num not in keyword_cache:
                keyword_cache[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keyword_cache[topic_num])
            )

    # Build the frame once from the collected rows: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(rows, columns=columns)

    # Add original text to the end of the output
    text_col = [texts[doc_no] for doc_no in sent_topics_df["Document"]]
    contents = pd.Series(text_col, name='original_texts', dtype=object)
    return pd.concat([sent_topics_df, contents], axis=1)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_words, n=1)

# Format
# format_topics_sentences already returns five columns, the first of which is
# the document number. The index is therefore dropped rather than promoted to
# a sixth column, which the five names below could not cover.
df_dominant_topic = df_topic_sents_keywords.reset_index(drop=True)
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Materialize the model's output for every document in the corpus so the notebook displays it
list(lda_model[corpus])

In [None]:
from gensim.test.utils import common_texts, common_corpus, common_dictionary
from gensim.models import LdaModel

# Fit a quick throwaway LDA model on gensim's bundled common_corpus / common_dictionary
optimal_model = LdaModel(
    corpus=common_corpus,
    id2word=common_dictionary,
    num_topics=10,
)

In [None]:
# Display gensim's bundled sample texts to see the shape expected for `texts`
common_texts

In [None]:
# Display the first ten bigram-transformed token lists
data_words_bigrams[:10]

In [None]:
# Credit: https://stackoverflow.com/questions/70295773/extract-topic-scores-for-documents-lda-gensim-python

##dominant topic for each document
def format_topics_sentences(ldamodel=optimal_model,
                            corpus=common_corpus,
                            texts=common_texts,
                            n=1):
    """
    Extract the ``n`` most probable topics for every document in ``corpus``
    using an existing LDA model.

    Parameters
    ----------
    ldamodel
        Trained model; must support ``ldamodel[corpus]`` and
        ``ldamodel.show_topic(topic_id)``. Defaults to ``optimal_model``
        (bound when this function is defined).
    corpus
        Bag-of-words corpus to score. Defaults to gensim's ``common_corpus``.
    texts
        Sequence indexable by document position; ``texts[i]`` is attached to
        every row produced for document ``i``. Defaults to ``common_texts``.
    n
        Maximum number of topics to keep per document (highest probability
        first). Values below 1 produce no rows.

    Returns
    -------
    pandas.DataFrame
        One row per kept (document, topic) pair with the columns
        ``Document``, ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords`` and ``original_texts``. An empty frame with these
        columns is returned when nothing is kept.
    """
    columns = ["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    # A model trained with per_word_topics=True returns, for each document, a
    # (topic distribution, word topics, phi values) tuple instead of the bare
    # topic distribution; only the first element is wanted here.
    per_word = getattr(ldamodel, "per_word_topics", False)
    keyword_cache = {}  # topic id -> keyword string, so show_topic runs once per topic
    rows = []

    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        if per_word:
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        # max(n, 0) keeps a negative n from turning into an "all but the last" slice
        for topic_num, prop_topic in ranked[:max(n, 0)]:
            topic_num = int(topic_num)
            if topic_num not in keyword_cache:
                keyword_cache[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keyword_cache[topic_num])
            )

    # Build the frame once from the collected rows instead of concatenating a
    # one-row frame per document, which is quadratic in the corpus size.
    sent_topics_df = pd.DataFrame(rows, columns=columns)

    # Add original text to the end of the output
    text_col = [texts[doc_no] for doc_no in sent_topics_df["Document"]]
    contents = pd.Series(text_col, name='original_texts', dtype=object)
    return pd.concat([sent_topics_df, contents], axis=1)


In [None]:
# Try the helper on gensim's bundled sample corpus and display the resulting frame
format_topics_sentences(ldamodel=optimal_model, corpus=common_corpus, texts=common_texts, n=1)

In [None]:
##dominant topic for each document
def format_topics_sentences(ldamodel=lda_model,
                            corpus=corpus,
                            texts=data_words_bigrams,
                            n=1):
    """
    Extract the ``n`` most probable topics for every document in ``corpus``
    using an existing LDA model.

    Parameters
    ----------
    ldamodel
        Trained model; must support ``ldamodel[corpus]`` and
        ``ldamodel.show_topic(topic_id)``. Defaults to the notebook-level
        ``lda_model`` (bound when this function is defined).
    corpus
        Bag-of-words corpus to score. Defaults to the notebook-level ``corpus``.
    texts
        Sequence indexable by document position; ``texts[i]`` is attached to
        every row produced for document ``i``. Defaults to
        ``data_words_bigrams``.
    n
        Maximum number of topics to keep per document (highest probability
        first). Values below 1 produce no rows.

    Returns
    -------
    pandas.DataFrame
        One row per kept (document, topic) pair with the columns
        ``Document``, ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords`` and ``original_texts``. An empty frame with these
        columns is returned when nothing is kept.
    """
    columns = ["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    # A model trained with per_word_topics=True (as lda_model is) returns, for
    # each document, a (topic distribution, word topics, phi values) tuple
    # instead of the bare topic distribution; only the first element is wanted.
    per_word = getattr(ldamodel, "per_word_topics", False)
    keyword_cache = {}  # topic id -> keyword string, so show_topic runs once per topic
    rows = []

    for doc_no, doc_topics in enumerate(ldamodel[corpus]):
        if per_word:
            doc_topics = doc_topics[0]
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        # max(n, 0) keeps a negative n from turning into an "all but the last" slice
        for topic_num, prop_topic in ranked[:max(n, 0)]:
            topic_num = int(topic_num)
            if topic_num not in keyword_cache:
                keyword_cache[topic_num] = ", ".join(
                    word for word, _ in ldamodel.show_topic(topic_num)
                )
            rows.append(
                (doc_no, topic_num, round(float(prop_topic), 4), keyword_cache[topic_num])
            )

    # Build the frame once from the collected rows instead of concatenating a
    # one-row frame per document, which is quadratic in the corpus size.
    sent_topics_df = pd.DataFrame(rows, columns=columns)

    # Add original text to the end of the output
    text_col = [texts[doc_no] for doc_no in sent_topics_df["Document"]]
    contents = pd.Series(text_col, name='original_texts', dtype=object)
    return pd.concat([sent_topics_df, contents], axis=1)


In [None]:
# Dominant topic per document for the primary-cause corpus, shown alongside the bigram token lists
format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_words_bigrams, n=1)