In [1]:
import pandas as pd
import spacy
from time import time
from spacy.language import Language

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("merge_noun_chunks")
nlp.add_pipe("merge_entities")

# @Language.component("remove_stopwords")
# def remove_stopwords(doc):
#     # This will remove stopwords and punctuation.
#     # Use token.text to return strings, which we'll need for Gensim.
#     doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
#     return doc

#nlp.add_pipe("remove_stopwords")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [2]:
preprocess_start = time()
df = pd.read_csv('corpus_sample/corpus_10k.csv')

In [3]:
body_text = df.loc[:, 'body_text']

emails = []
include_named_entities = False

for email in list(nlp.pipe(body_text)):
    if include_named_entities == True:
        emails.append([token.lemma_.lower() for token in email if not token.is_stop and token.text.isalpha()])
    else:
        emails.append([token.lemma_.lower() for token in email if not token.is_stop and token.text.isalpha() and not token.ent_type_])

# for text in body_text:
#     email = nlp(text)
#     if include_named_entities == True:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha()])
#     else:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha() and not token.ent_type_])


In [4]:
# for email in list(nlp.pipe(body_text[0:3])):
#     for token in email:
#         print(token.text, token.lemma_,token.pos_)

In [5]:
#print(emails[0:2])

In [6]:
import gensim
import gensim.corpora as corpora
# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(emails)

# Filter out words that occur less than 10 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)


corpus = [dictionary.doc2bow(doc) for doc in emails]

In [7]:
preprocess_end = time()
print("Time to pre-process corpus: " + str(preprocess_end - preprocess_start))

Time to pre-process corpus: 906.225252866745


In [8]:
print(dictionary)

Dictionary<2703 unique tokens: ['ad', 'ally', 'anymore', 'ask', 'check']...>


In [9]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 2703
Number of documents: 9978


In [10]:
# Enable logging to see the progress of training
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [11]:
lda_start = time()

In [12]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2023-06-22 15:51:04,793 : INFO : using autotuned alpha, starting with [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
2023-06-22 15:51:04,794 : INFO : using serial LDA version on this node
2023-06-22 15:51:04,797 : INFO : running online (multi-pass) LDA training, 20 topics, 20 passes over the supplied corpus of 9978 documents, updating model once every 9978 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2023-06-22 15:51:04,798 : INFO : PROGRESS: pass 0, at document #9978/9978
2023-06-22 15:51:12,262 : INFO : optimized alpha [0.04377404, 0.03926013, 0.039902076, 0.043329056, 0.042103216, 0.04066257, 0.040805425, 0.03968541, 0.041507225, 0.040424135, 0.041632965, 0.043468017, 0.040006954, 0.042437263, 0.040900964, 0.04550194, 0.03953081, 0.043827605, 0.04021289, 0.03976245]
2023-06-22 15:51:12,265 : INFO : topic #1 (0.039): 0.020*"know" + 0.016*"chip" + 0.

2023-06-22 15:51:26,197 : INFO : topic #2 (0.031): 0.041*"elect" + 0.029*"victory" + 0.026*"change" + 0.021*"end" + 0.018*"win" + 0.017*"take" + 0.016*"raise" + 0.016*"work" + 0.015*"afford" + 0.015*"dedicate"
2023-06-22 15:51:26,198 : INFO : topic #7 (0.032): 0.082*"contribute" + 0.018*"reach" + 0.018*"deductible" + 0.017*"know" + 0.016*"right" + 0.016*"authorize" + 0.013*"date" + 0.013*"follow" + 0.013*"information" + 0.012*"contribution"
2023-06-22 15:51:26,198 : INFO : topic #15 (0.038): 0.062*"chip" + 0.040*"elect" + 0.019*"protect" + 0.017*"right" + 0.016*"ameripac" + 0.016*"immediately" + 0.014*"go" + 0.013*"let" + 0.012*"stay" + 0.012*"pass"
2023-06-22 15:51:26,198 : INFO : topic #13 (0.038): 0.023*"stop" + 0.020*"text" + 0.015*"reach" + 0.014*"visit" + 0.013*"stand" + 0.012*"make" + 0.011*"fight" + 0.010*"go" + 0.010*"know" + 0.009*"authorize"
2023-06-22 15:51:26,198 : INFO : topic #4 (0.044): 0.042*"flip" + 0.024*"right" + 0.024*"win" + 0.020*"match" + 0.018*"defeat" + 0.017*

2023-06-22 15:51:36,860 : INFO : topic #12 (0.038): 0.011*"say" + 0.011*"continue" + 0.011*"work" + 0.010*"people" + 0.009*"pandemic" + 0.008*"provide" + 0.008*"sign" + 0.008*"month" + 0.007*"include" + 0.007*"protect"
2023-06-22 15:51:36,860 : INFO : topic #4 (0.049): 0.046*"flip" + 0.025*"right" + 0.025*"win" + 0.021*"rush" + 0.018*"defeat" + 0.017*"chip" + 0.016*"match" + 0.015*"fight" + 0.015*"raise" + 0.014*"save"
2023-06-22 15:51:36,861 : INFO : topic diff=0.411179, rho=0.288675
2023-06-22 15:51:36,863 : INFO : PROGRESS: pass 11, at document #9978/9978
2023-06-22 15:51:38,875 : INFO : optimized alpha [0.032360617, 0.03231739, 0.027487578, 0.030751692, 0.04993446, 0.037010044, 0.029945256, 0.02937665, 0.03064262, 0.02954888, 0.033795834, 0.030737596, 0.037752897, 0.03526087, 0.030370131, 0.034396816, 0.03202865, 0.03691532, 0.035210703, 0.031737242]
2023-06-22 15:51:38,878 : INFO : topic #2 (0.027): 0.045*"elect" + 0.033*"victory" + 0.029*"change" + 0.022*"end" + 0.019*"win" + 0.0

2023-06-22 15:51:49,014 : INFO : topic #2 (0.026): 0.047*"elect" + 0.034*"victory" + 0.030*"change" + 0.022*"end" + 0.020*"win" + 0.019*"take" + 0.018*"work" + 0.018*"raise" + 0.018*"afford" + 0.017*"dedicate"
2023-06-22 15:51:49,014 : INFO : topic #7 (0.028): 0.095*"contribute" + 0.020*"deductible" + 0.020*"reach" + 0.018*"authorize" + 0.017*"know" + 0.016*"follow" + 0.016*"date" + 0.016*"right" + 0.015*"information" + 0.014*"contribution"
2023-06-22 15:51:49,015 : INFO : topic #18 (0.038): 0.039*"fight" + 0.027*"work" + 0.022*"run" + 0.018*"long" + 0.017*"chip" + 0.016*"wish" + 0.015*"know" + 0.015*"believe" + 0.012*"bring" + 0.012*"right"
2023-06-22 15:51:49,015 : INFO : topic #12 (0.039): 0.012*"say" + 0.011*"continue" + 0.011*"work" + 0.010*"people" + 0.009*"pandemic" + 0.009*"sign" + 0.008*"provide" + 0.008*"month" + 0.008*"add" + 0.007*"include"
2023-06-22 15:51:49,016 : INFO : topic #4 (0.055): 0.047*"flip" + 0.026*"right" + 0.026*"win" + 0.023*"rush" + 0.018*"defeat" + 0.017*"

In [13]:
lda_end = time()
print("LDA run time: " + str(lda_end - lda_start))

LDA run time: 50.08160090446472


In [14]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2023-06-22 15:51:54,889 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2023-06-22 15:51:54,898 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2023-06-22 15:51:54,919 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2023-06-22 15:51:54,946 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2023-06-22 15:51:54,975 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2023-06-22 15:51:54,985 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2023-06-22 15:51:54,995 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2023-06-22 15:51:55,006 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2023-06-22 15:51:55,017 : INFO : CorpusAccumulator accumulated stats from 9000 documents


Average topic coherence: -1.5345.
[([(0.04392661, 'train'),
   (0.03630747, 'fund'),
   (0.03459211, 'win'),
   (0.033670004, 'office'),
   (0.03271387, 'run'),
   (0.029354438, 'imagine'),
   (0.020280315, 'know'),
   (0.01734593, 'fight'),
   (0.017112425, 'listen'),
   (0.017024426, 'elect'),
   (0.016856035, 'url'),
   (0.01595696, 'redacted'),
   (0.015737917, 'try'),
   (0.015705047, 'facebook'),
   (0.015478491, 'follow'),
   (0.015438026, 'change'),
   (0.015366049, 'work'),
   (0.014988128, 'matter'),
   (0.01494832, 'promise'),
   (0.0149298115, 'authorize')],
  -1.0447195484669438),
 ([(0.06111176, 'pass'),
   (0.041266683, 'fund'),
   (0.032444593, 'win'),
   (0.028508503, 'end'),
   (0.02352675, 'know'),
   (0.022317277, 'run'),
   (0.021243405, 'elect'),
   (0.021089789, 'ask'),
   (0.02089256, 'build'),
   (0.020842994, 'fuel'),
   (0.020409662, 'push'),
   (0.0200249, 'spend'),
   (0.01959804, 'stop'),
   (0.019337589, 'give'),
   (0.019279612, 'fix'),
   (0.019254252, 

In [15]:
# dist = []
# for topic in lda_output:
#     topic_sum = 0
#     for word in topic:
#         topic_sum += word[0]
#     dist.append(topic_sum)

# normalizer = sum(dist)
    
# normalized = []
# for topic_dist in dist:
#     normalized_topic = topic_dist / normalizer
#     normalized.append(normalized_topic)
    
# print(dist)


In [16]:
# import matplotlib.pyplot as plt

# # Create a list of indices for the x-axis
# x = range(num_topics)
# x_labels = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10']

# # Plotting the bar graph
# plt.bar(x, normalized, tick_label=x_labels)

# # Add labels and title
# plt.xlabel('Topics')
# plt.ylabel('Probability')
# plt.title('Topic Probability Distribution')

# # Display the plot
# plt.show()