In [1]:
import pandas as pd
import spacy
import pickle
from time import time
from spacy.language import Language

# Build the spaCy pipeline: the small English model followed by the built-in
# components that merge noun chunks and named entities into single tokens.
nlp = spacy.load("en_core_web_sm")
for merger_name in ("merge_noun_chunks", "merge_entities"):
    nlp.add_pipe(merger_name)

# Extra words that should be treated as stop words for this corpus.
nlp.Defaults.stop_words.update({"alex", "unsubscribe"})

# @Language.component("remove_stopwords")
# def remove_stopwords(doc):
#     # This will remove stopwords and punctuation.
#     # Use token.text to return strings, which we'll need for Gensim.
#     doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
#     return doc

#nlp.add_pipe("remove_stopwords")

In [2]:
# Start the wall-clock timer for the pre-processing stage, then read the corpus sample.
preprocess_start = time()
corpus_csv = 'corpus_sample/corpus_10k.csv'
df = pd.read_csv(corpus_csv)

In [3]:
# Turn every email body into a list of lower-cased lemmas for topic modelling.
body_text = df.loc[:, 'body_text']

# When False, tokens that spaCy tagged with an entity type are dropped.
include_named_entities = True
# Tokens shorter than this many characters are discarded (in both modes).
min_token_length = 3


def _keep_token(token):
    """Return True when `token` should be kept in the bag of words.

    A token is kept when it is not a stop word, is purely alphabetic, is at
    least `min_token_length` characters long and, if named entities are
    excluded, carries no entity type.
    """
    # NOTE(review): the pipeline merges noun chunks / entities into single tokens;
    # merged multi-word tokens contain spaces, so str.isalpha() rejects them.
    # Confirm that dropping them is intended.
    if token.is_stop or not token.text.isalpha():
        return False
    if len(token.text) < min_token_length:
        return False
    return include_named_entities or not token.ent_type_


# Iterate nlp.pipe directly instead of wrapping it in list(), so parsed Docs can be
# released as soon as their tokens have been extracted.
# Missing bodies (NaN) are replaced with "" so nlp.pipe only ever receives strings.
emails = [
    [token.lemma_.lower() for token in doc if _keep_token(token)]
    for doc in nlp.pipe(body_text.fillna(""))
]

# for text in body_text:
#     email = nlp(text)
#     if include_named_entities == True:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha()])
#     else:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha() and not token.ent_type_])


In [4]:
# for email in list(nlp.pipe(body_text[0:3])):
#     for token in email:
#         print(token.text, token.lemma_,token.pos_)

In [5]:
#print(emails[0:2])

In [6]:
import gensim
import gensim.corpora as corpora
# Build the token <-> integer id mapping from the tokenised emails.
dictionary = corpora.Dictionary(emails)

# Drop tokens that occur in fewer than 20 documents or in more than 30% of the documents.
dictionary.filter_extremes(no_above=0.3, no_below=20)

# Convert every tokenised email into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, emails))

In [7]:
# Persist the dictionary and the bag-of-words corpus so they can be reloaded
# without repeating the pre-processing.
for artefact_path, artefact in (("dictionary", dictionary), ("corpus", corpus)):
    with open(artefact_path, "wb") as sink:
        pickle.dump(artefact, sink)

In [8]:
# Reload the persisted dictionary and corpus.
# pickle.load can execute arbitrary code: only load files this notebook wrote itself.
with open("dictionary", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)

with open("corpus", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

In [9]:
# Stop the pre-processing timer and report the elapsed wall-clock seconds.
preprocess_end = time()
preprocess_elapsed = preprocess_end - preprocess_start
print(f"Time to pre-process corpus: {preprocess_elapsed}")

Time to pre-process corpus: 893.4820468425751


In [10]:
# Show the dictionary's string summary as a quick sanity check of the vocabulary.
print(dictionary)

Dictionary<1984 unique tokens: ['ad', 'ally', 'anymore', 'ask', 'check']...>


In [11]:
# Report the vocabulary size and the number of bag-of-words documents.
vocabulary_size = len(dictionary)
document_count = len(corpus)
print(f'Number of unique tokens: {vocabulary_size:d}')
print(f'Number of documents: {document_count:d}')

Number of unique tokens: 1984
Number of documents: 9978


In [12]:
# Route INFO-level log records to the notebook output so training progress is visible.
import logging

log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [13]:
# Record the wall-clock start time; the LDA run time is reported relative to this value.
lda_start = time()

In [14]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 100
chunksize = 10000  # Larger than the 9978-document corpus, so each pass is a single update.
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so topics and coherence are reproducible across runs.

# Make an index to word dictionary.
# Dictionary.id2token is populated lazily; a single lookup forces it to be built
# before it is handed to the model.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',  # auto-tuned during training (see "optimized alpha" in the log)
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

2023-06-26 20:34:56,999 : INFO : using autotuned alpha, starting with [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
2023-06-26 20:34:57,001 : INFO : using serial LDA version on this node
2023-06-26 20:34:57,034 : INFO : running online (multi-pass) LDA training, 100 topics, 20 passes over the supplied corpus of 9978 documents, updating model once every 9978 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence 

2023-06-26 20:35:42,795 : INFO : topic #77 (0.010): 0.034*"pass" + 0.028*"end" + 0.024*"trump" + 0.022*"match" + 0.021*"fund" + 0.015*"build" + 0.015*"lose" + 0.013*"save" + 0.013*"fuel" + 0.013*"rely"
2023-06-26 20:35:42,797 : INFO : topic diff=0.679684, rho=0.500000
2023-06-26 20:35:42,826 : INFO : PROGRESS: pass 3, at document #9978/9978
2023-06-26 20:35:50,307 : INFO : optimized alpha [0.009413505, 0.009359614, 0.009466845, 0.009216612, 0.009439931, 0.009416507, 0.009391321, 0.009360759, 0.009381291, 0.009331255, 0.009532294, 0.009436897, 0.009231319, 0.009705642, 0.00943398, 0.009605621, 0.009259813, 0.009222157, 0.009392965, 0.0096251555, 0.009678023, 0.0093541145, 0.0095505025, 0.009218093, 0.009442162, 0.009618705, 0.009348541, 0.00982462, 0.009255503, 0.009590392, 0.009527661, 0.009454323, 0.009328378, 0.009390037, 0.009672794, 0.009572197, 0.009255001, 0.009176815, 0.0093189245, 0.0091952495, 0.009570838, 0.009399538, 0.009321944, 0.009232946, 0.009366454, 0.00945436, 0.00938

2023-06-26 20:36:12,745 : INFO : topic #23 (0.009): 0.024*"texas" + 0.018*"come" + 0.014*"count" + 0.011*"sign" + 0.011*"make" + 0.010*"join" + 0.009*"turn" + 0.009*"voter" + 0.009*"hope" + 0.009*"sure"
2023-06-26 20:36:12,746 : INFO : topic #28 (0.009): 0.061*"trump" + 0.025*"defeat" + 0.022*"republicans" + 0.019*"lose" + 0.012*"think" + 0.012*"read" + 0.012*"away" + 0.011*"take" + 0.011*"raise" + 0.011*"change"
2023-06-26 20:36:12,748 : INFO : topic #15 (0.010): 0.068*"contribution" + 0.068*"tax" + 0.067*"deductible" + 0.065*"gift" + 0.051*"flip" + 0.050*"washington" + 0.045*"dedicated" + 0.043*"solely" + 0.042*"rely" + 0.012*"joe"
2023-06-26 20:36:12,749 : INFO : topic #20 (0.010): 0.046*"vote" + 0.031*"voter" + 0.028*"early" + 0.026*"volunteer" + 0.025*"sure" + 0.022*"hear" + 0.021*"join" + 0.017*"tuesday" + 0.016*"rsvp" + 0.015*"cast"
2023-06-26 20:36:12,750 : INFO : topic #74 (0.011): 0.036*"wish" + 0.036*"long" + 0.027*"error" + 0.019*"reach" + 0.018*"immediately" + 0.018*"flip"

2023-06-26 20:36:37,667 : INFO : topic #15 (0.011): 0.078*"tax" + 0.077*"contribution" + 0.077*"deductible" + 0.074*"gift" + 0.055*"washington" + 0.055*"flip" + 0.049*"dedicated" + 0.046*"solely" + 0.046*"rely" + 0.012*"joe"
2023-06-26 20:36:37,668 : INFO : topic #74 (0.011): 0.044*"wish" + 0.043*"long" + 0.033*"error" + 0.020*"reach" + 0.018*"immediately" + 0.017*"save" + 0.017*"close" + 0.017*"flip" + 0.016*"pitch" + 0.015*"raise"
2023-06-26 20:36:37,669 : INFO : topic diff=1.663347, rho=0.301511
2023-06-26 20:36:37,691 : INFO : PROGRESS: pass 10, at document #9978/9978
2023-06-26 20:36:46,509 : INFO : optimized alpha [0.009250428, 0.009355421, 0.009466988, 0.008978087, 0.009571806, 0.009789327, 0.009208955, 0.009132694, 0.009468691, 0.008942479, 0.010522128, 0.0095567675, 0.008825456, 0.009491596, 0.009640359, 0.010729755, 0.009244404, 0.008775632, 0.009073152, 0.010310999, 0.010660058, 0.009199681, 0.0096617015, 0.008637202, 0.009219336, 0.009891271, 0.009058011, 0.010310884, 0.008

2023-06-26 20:37:32,093 : INFO : optimized alpha [0.009223123, 0.009433977, 0.009497496, 0.008977681, 0.009711163, 0.010065144, 0.009227429, 0.009139028, 0.009552886, 0.0089249555, 0.011093764, 0.009680262, 0.008779597, 0.009461815, 0.009751517, 0.011229872, 0.009334559, 0.00868715, 0.009087239, 0.010599332, 0.0110625755, 0.009219204, 0.009790883, 0.008546072, 0.009220905, 0.0100002885, 0.009044936, 0.010520336, 0.008945958, 0.009913551, 0.009874287, 0.00897972, 0.009170789, 0.009018136, 0.01000988, 0.009391038, 0.008959135, 0.008912365, 0.010562451, 0.008820785, 0.010264741, 0.009067087, 0.008904877, 0.009088563, 0.009391206, 0.00949315, 0.009265564, 0.009361759, 0.010169083, 0.008830122, 0.009310117, 0.0099992445, 0.010696394, 0.009506825, 0.009341381, 0.009225511, 0.00967918, 0.009658782, 0.009202421, 0.008811578, 0.00960553, 0.008883072, 0.009003225, 0.0094689, 0.008719429, 0.009201127, 0.00925211, 0.009043635, 0.009188224, 0.009686129, 0.00916802, 0.009072575, 0.009629588, 0.00884

2023-06-26 20:38:18,649 : INFO : topic #23 (0.009): 0.027*"state" + 0.024*"come" + 0.021*"swing" + 0.020*"count" + 0.014*"make" + 0.014*"reelection" + 0.014*"afraid" + 0.013*"tough" + 0.013*"secretary" + 0.012*"time"
2023-06-26 20:38:18,650 : INFO : topic #93 (0.009): 0.028*"buy" + 0.027*"order" + 0.027*"extended" + 0.024*"defend" + 0.022*"extend" + 0.020*"far" + 0.020*"republicans" + 0.020*"ground" + 0.020*"step" + 0.020*"inspire"
2023-06-26 20:38:18,651 : INFO : topic #15 (0.012): 0.091*"tax" + 0.090*"deductible" + 0.089*"contribution" + 0.084*"gift" + 0.062*"washington" + 0.059*"flip" + 0.053*"dedicated" + 0.050*"rely" + 0.050*"solely" + 0.015*"team"
2023-06-26 20:38:18,651 : INFO : topic #88 (0.012): 0.240*"save" + 0.164*"immediately" + 0.027*"protect" + 0.025*"rush" + 0.024*"trump" + 0.012*"raise" + 0.011*"close" + 0.011*"collapse" + 0.008*"stay" + 0.008*"come"
2023-06-26 20:38:18,652 : INFO : topic #74 (0.013): 0.058*"wish" + 0.056*"long" + 0.044*"error" + 0.021*"reach" + 0.018*"

2023-06-26 20:39:06,545 : INFO : topic #88 (0.013): 0.252*"save" + 0.174*"immediately" + 0.028*"protect" + 0.025*"rush" + 0.025*"trump" + 0.013*"raise" + 0.011*"close" + 0.011*"collapse" + 0.009*"come" + 0.009*"call"
2023-06-26 20:39:06,545 : INFO : topic #74 (0.013): 0.063*"wish" + 0.060*"long" + 0.048*"error" + 0.021*"reach" + 0.018*"close" + 0.017*"contribute" + 0.017*"immediately" + 0.017*"represent" + 0.016*"pitch" + 0.016*"raise"
2023-06-26 20:39:06,546 : INFO : topic diff=1.109184, rho=0.218218
2023-06-26 20:39:06,559 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=1984, num_topics=100, decay=0.5, chunksize=10000> in 249.52s', 'datetime': '2023-06-26T20:39:06.558945', 'gensim': '4.3.0', 'python': '3.10.9 (main, Mar  1 2023, 18:23:06) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-46-generic-x86_64-with-glibc2.31', 'event': 'created'}


In [15]:
# Stop the LDA timer and report the elapsed wall-clock seconds.
lda_end = time()
lda_elapsed = lda_end - lda_start
print(f"LDA run time: {lda_elapsed}")

LDA run time: 249.58359241485596


In [16]:
# Rank the topics by coherence, computed against the training corpus.
top_topics = model.top_topics(corpus)

# Mean coherence: total of the per-topic coherence scores divided by the number of topics.
coherence_total = sum(coherence for _terms, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

from pprint import pprint
pprint(top_topics)

2023-06-26 20:39:06,621 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2023-06-26 20:39:06,646 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2023-06-26 20:39:06,668 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2023-06-26 20:39:06,690 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2023-06-26 20:39:06,714 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2023-06-26 20:39:06,738 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2023-06-26 20:39:06,767 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2023-06-26 20:39:06,796 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2023-06-26 20:39:06,826 : INFO : CorpusAccumulator accumulated stats from 9000 documents


Average topic coherence: -2.5967.
[([(0.073073395, 'train'),
   (0.0525317, 'office'),
   (0.051552594, 'imagine'),
   (0.049736764, 'fund'),
   (0.026761206, 'expensive'),
   (0.026490519, 'honest'),
   (0.026454229, 'dignity'),
   (0.026426043, 'incredibly'),
   (0.026416609, 'url'),
   (0.026398815, 'promise'),
   (0.025792742, 'small'),
   (0.02543405, 'listen'),
   (0.025415737, 'lot'),
   (0.02503606, 'uphold'),
   (0.024858179, 'redacted'),
   (0.024596632, 'matter'),
   (0.024302218, 'compassion'),
   (0.02394016, 'trump'),
   (0.023809087, 'facebook'),
   (0.02375065, 'explore')],
  -0.7097526677274348),
 ([(0.097429894, 'pass'),
   (0.065562814, 'fund'),
   (0.03690902, 'end'),
   (0.034832, 'fix'),
   (0.034524415, 'fuel'),
   (0.034016315, 'push'),
   (0.033448037, 'ask'),
   (0.032953203, 'overturn'),
   (0.032742053, 'billionaire'),
   (0.032494273, 'history'),
   (0.03234042, 'build'),
   (0.03164415, 'spend'),
   (0.03128952, 'able'),
   (0.03118347, 'wonder'),
   (0.03

In [17]:
# Inspect the topic mixture the model assigns to the second document of the corpus.
second_document_bow = corpus[1]
model.get_document_topics(
    second_document_bow,
    minimum_probability=None,
    minimum_phi_value=None,
    per_word_topics=False,
)


[(5, 0.04848647),
 (10, 0.15014538),
 (19, 0.0322239),
 (20, 0.22791053),
 (41, 0.07377615),
 (51, 0.3008111),
 (89, 0.022034163),
 (97, 0.12065601)]

In [18]:
# dist = []
# for topic in lda_output:
#     topic_sum = 0
#     for word in topic:
#         topic_sum += word[0]
#     dist.append(topic_sum)

# normalizer = sum(dist)
    
# normalized = []
# for topic_dist in dist:
#     normalized_topic = topic_dist / normalizer
#     normalized.append(normalized_topic)
    
# print(dist)


In [19]:
# import matplotlib.pyplot as plt

# # Create a list of indices for the x-axis
# x = range(num_topics)
# x_labels = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10']

# # Plotting the bar graph
# plt.bar(x, normalized, tick_label=x_labels)

# # Add labels and title
# plt.xlabel('Topics')
# plt.ylabel('Probability')
# plt.title('Topic Probability Distribution')

# # Display the plot
# plt.show()