In [1]:
import pandas as pd
import spacy
import pickle
from time import time
from spacy.language import Language

# Load the spaCy pipeline named "en_core_web_sm"; every email is run through it below.
nlp = spacy.load("en_core_web_sm")
# Append spaCy's built-in "merge_noun_chunks" and "merge_entities" components.
# NOTE(review): per the spaCy docs these collapse each noun chunk / named entity
# into a single token, so such a token's text can contain spaces -- confirm this
# interacts as intended with the `str.isalpha()` filter used during tokenisation.
nlp.add_pipe("merge_noun_chunks")
nlp.add_pipe("merge_entities")
# Extend the pipeline's default stop-word set (in place) with corpus-specific words.
nlp.Defaults.stop_words |= {"alex", "unsubscribe"}

# @Language.component("remove_stopwords")
# def remove_stopwords(doc):
#     # This will remove stopwords and punctuation.
#     # Use token.text to return strings, which we'll need for Gensim.
#     doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
#     return doc

#nlp.add_pipe("remove_stopwords")

In [4]:
# Location of the sampled corpus; the CSV's first column is used as the index.
CORPUS_CSV = 'corpus_sample/corpus_10k.csv'

# Start the wall-clock timer for the whole pre-processing stage, then load the data.
preprocess_start = time()
df = pd.read_csv(CORPUS_CSV, index_col=0)

In [5]:
body_text = df.loc[:, 'body_text']

# When False, tokens that belong to a named entity are dropped.
include_named_entities = True


def _email_tokens(doc, keep_entities):
    """Return the lower-cased lemmas of ``doc`` that survive filtering.

    Parameters
    ----------
    doc : iterable of spaCy tokens (a processed email).
    keep_entities : bool
        True  -> keep non-stop-word, purely alphabetic tokens longer than
                 two characters (named entities included).
        False -> keep non-stop-word, purely alphabetic tokens that are not
                 part of a named entity.

    Returns
    -------
    list of str
        One lower-cased lemma per surviving token, in document order.
    """
    # NOTE(review): the two branches are not symmetric -- only the first one
    # applies the `len(token.text) > 2` filter. Kept as in the original;
    # confirm whether the second branch should apply it too.
    # NOTE(review): `str.isalpha()` is False for any text containing a space,
    # so multi-word tokens produced by the merge_* pipes are discarded here.
    if keep_entities:
        return [
            token.lemma_.lower()
            for token in doc
            if not token.is_stop and token.text.isalpha() and len(token.text) > 2
        ]
    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and token.text.isalpha() and not token.ent_type_
    ]


# Single pass over the corpus. `nlp.pipe` streams the texts in batches, so the
# processed documents are consumed one at a time instead of being materialised
# in a list first.
#
# The earlier second loop (`for email in body_text: ... nlp(text)`) was removed:
# it referenced the undefined name `text` (NameError on a fresh kernel), threw
# away the result of `email.replace('\n', ' ')`, and would have appended every
# email to `emails` a second time.
emails = [_email_tokens(doc, include_named_entities) for doc in nlp.pipe(body_text)]
        

# for text in body_text:
#     email = nlp(text)
#     if include_named_entities == True:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha()])
#     else:
#         emails.append([token.text.lower() for token in email if not token.is_stop and token.text.isalpha() and not token.ent_type_])


In [6]:
# for email in list(nlp.pipe(body_text[0:3])):
#     for token in email:
#         print(token.text, token.lemma_,token.pos_)

In [7]:
#print(emails[0:2])

In [8]:
import gensim
import gensim.corpora as corpora
# Build the token <-> integer-id mapping from the tokenised emails.
dictionary = corpora.Dictionary(emails)

# Prune the vocabulary: drop tokens that occur in fewer than 20 documents
# or in more than 30% of the documents.
dictionary.filter_extremes(no_above=0.3, no_below=20)

# Bag-of-words representation of every email, in the same order as `emails`.
corpus = list(map(dictionary.doc2bow, emails))

In [9]:
# Persist the pruned dictionary and the bag-of-words corpus as pickle files
# (named "dictionary" and "corpus") in the current working directory.
for path, obj in (("dictionary", dictionary), ("corpus", corpus)):
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)

In [10]:
# Reload the artefacts from the pickle files "dictionary" and "corpus".
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("dictionary", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)

with open("corpus", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

In [11]:
# Stop the pre-processing timer and report the elapsed wall-clock seconds.
preprocess_end = time()
preprocess_seconds = preprocess_end - preprocess_start
print(f"Time to pre-process corpus: {preprocess_seconds}")

Time to pre-process corpus: 652.8817627429962


In [12]:
# Print the dictionary's summary line (unique-token count plus a few sample tokens).
print(dictionary)

Dictionary<1984 unique tokens: ['ad', 'ally', 'anymore', 'ask', 'check']...>


In [13]:
# Sanity check: vocabulary size after pruning and number of bag-of-words documents.
n_tokens = len(dictionary)
n_documents = len(corpus)
print(f'Number of unique tokens: {n_tokens:d}')
print(f'Number of documents: {n_documents:d}')

Number of unique tokens: 1984
Number of documents: 9978


In [14]:
# Turn on INFO-level logging so gensim reports training progress
# (timestamp : level : message).
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [15]:
# Start the wall-clock timer for LDA training; it is read back after training
# to report the LDA run time.
lda_start = time()

In [16]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 100
# Larger than the corpus, so every pass processes all documents as one chunk
# (the training log reports one model update per 9978 documents).
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed: LDA initialisation is random, so without it every re-run of the
# notebook yields different topics and different topic ids.
lda_seed = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# alpha / eta = 'auto' asks gensim to learn the priors from the corpus
# (the log shows alpha starting at 0.01 per topic and being re-optimised each pass).
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=lda_seed,
)

2023-06-27 20:01:14,379 : INFO : using autotuned alpha, starting with [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
2023-06-27 20:01:14,381 : INFO : using serial LDA version on this node
2023-06-27 20:01:14,419 : INFO : running online (multi-pass) LDA training, 100 topics, 20 passes over the supplied corpus of 9978 documents, updating model once every 9978 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence 

2023-06-27 20:01:51,779 : INFO : topic #95 (0.010): 0.053*"pass" + 0.041*"fund" + 0.029*"end" + 0.022*"ask" + 0.021*"spend" + 0.020*"fuel" + 0.020*"rely" + 0.020*"push" + 0.020*"build" + 0.019*"fix"
2023-06-27 20:01:51,779 : INFO : topic diff=0.636039, rho=0.500000
2023-06-27 20:01:51,791 : INFO : PROGRESS: pass 3, at document #9978/9978
2023-06-27 20:01:58,367 : INFO : optimized alpha [0.009369683, 0.00934971, 0.009431244, 0.009324272, 0.009357422, 0.009411565, 0.009780152, 0.009887992, 0.009466288, 0.009300875, 0.009376222, 0.00929123, 0.00968023, 0.009366156, 0.009604839, 0.009367492, 0.009312105, 0.009194914, 0.00927202, 0.0096979095, 0.009222714, 0.009401882, 0.009329106, 0.0093613565, 0.009201171, 0.009390493, 0.009365534, 0.009331953, 0.009419032, 0.009442801, 0.009276021, 0.009379017, 0.009243128, 0.009358588, 0.009509235, 0.00955339, 0.009389683, 0.009426637, 0.009183112, 0.009485309, 0.009544936, 0.009424813, 0.009318125, 0.009363401, 0.009562234, 0.00943372, 0.009282984, 0.0

2023-06-27 20:02:17,375 : INFO : topic #32 (0.009): 0.035*"cory" + 0.013*"ask" + 0.013*"republicans" + 0.013*"reach" + 0.010*"get" + 0.010*"able" + 0.010*"november" + 0.010*"double" + 0.010*"friend" + 0.009*"come"
2023-06-27 20:02:17,376 : INFO : topic #24 (0.009): 0.043*"sign" + 0.028*"signature" + 0.023*"collect" + 0.019*"pass" + 0.013*"allow" + 0.011*"restore" + 0.011*"take" + 0.011*"gutte" + 0.010*"ban" + 0.010*"demand"
2023-06-27 20:02:17,376 : INFO : topic #7 (0.010): 0.085*"texas" + 0.066*"express" + 0.032*"time" + 0.027*"endorsement" + 0.026*"imply" + 0.025*"uniform" + 0.025*"use" + 0.024*"photograph" + 0.021*"november" + 0.020*"wish"
2023-06-27 20:02:17,377 : INFO : topic #67 (0.010): 0.052*"join" + 0.034*"wish" + 0.032*"long" + 0.028*"error" + 0.024*"rsvp" + 0.015*"hope" + 0.013*"represent" + 0.012*"invite" + 0.012*"meet" + 0.010*"change"
2023-06-27 20:02:17,377 : INFO : topic #6 (0.010): 0.069*"flip" + 0.049*"tax" + 0.049*"deductible" + 0.048*"gift" + 0.046*"contribution" + 

2023-06-27 20:02:38,548 : INFO : topic #67 (0.011): 0.066*"join" + 0.035*"wish" + 0.032*"long" + 0.029*"error" + 0.028*"rsvp" + 0.018*"hope" + 0.015*"invite" + 0.013*"represent" + 0.012*"meet" + 0.012*"host"
2023-06-27 20:02:38,548 : INFO : topic #6 (0.011): 0.071*"flip" + 0.056*"tax" + 0.056*"deductible" + 0.055*"gift" + 0.051*"contribution" + 0.041*"washington" + 0.037*"solely" + 0.036*"rely" + 0.036*"dedicated" + 0.021*"defeat"
2023-06-27 20:02:38,549 : INFO : topic diff=1.658924, rho=0.301511
2023-06-27 20:02:38,562 : INFO : PROGRESS: pass 10, at document #9978/9978
2023-06-27 20:02:45,888 : INFO : optimized alpha [0.009605986, 0.00920723, 0.009257339, 0.009441039, 0.009260903, 0.009401391, 0.010787357, 0.010641732, 0.009454413, 0.009077622, 0.009325497, 0.00884391, 0.010536422, 0.009083143, 0.010057774, 0.009325904, 0.00897661, 0.00893054, 0.008907042, 0.009632285, 0.008842006, 0.009103305, 0.009004522, 0.009262107, 0.00881974, 0.009163841, 0.009007311, 0.00913151, 0.009634609, 0.

2023-06-27 20:03:01,556 : INFO : PROGRESS: pass 13, at document #9978/9978
2023-06-27 20:03:09,834 : INFO : optimized alpha [0.009782587, 0.009214264, 0.009246216, 0.009578963, 0.009269774, 0.009440312, 0.011164464, 0.01091596, 0.009496618, 0.00903551, 0.009351688, 0.008737311, 0.010841469, 0.009021192, 0.010265052, 0.009442344, 0.008973663, 0.00893646, 0.008838962, 0.009662827, 0.00880396, 0.009083257, 0.008927173, 0.0093346, 0.008809968, 0.009201739, 0.00896417, 0.009120811, 0.009776652, 0.009186488, 0.008892747, 0.009257858, 0.008511881, 0.010176275, 0.009883623, 0.010041269, 0.009491336, 0.009213202, 0.008789431, 0.009167994, 0.010026849, 0.0089247795, 0.00920796, 0.009541556, 0.009626246, 0.009636661, 0.00897559, 0.009774779, 0.009102158, 0.00885756, 0.009536527, 0.009605761, 0.0091878865, 0.010078759, 0.00925083, 0.00936189, 0.009027574, 0.008668961, 0.009248509, 0.0088613285, 0.008987202, 0.010146554, 0.0089433305, 0.0096002435, 0.009799511, 0.009531552, 0.010068522, 0.010993247

2023-06-27 20:03:37,750 : INFO : topic #32 (0.008): 0.041*"cory" + 0.037*"double" + 0.022*"past" + 0.019*"nearly" + 0.017*"able" + 0.016*"get" + 0.015*"ask" + 0.014*"seal" + 0.013*"endure" + 0.013*"risk"
2023-06-27 20:03:37,751 : INFO : topic #87 (0.009): 0.086*"candace" + 0.032*"educator" + 0.032*"consider" + 0.027*"serve" + 0.026*"make" + 0.021*"fair" + 0.020*"hate" + 0.020*"reelect" + 0.019*"flip" + 0.018*"view"
2023-06-27 20:03:37,753 : INFO : topic #70 (0.011): 0.056*"long" + 0.053*"wish" + 0.046*"reach" + 0.044*"error" + 0.025*"voter" + 0.025*"raise" + 0.023*"sure" + 0.023*"close" + 0.020*"team" + 0.018*"count"
2023-06-27 20:03:37,754 : INFO : topic #6 (0.012): 0.074*"flip" + 0.064*"tax" + 0.063*"deductible" + 0.062*"gift" + 0.056*"contribution" + 0.043*"washington" + 0.039*"rely" + 0.038*"solely" + 0.038*"dedicated" + 0.021*"defeat"
2023-06-27 20:03:37,755 : INFO : topic #96 (0.012): 0.066*"end" + 0.045*"hit" + 0.039*"deadline" + 0.036*"month" + 0.035*"goal" + 0.031*"short" + 0.

2023-06-27 20:04:06,971 : INFO : topic #70 (0.012): 0.059*"long" + 0.058*"wish" + 0.047*"error" + 0.046*"reach" + 0.026*"voter" + 0.025*"raise" + 0.023*"close" + 0.023*"sure" + 0.020*"team" + 0.018*"count"
2023-06-27 20:04:06,972 : INFO : topic #96 (0.012): 0.072*"end" + 0.047*"hit" + 0.042*"deadline" + 0.040*"month" + 0.037*"goal" + 0.031*"short" + 0.030*"reach" + 0.026*"quarter" + 0.025*"fundraising" + 0.022*"close"
2023-06-27 20:04:06,973 : INFO : topic diff=1.117970, rho=0.218218
2023-06-27 20:04:06,987 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=1984, num_topics=100, decay=0.5, chunksize=10000> in 172.57s', 'datetime': '2023-06-27T20:04:06.987120', 'gensim': '4.3.0', 'python': '3.10.9 (main, Mar  1 2023, 18:23:06) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-46-generic-x86_64-with-glibc2.31', 'event': 'created'}


In [17]:
# Stop the LDA timer and report the elapsed wall-clock seconds.
lda_end = time()
lda_seconds = lda_end - lda_start
print(f"LDA run time: {lda_seconds}")

LDA run time: 172.62872314453125


In [18]:
# One entry per topic: (list of (weight, word) pairs, coherence score).
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
coherence_total = sum(topic[1] for topic in top_topics)
avg_topic_coherence = coherence_total / num_topics
print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

# Dump every topic with its top words and coherence.
from pprint import pprint
pprint(top_topics)

2023-06-27 20:04:07,058 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2023-06-27 20:04:07,079 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2023-06-27 20:04:07,101 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2023-06-27 20:04:07,121 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2023-06-27 20:04:07,143 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2023-06-27 20:04:07,164 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2023-06-27 20:04:07,188 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2023-06-27 20:04:07,212 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2023-06-27 20:04:07,238 : INFO : CorpusAccumulator accumulated stats from 9000 documents


Average topic coherence: -2.6783.
[([(0.07137371, 'train'),
   (0.056720182, 'fund'),
   (0.052987993, 'office'),
   (0.052346267, 'imagine'),
   (0.026470333, 'honest'),
   (0.0264329, 'incredibly'),
   (0.026033504, 'promise'),
   (0.025900483, 'listen'),
   (0.025606163, 'uphold'),
   (0.025306053, 'expensive'),
   (0.02524842, 'url'),
   (0.024923734, 'lot'),
   (0.024826813, 'matter'),
   (0.024604054, 'small'),
   (0.023759613, 'try'),
   (0.023753453, 'compassion'),
   (0.023751263, 'spend'),
   (0.023634853, 'explore'),
   (0.023558384, 'redacted'),
   (0.023411142, 'facebook')],
  -0.7329893336652369),
 ([(0.07377139, 'pass'),
   (0.053457424, 'fund'),
   (0.03629846, 'end'),
   (0.029529387, 'fuel'),
   (0.028596908, 'fix'),
   (0.028323092, 'push'),
   (0.02799751, 'ask'),
   (0.027856652, 'spend'),
   (0.027573084, 'build'),
   (0.027410472, 'wonder'),
   (0.027145714, 'billionaire'),
   (0.027084764, 'overturn'),
   (0.026775006, 'ultimately'),
   (0.026510073, 'give'),
  

In [19]:
# Spot check: topic mixture of the document at corpus index 1, displayed as
# (topic_id, probability) pairs.
# NOTE(review): with minimum_probability=None gensim presumably falls back to
# the model's own threshold, which would explain why only a few topics are listed -- confirm.
model.get_document_topics(corpus[1], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)


[(22, 0.069505155),
 (53, 0.02410836),
 (61, 0.022403006),
 (64, 0.7564037),
 (73, 0.10270035)]

In [24]:
# Topic ids of interest -- add topic IDs manually.
# NOTE: this rebinds `top_topics`, which the coherence cell earlier in the
# notebook uses for the (topic, coherence) list returned by model.top_topics.
top_topics = [1, 2, 3, 4, 5]

# topic_id -> list of (email_id, probability) for every email in which that
# topic appears in the document's topic distribution.
top_documents = {topic_id: [] for topic_id in top_topics}

# `email_id` is the positional index into `corpus`
# (alternative identifier considered: df.at[i, "uid_email"]).
for email_id, bow in enumerate(corpus):
    doc_topics = model.get_document_topics(bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    for topic_id, probability in doc_topics:
        # Membership is tested against the dict keys, which are exactly the ids
        # in `top_topics`; the loop variable no longer shadows the builtin `id`.
        if topic_id in top_documents:
            top_documents[topic_id].append((email_id, probability))


In [29]:
# For each selected topic, rank its emails by topic probability (highest first)
# and print the five strongest (email_id, probability) pairs.
# The loop variable is named `topic_id` so that the builtin `id` is not shadowed.
for topic_id in top_topics:
    ranked = top_documents[topic_id]
    ranked.sort(key=lambda pair: pair[1], reverse=True)  # in-place; pair = (email_id, probability)
    print(ranked[0:5])

[(3739, 0.98518634), (4905, 0.6457401), (4901, 0.6423274), (4908, 0.6237877), (1636, 0.62198913)]
[(7397, 0.98312247), (8679, 0.98312247), (8677, 0.9818477), (8276, 0.9803649), (5491, 0.97995543)]
[(7737, 0.7640803), (7757, 0.7386464), (7772, 0.7314077), (4634, 0.7173265), (1967, 0.71372485)]
[(1514, 0.9871748), (4288, 0.9795295), (1518, 0.9419523), (3263, 0.9212611), (3971, 0.89670515)]
[(4050, 0.9799619), (3455, 0.9582447), (3457, 0.9555539), (3134, 0.9421051), (3142, 0.9038673)]


In [20]:
# dist = []
# for topic in lda_output:
#     topic_sum = 0
#     for word in topic:
#         topic_sum += word[0]
#     dist.append(topic_sum)

# normalizer = sum(dist)
    
# normalized = []
# for topic_dist in dist:
#     normalized_topic = topic_dist / normalizer
#     normalized.append(normalized_topic)
    
# print(dist)


In [21]:
# import matplotlib.pyplot as plt

# # Create a list of indices for the x-axis
# x = range(num_topics)
# x_labels = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10']

# # Plotting the bar graph
# plt.bar(x, normalized, tick_label=x_labels)

# # Add labels and title
# plt.xlabel('Topics')
# plt.ylabel('Probability')
# plt.title('Topic Probability Distribution')

# # Display the plot
# plt.show()