In [3]:
import pandas as pd
import spacy
import pickle
from time import time
from spacy.language import Language

# Load the small English model, then append spaCy's merge_noun_chunks and
# merge_entities components to the pipeline, in that order.
nlp = spacy.load("en_core_web_sm")
for merge_component in ("merge_noun_chunks", "merge_entities"):
    nlp.add_pipe(merge_component)

# Corpus-specific words added to the language's default stop-word set.
nlp.Defaults.stop_words.update({"alex", "unsubscribe"})

# @Language.component("remove_stopwords")
# def remove_stopwords(doc):
#     # This will remove stopwords and punctuation.
#     # Use token.text to return strings, which we'll need for Gensim.
#     doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
#     return doc

#nlp.add_pipe("remove_stopwords")

In [4]:
# Wall-clock start of the pre-processing stage; a later cell subtracts this
# from its own timestamp to report the elapsed time.
preprocess_start = time()

# Read the sample of the email corpus into a DataFrame.
sample_csv_path = 'corpus_sample/sample.csv'
df = pd.read_csv(sample_csv_path)

In [7]:
import tqdm

body_text = df.loc[:, 'body_text']

# Rows with a missing body are read by pandas as NaN (a float), which
# nlp.pipe rejects. Replace them with an empty string so every row still
# yields exactly one (possibly empty) token list and `emails` stays aligned
# with `df`.
texts = body_text.fillna("").astype(str)

# When False, tokens that spaCy tagged with an entity type are dropped.
include_named_entities = False

emails = []
print("starting spacy preprocessing pipeline")
# Consume nlp.pipe lazily instead of wrapping it in list(): each Doc can be
# released as soon as its tokens are extracted, and the tqdm bar gives
# feedback during this long-running step.
for email in tqdm.tqdm(nlp.pipe(texts), total=len(texts), desc="spaCy preprocessing"):
    emails.append([
        token.text.lower()
        for token in email
        # Keep purely alphabetic, non-stop-word tokens; entity tokens are kept
        # only when include_named_entities is switched on.
        if not token.is_stop
        and token.text.isalpha()
        and (include_named_entities or not token.ent_type_)
    ])

# for text in body_text:
#     email = nlp(text)
#     if include_named_entities:
#         emails.append([token.lemma_.lower() for token in email if not token.is_stop and token.text.isalpha()])
#     else:
#         emails.append([token.lemma_.lower() for token in email if not token.is_stop and token.text.isalpha() and not token.ent_type_])


starting spacy preprocessing pipeline


KeyboardInterrupt: 

In [4]:
# for email in list(nlp.pipe(body_text[0:3])):
#     for token in email:
#         print(token.text, token.lemma_,token.pos_)

In [5]:
import gensim
import gensim.corpora as corpora
# Build the gensim Dictionary from the tokenised emails.
dictionary = corpora.Dictionary(emails)

# Drop words that occur in fewer than 10 documents or in more than 50% of the documents.
min_doc_count = 10
max_doc_fraction = 0.5
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

# Convert every email to its bag-of-words form using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, emails))

In [6]:
# Persist the dictionary and the bag-of-words corpus, each to its own pickle
# file in the working directory (dictionary first, then corpus).
for pickle_name, payload in (("dictionary", dictionary), ("corpus", corpus)):
    with open(pickle_name, "wb") as sink:
        pickle.dump(payload, sink)

In [17]:
def _unpickle(pickle_name):
    """Return the object stored in the pickle file `pickle_name`."""
    # pickle.load can execute arbitrary code; only use on files you trust.
    with open(pickle_name, "rb") as source:
        return pickle.load(source)


# Restore the dictionary and corpus written by the dump cell.
dictionary = _unpickle("dictionary")
corpus = _unpickle("corpus")

In [18]:
# Stop the pre-processing clock and report the seconds elapsed since
# preprocess_start was recorded.
preprocess_end = time()
preprocess_elapsed = preprocess_end - preprocess_start
print("Time to pre-process corpus: ", preprocess_elapsed, sep="")

Time to pre-process corpus: 267.19581604003906


In [19]:
# Report the vocabulary size and the number of bag-of-words documents.
unique_token_count = len(dictionary)
document_count = len(corpus)
print('Number of unique tokens: %d' % unique_token_count)
print('Number of documents: %d' % document_count)

Number of unique tokens: 781
Number of documents: 1000


In [20]:
# Turn on INFO-level logging so training progress is printed as it happens.
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [21]:
# Wall-clock start for LDA training; the "LDA run time" cell subtracts this
# from its own timestamp.
lda_start = time()

In [22]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 30
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed: LDA initialisation is random, so without it the topics and the
# coherence score reported below change on every run.
random_state = 42

# Make an index to word dictionary.
# id2token is populated lazily; indexing the dictionary once forces it to be
# built before we hand it to the model.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',  # learn the document-topic prior from the data
    eta='auto',    # learn the topic-word prior from the data
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

2023-06-23 11:28:47,613 : INFO : using autotuned alpha, starting with [0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335]
2023-06-23 11:28:47,615 : INFO : using serial LDA version on this node
2023-06-23 11:28:47,616 : INFO : running online (multi-pass) LDA training, 30 topics, 20 passes over the supplied corpus of 1000 documents, updating model once every 1000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2023-06-23 11:28:47,617 : INFO : PROGRESS: pass 0, at document #1000/1000
2023-06-23 11:28:48,157 : INFO : optimized alpha [0.028946212, 0.031154182, 0.028833129, 0.030035164, 0.029902, 0.0

2023-06-23 11:28:48,952 : INFO : topic #1 (0.030): 0.080*"turn" + 0.039*"dc" + 0.039*"yes" + 0.030*"vote" + 0.025*"tell" + 0.021*"like" + 0.017*"know" + 0.015*"said" + 0.013*"support" + 0.013*"help"
2023-06-23 11:28:48,952 : INFO : topic #7 (0.030): 0.037*"support" + 0.037*"like" + 0.031*"stay" + 0.030*"stand" + 0.026*"endorsed" + 0.026*"email" + 0.024*"click" + 0.024*"date" + 0.024*"elected" + 0.024*"hate"
2023-06-23 11:28:48,952 : INFO : topic #26 (0.033): 0.037*"send" + 0.032*"want" + 0.030*"running" + 0.029*"help" + 0.027*"join" + 0.027*"reaching" + 0.026*"protect" + 0.026*"mt" + 0.026*"receive" + 0.026*"safe"
2023-06-23 11:28:48,953 : INFO : topic diff=0.448821, rho=0.408248
2023-06-23 11:28:48,954 : INFO : PROGRESS: pass 5, at document #1000/1000
2023-06-23 11:28:49,116 : INFO : optimized alpha [0.024223315, 0.029801624, 0.022778112, 0.026089318, 0.029679826, 0.026814735, 0.024417344, 0.0298318, 0.024070516, 0.023269106, 0.02340004, 0.02916873, 0.026841033, 0.025708264, 0.0255672

2023-06-23 11:28:49,731 : INFO : topic #4 (0.030): 0.060*"express" + 0.043*"like" + 0.028*"running" + 0.024*"stand" + 0.024*"elected" + 0.023*"support" + 0.023*"stay" + 0.023*"endorsed" + 0.022*"recenter" + 0.022*"date"
2023-06-23 11:28:49,731 : INFO : topic #26 (0.036): 0.040*"send" + 0.034*"want" + 0.032*"help" + 0.031*"running" + 0.030*"join" + 0.029*"mt" + 0.028*"reaching" + 0.028*"protect" + 0.028*"receive" + 0.028*"follow"
2023-06-23 11:28:49,731 : INFO : topic diff=0.441446, rho=0.301511
2023-06-23 11:28:49,732 : INFO : PROGRESS: pass 10, at document #1000/1000
2023-06-23 11:28:49,877 : INFO : optimized alpha [0.0222231, 0.029611364, 0.0202777, 0.024365963, 0.029945489, 0.024923392, 0.0223135, 0.029857188, 0.021864813, 0.020651845, 0.020858727, 0.028260313, 0.025830386, 0.024221933, 0.025797568, 0.022655824, 0.022680582, 0.024121074, 0.022679867, 0.022036934, 0.024059922, 0.02395288, 0.023419356, 0.020130131, 0.02066938, 0.025284307, 0.036835846, 0.023655713, 0.023881909, 0.0267

2023-06-23 11:28:50,465 : INFO : topic #4 (0.030): 0.060*"express" + 0.043*"like" + 0.028*"running" + 0.024*"elected" + 0.024*"stand" + 0.023*"support" + 0.023*"stay" + 0.023*"endorsed" + 0.021*"date" + 0.021*"recenter"
2023-06-23 11:28:50,465 : INFO : topic #26 (0.039): 0.041*"send" + 0.034*"want" + 0.032*"help" + 0.031*"running" + 0.030*"join" + 0.029*"mt" + 0.029*"reaching" + 0.028*"protect" + 0.028*"follow" + 0.028*"receive"
2023-06-23 11:28:50,465 : INFO : topic diff=0.280209, rho=0.250000
2023-06-23 11:28:50,467 : INFO : PROGRESS: pass 15, at document #1000/1000
2023-06-23 11:28:50,607 : INFO : optimized alpha [0.020934973, 0.02939769, 0.018771151, 0.023247754, 0.030293688, 0.023860775, 0.02101294, 0.029945826, 0.020596914, 0.019020358, 0.019324994, 0.02766459, 0.025294542, 0.023252062, 0.026382908, 0.02131059, 0.021672243, 0.022929423, 0.021349916, 0.020846874, 0.022851717, 0.022771027, 0.02255731, 0.018450411, 0.019563476, 0.024561645, 0.039076507, 0.022686884, 0.02294057, 0.02

2023-06-23 11:28:51,189 : INFO : topic #4 (0.030): 0.059*"express" + 0.043*"like" + 0.028*"running" + 0.024*"elected" + 0.024*"stand" + 0.023*"support" + 0.023*"stay" + 0.022*"endorsed" + 0.021*"date" + 0.021*"recenter"
2023-06-23 11:28:51,189 : INFO : topic #26 (0.041): 0.041*"send" + 0.034*"want" + 0.032*"help" + 0.031*"running" + 0.030*"join" + 0.029*"reaching" + 0.029*"mt" + 0.028*"protect" + 0.028*"follow" + 0.028*"receive"
2023-06-23 11:28:51,189 : INFO : topic diff=0.171769, rho=0.218218
2023-06-23 11:28:51,190 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=781, num_topics=30, decay=0.5, chunksize=10000> in 3.57s', 'datetime': '2023-06-23T11:28:51.190802', 'gensim': '4.3.1', 'python': '3.11.1 (v3.11.1:a7a450f84a, Dec  6 2022, 15:24:06) [Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'created'}


In [23]:
# Stop the LDA clock and report the seconds elapsed since lda_start.
lda_end = time()
lda_elapsed = lda_end - lda_start
print("LDA run time: ", lda_elapsed, sep="")

LDA run time: 3.584449052810669


In [24]:
# Each entry pairs a topic's weighted word list with that topic's coherence.
top_topics = model.top_topics(corpus)

# Average topic coherence: add up the coherence of every topic, then divide
# by the configured number of topics.
coherence_total = sum(coherence for _words, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint

pprint(top_topics)

2023-06-23 11:28:51,207 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -2.5144.
[([(0.04114365, 'send'),
   (0.033993747, 'want'),
   (0.032094512, 'help'),
   (0.03065659, 'running'),
   (0.030131951, 'join'),
   (0.029161625, 'reaching'),
   (0.028918454, 'mt'),
   (0.028366111, 'protect'),
   (0.028147332, 'follow'),
   (0.027976215, 'receive'),
   (0.027506795, 'safe'),
   (0.027139328, 'anymore'),
   (0.026932793, 'fix'),
   (0.026689371, 'works'),
   (0.02668776, 'sent'),
   (0.026583383, 'isolated'),
   (0.026409546, 'feeling'),
   (0.024876136, 'mail'),
   (0.021185515, 'monthly'),
   (0.01859789, 'immediately')],
  -0.49744246556551003),
 ([(0.059456293, 'express'),
   (0.04289329, 'like'),
   (0.028380226, 'running'),
   (0.024362314, 'elected'),
   (0.024122542, 'stand'),
   (0.023382204, 'support'),
   (0.022971401, 'stay'),
   (0.022405878, 'endorsed'),
   (0.021345077, 'date'),
   (0.02121285, 'recenter'),
   (0.020868726, 'email'),
   (0.020803528, 'hate'),
   (0.020241048, 'supporting'),
   (0.020045849, 'click'),


In [25]:
# dist = []
# for topic in lda_output:
#     topic_sum = 0
#     for word in topic:
#         topic_sum += word[0]
#     dist.append(topic_sum)

# normalizer = sum(dist)
    
# normalized = []
# for topic_dist in dist:
#     normalized_topic = topic_dist / normalizer
#     normalized.append(normalized_topic)
    
# print(dist)


In [26]:
# import matplotlib.pyplot as plt

# # Create a list of indices for the x-axis
# x = range(num_topics)
# x_labels = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10']

# # Plotting the bar graph
# plt.bar(x, normalized, tick_label=x_labels)

# # Add labels and title
# plt.xlabel('Topics')
# plt.ylabel('Probability')
# plt.title('Topic Probability Distribution')

# # Display the plot
# plt.show()