In [1]:
import pandas as pd

In [4]:
# Read the sample of emails into a DataFrame and print a plain-text preview.
csv_path = 'sample.csv'
df = pd.read_csv(csv_path)
print(df)

                            from_name                         from_address  \
0                   Kathleen Williams          info@kathleenformontana.com   
1    Goal Update -- via Team Kathleen          info@kathleenformontana.com   
2                   Kathleen Williams          info@kathleenformontana.com   
3                   Kathleen Williams  teamwilliams@kathleenformontana.com   
4                          Jeff Allen  teamwilliams@kathleenformontana.com   
..                                ...                                  ...   
995           When Democrats Turn Out        info@whendemocratsturnout.com   
996           When Democrats Turn Out        info@whendemocratsturnout.com   
997           When Democrats Turn Out        info@whendemocratsturnout.com   
998           When Democrats Turn Out        info@whendemocratsturnout.com   
999           When Democrats Turn Out        info@whendemocratsturnout.com   

                                               subject  \
0    

In [26]:
# Pull the raw email bodies out of the DataFrame.
# Empty CSV fields are read by pandas as NaN (a float), which has no string
# methods and would break the text processing applied to these documents
# later on, so missing bodies are replaced with empty strings first.
body_text = df.loc[:, 'body_text'].fillna('').astype(str)

# One plain Python string per email, in the same order as the DataFrame rows.
emails = body_text.tolist()

In [27]:
# Preview only the first few email bodies: printing every document floods the
# notebook output. Printing a list shows the repr of each string, so escape
# sequences such as \n stay visible.
print(emails[:3])



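Note: the email bodies contain many `\n` newline characters separating lines, which might interfere with the tokenization process. (The `\w+` tokenizer used below only keeps runs of word characters, so newlines act as separators just like other whitespace.)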

In [28]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# r'\w+' keeps runs of word characters only, so whitespace (including the
# newlines inside the email bodies) and punctuation act as token separators.
tokenizer = RegexpTokenizer(r'\w+')

# Build the token lists from the untouched `body_text` column rather than
# overwriting `emails` element by element: the cell can then be re-run safely
# instead of failing when it tries to call .lower() on a list of tokens.
# Each body is lowercased and split into words, then filtered to:
#   * remove numbers, but not words that contain numbers;
#   * remove words that are only one character.
emails = [
    [
        token
        for token in tokenizer.tokenize(text.lower())
        if len(token) > 1 and not token.isnumeric()
    ]
    for text in body_text
]

In [29]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Replace every token with its lemma, keeping one token list per email.
lemmatized_emails = []
for email in emails:
    lemmatized_emails.append(list(map(lemmatizer.lemmatize, email)))
emails = lemmatized_emails

In [30]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams to docs (only ones that appear 20 times or more).
# Only a single Phrases pass is run here, so pairs of tokens are merged but
# no trigrams are produced.
bigram = Phrases(emails, min_count=20)
for idx in range(len(emails)):
    # The \w+ tokenizer also matches underscores, so an ordinary token can
    # already contain '_'. Checking for '_' alone would re-append such tokens
    # as if they were bigrams, so tokens already in the document are skipped.
    existing_tokens = set(emails[idx])
    detected_bigrams = [
        token
        for token in bigram[emails[idx]]
        if '_' in token and token not in existing_tokens
    ]
    # The bigrams are added alongside the original single words, which stay
    # in the document.
    emails[idx].extend(detected_bigrams)

In [31]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Build the token dictionary from the tokenized documents.
dictionary = Dictionary(emails)

# Drop tokens that occur in fewer than 20 documents, or in more than 50% of the documents.
min_doc_count, max_doc_fraction = 20, 0.5
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

In [32]:
# Convert every tokenized email into its bag-of-words representation.
corpus = []
for email in emails:
    corpus.append(dictionary.doc2bow(email))

In [33]:
# Report the vocabulary size left after filtering and the number of documents.
unique_token_count = len(dictionary)
document_count = len(corpus)
print('Number of unique tokens: %d' % unique_token_count)
print('Number of documents: %d' % document_count)

Number of unique tokens: 1327
Number of documents: 1000


In [34]:
# Turn on INFO-level logging so that training progress is shown while the model fits.
import logging

log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [35]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 1100  # Larger than the 1000-document corpus, so each pass handles it as one chunk.
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so that re-running the cell gives repeatable topics.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

2023-06-15 21:57:34,337 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2023-06-15 21:57:34,339 : INFO : using serial LDA version on this node
2023-06-15 21:57:34,344 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1000 documents, updating model once every 1000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2023-06-15 21:57:34,364 : INFO : PROGRESS: pass 0, at document #1000/1000
2023-06-15 21:57:36,509 : INFO : optimized alpha [0.044926334, 0.0574829, 0.0528426, 0.08544208, 0.076944076, 0.05201748, 0.08454975, 0.055458784, 0.06582604, 0.050008688]
2023-06-15 21:57:36,511 : INFO : topic #0 (0.045): 0.026*"kathleen" + 0.017*"montana" + 0.015*"williams" + 0.012*"kathleen_williams" + 0.008*"time" + 0.007*"who" + 0.007*"send" + 0.007*"bozeman" + 0.007*"bozeman_mt" + 0.007*"mt"
2023-06-15 21:57:36,511 : INFO : topic #9 (0.050): 0.0

2023-06-15 21:57:39,082 : INFO : topic #9 (0.035): 0.044*"georgette" + 0.020*"gómez" + 0.018*"georgette_gómez" + 0.014*"san" + 0.014*"diego" + 0.013*"here" + 0.012*"ca" + 0.011*"georgette_campaign" + 0.009*"community" + 0.009*"san_diego"
2023-06-15 21:57:39,083 : INFO : topic #3 (0.047): 0.034*"georgette" + 0.016*"express" + 0.014*"gómez" + 0.013*"express_donate" + 0.012*"georgette_gómez" + 0.009*"here" + 0.009*"san" + 0.009*"diego" + 0.008*"ca" + 0.007*"like"
2023-06-15 21:57:39,083 : INFO : topic #1 (0.061): 0.029*"jo" + 0.017*"jorgensen" + 0.014*"president" + 0.009*"jo_jorgensen" + 0.009*"voter" + 0.008*"not" + 0.008*"an" + 0.008*"donate_donate" + 0.007*"more" + 0.006*"vote"
2023-06-15 21:57:39,083 : INFO : topic #6 (0.061): 0.025*"democrat" + 0.019*"turn" + 0.012*"election" + 0.012*"his" + 0.011*"turn_out" + 0.009*"house" + 0.009*"he" + 0.009*"when_democrat" + 0.008*"washington" + 0.008*"american"
2023-06-15 21:57:39,084 : INFO : topic diff=0.313436, rho=0.377964
2023-06-15 21:57:3

2023-06-15 21:57:40,904 : INFO : topic diff=0.201279, rho=0.288675
2023-06-15 21:57:40,905 : INFO : PROGRESS: pass 11, at document #1000/1000
2023-06-15 21:57:41,246 : INFO : optimized alpha [0.034301646, 0.051970158, 0.043247785, 0.03621931, 0.037049994, 0.032857314, 0.058086403, 0.031127399, 0.032572597, 0.029008592]
2023-06-15 21:57:41,247 : INFO : topic #9 (0.029): 0.045*"georgette" + 0.021*"gómez" + 0.018*"georgette_gómez" + 0.015*"san" + 0.015*"diego" + 0.013*"here" + 0.012*"ca" + 0.011*"georgette_campaign" + 0.009*"san_diego" + 0.009*"community"
2023-06-15 21:57:41,248 : INFO : topic #7 (0.031): 0.012*"democrat" + 0.012*"add" + 0.011*"their" + 0.011*"turn" + 0.011*"stand" + 0.010*"name" + 0.010*"administration" + 0.010*"add_your" + 0.008*"do" + 0.008*"not"
2023-06-15 21:57:41,248 : INFO : topic #2 (0.043): 0.041*"kathleen" + 0.027*"williams" + 0.024*"montana" + 0.021*"kathleen_williams" + 0.011*"bozeman" + 0.011*"mt" + 0.011*"bozeman_mt" + 0.009*"montanan" + 0.008*"sure" + 0.008

2023-06-15 21:57:42,877 : INFO : topic #1 (0.047): 0.038*"jo" + 0.021*"jorgensen" + 0.015*"president" + 0.012*"voter" + 0.012*"jo_jorgensen" + 0.010*"donate_donate" + 0.008*"debate" + 0.008*"an" + 0.008*"not" + 0.007*"liberty"
2023-06-15 21:57:42,878 : INFO : topic #6 (0.058): 0.026*"democrat" + 0.022*"turn" + 0.014*"his" + 0.013*"election" + 0.012*"turn_out" + 0.011*"survey" + 0.011*"he" + 0.011*"no" + 0.010*"when_democrat" + 0.010*"american"
2023-06-15 21:57:42,878 : INFO : topic diff=0.113212, rho=0.235702
2023-06-15 21:57:42,879 : INFO : PROGRESS: pass 17, at document #1000/1000
2023-06-15 21:57:43,195 : INFO : optimized alpha [0.03297616, 0.046184264, 0.041746575, 0.03195829, 0.035326652, 0.030503538, 0.05793852, 0.031507038, 0.031090094, 0.025978088]
2023-06-15 21:57:43,197 : INFO : topic #9 (0.026): 0.045*"georgette" + 0.021*"gómez" + 0.018*"georgette_gómez" + 0.016*"san" + 0.015*"diego" + 0.013*"here" + 0.012*"ca" + 0.011*"georgette_campaign" + 0.009*"san_diego" + 0.009*"commun

In [36]:
from pprint import pprint

top_topics = model.top_topics(corpus)

# Every entry of top_topics pairs a topic's weighted terms with its coherence score.
# Average topic coherence is the sum of those scores divided by the number of topics.
coherence_scores = [coherence for _terms, coherence in top_topics]
avg_topic_coherence = sum(coherence_scores) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

2023-06-15 21:58:36,970 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -0.6395.
[([(0.044932503, 'georgette'),
   (0.0207895, 'gómez'),
   (0.01854753, 'georgette_gómez'),
   (0.015613562, 'san'),
   (0.014891664, 'diego'),
   (0.012631061, 'here'),
   (0.012253966, 'ca'),
   (0.010971293, 'georgette_campaign'),
   (0.009452152, 'san_diego'),
   (0.008687527, 'community'),
   (0.008326364, 'like'),
   (0.008269443, 'city'),
   (0.008022755, 'race'),
   (0.007767156, 'donation'),
   (0.0075004315, 'first'),
   (0.007247736, 'california'),
   (0.007212055, 'friend'),
   (0.00719431, 'working'),
   (0.0070423754, 'council'),
   (0.006995589, 'city_council')],
  -0.22188610499886713),
 ([(0.041106887, 'kathleen'),
   (0.02731716, 'williams'),
   (0.023794232, 'montana'),
   (0.021365412, 'kathleen_williams'),
   (0.011185878, 'bozeman'),
   (0.011042638, 'mt'),
   (0.010983549, 'bozeman_mt'),
   (0.008819825, 'montanan'),
   (0.008769716, 'sure'),
   (0.007844163, 'donation'),
   (0.007743464, 'running'),
   (0.0075398865, 'send'),
  

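Note: the individual words that make up each bigram (e.g. `kathleen` and `williams` alongside `kathleen_williams`) also tended to come up often among the top topic terms, especially with names.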