I am using Enron Email Dataset taken from https://www.cs.cmu.edu/~./enron/, it is about 1.3 gb in size and contains half a million emails.

In [1]:
import pandas as pd
import re
import nltk
import gensim
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
stemmer=WordNetLemmatizer()
en_stop=nltk.corpus.stopwords.words('english')
sns.set()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\risha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df=pd.read_csv('emails.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [5]:
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [6]:
df.iloc[0].message

"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "

In [7]:
def preprocess_text(doc):
    #remove special chars
    doc=re.sub(r'\W',' ',doc)
    
    #remove single chars
    doc=re.sub(r'\s+[a-zA-Z]\s+',' ',doc)
    
    #remove single chars from start
    doc=re.sub(r'\s+[a-zA-Z]\s+',' ',doc)
    
    #remove multiple spaces with single space
    doc=re.sub(r'\s+',' ',doc,flags=re.I)
    
    #remove prefixed 'b'
    doc=re.sub(r'^b\s+','',doc)
    
    doc=doc.lower()
    tokens=doc.split()
    tokens=[stemmer.lemmatize(word) for word in tokens]
    tokens=[word for word in tokens if word not in en_stop]
    tokens=[word for word in tokens if len(word) > 5]
    
    return tokens

In [8]:
preprocess_text(df.iloc[0].message)

['message',
 '18782981',
 '1075855378110',
 'javamail',
 'phillip',
 'belden',
 'subject',
 'version',
 'content',
 'charset',
 'content',
 'transfer',
 'encoding',
 'phillip',
 'belden',
 'belden',
 'enronxgate',
 'folder',
 'phillip_allen_jan2002_1',
 'phillip',
 'origin',
 'filename',
 'pallen',
 'privileged',
 'forecast']

In [9]:
df.columns

Index(['file', 'message'], dtype='object')

In [10]:
df.file[100]

'allen-p/_sent_mail/185.'

In [11]:
df.message[100]

"Message-ID: <15940494.1075855689309.JavaMail.evans@thyme>\nDate: Wed, 16 Aug 2000 05:35:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: cooper.richey@enron.com\nSubject: Re:\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Cooper Richey\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Dec2000\\Notes Folders\\'sent mail\nX-Origin: Allen-P\nX-FileName: pallen.nsf\n\nI tried the new address but I don't have access.  also, what do I need to \nenter under domain?"

In [12]:
messages=df.message.values.tolist()

In [13]:
message_tokens=[]
for msg in tqdm(messages):
    message_tokens.append(preprocess_text(msg))

100%|█████████████████████████████████████████████████████████████████████████| 517401/517401 [26:05<00:00, 330.55it/s]


# Latent Dirichlet Allocation model

In [14]:
from gensim import corpora

In [15]:
input_dict=corpora.Dictionary(message_tokens)

In [17]:
input_corpus=[input_dict.doc2bow(token,allow_update=True) for token in message_tokens]

In [19]:
input_corpus[:2]

[[(0, 1),
  (1, 1),
  (2, 3),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 3),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)],
 [(3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 3),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 4),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 3),
  (36, 5),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1)]]

In [21]:
input_dict.get(243)

'extremely'

In [22]:
lda_model=gensim.models.ldamodel.LdaModel(input_corpus,num_topics=4,id2word=input_dict,passes=20)

In [29]:
topics=lda_model.print_topics(num_words=10)
topics

[(0,
  '0.039*"watson" + 0.021*"kwatson" + 0.016*"kimberly" + 0.013*"hotmail" + 0.011*"pybarbo" + 0.010*"williams" + 0.009*"cherry" + 0.008*"border" + 0.007*"harris" + 0.007*"height"'),
 (1,
  '0.113*"recipient" + 0.017*"williams" + 0.011*"michael" + 0.009*"zipper" + 0.009*"content" + 0.009*"notesaddr" + 0.007*"enron_development" + 0.006*"whalley" + 0.006*"taylor" + 0.006*"robert"'),
 (2,
  '0.052*"content" + 0.048*"subject" + 0.041*"message" + 0.039*"folder" + 0.027*"version" + 0.026*"transfer" + 0.026*"charset" + 0.026*"encoding" + 0.025*"filename" + 0.025*"origin"'),
 (3,
  '0.011*"energy" + 0.010*"company" + 0.008*"market" + 0.007*"service" + 0.007*"business" + 0.005*"trading" + 0.004*"information" + 0.004*"report" + 0.004*"product" + 0.004*"customer"')]

In [30]:
test_text='where to put this folder for trasferring this data content'

In [31]:
test=preprocess_text(test_text)
test=input_dict.doc2bow(test)

res_probabs=lda_model.get_document_topics(test)
res_probabs

[(0, 0.08335087), (1, 0.084231295), (2, 0.74892306), (3, 0.08349477)]