In [1]:
from os import path
from collections import Counter
#import pycountry
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re # for removing numbers
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim import models,corpora
%matplotlib inline



In [2]:
#Get csv file
df = pd.read_csv(path.join('hillary-clinton-emails', 'emails.csv'))

Since RawText contains all messages and we want to get the idea of the discussed topics we will use RawText

In [3]:
#Example
df['RawText'][0]

'UNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER.\nRELEASE IN FULL\nFrom: Sullivan, Jacob J <Sullivan11@state.gov>\nSent: Wednesday, September 12, 2012 10:16 AM\nTo:\nSubject: FW: Wow\nFrom: Brose, Christian (Armed Services) (mailto:Christian_Brose@armed-servic,essenate.govi\nSent: Wednesday, September 12, 2012 10:09 AM\nTo: Sullivan, Jacob J\nSubject: Wow\nWhat a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me\ntoo\nUNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. STATE-5CB0045247\n\x0c'

In [4]:
useful_data=df['RawText']# useful_data=df['RawText']
#Delete words with numbers
without_num = [re.sub(r'\d+', '', t) for t in useful_data]

In [5]:
type(without_num)

list

In [6]:
#tokenize the data
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokennized_text=[tokenizer.tokenize(d) for d in without_num]

Delete words that are from stopwords and also the ones that are less than 3 letters.

In [7]:
#stopwords
sw = set(nltk.corpus.stopwords.words('english'))
#get rid of unnecessary words
tokennized_text=[[s for s in t if (len(s)>2 and s not in sw)] for t in tokennized_text]

In [8]:
#convert list of lists to one flattened list for easier data analysis steps below
flattened = []
for t in tokennized_text:
    for w in t:
        flattened.append(w)

In [9]:
# now check words with highest frequency to see whether some of the words arent from email content 
from collections import Counter
counts = Counter(flattened)
#print most common words
print(counts.most_common(200))

[('State', 29830), ('Department', 28674), ('UNCLASSIFIED', 26910), ('Date', 26826), ('Case', 26564), ('Doc', 26539), ('From', 19269), ('Sent', 18847), ('Subject', 18421), ('state', 12797), ('The', 11733), ('gov', 11346), ('RELEASE', 7951), ('Message', 7314), ('Original', 7244), ('com', 6675), ('would', 5684), ('Huma', 5681), ('said', 5579), ('Cheryl', 5531), ('Abedin', 5191), ('clintonemail', 4995), ('Mills', 4763), ('PART', 4347), ('Secretary', 4005), ('Sullivan', 3908), ('call', 3824), ('FULL', 3588), ('Obama', 3548), ('Jacob', 3463), ('also', 3452), ('one', 3442), ('time', 3370), ('government', 3251), ('Clinton', 3088), ('President', 3022), ('people', 2969), ('new', 2863), ('This', 2813), ('know', 2783), ('HDR', 2764), ('STATE', 2646), ('work', 2579), ('like', 2532), ('But', 2525), ('get', 2504), ('could', 2480), ('United', 2428), ('AbedinH', 2382), ('American', 2379), ('security', 2342), ('http', 2285), ('two', 2218), ('May', 2165), ('Friday', 2129), ('And', 2091), ('Thursday', 207

By checking the most common words above we can see that around first 15 words are not from content of the messages in emails. These words are coming from email headers that doesn't represent the email content So, it would be safe to delete them for topic modeling of the conversations. 

In [10]:
# get those top 15 words that arent useful for our purposes
highest_frequency=counts.most_common(15)

In [11]:
unuseful_words=[]
for i in range(0,len(highest_frequency)):
    unuseful_words.append(highest_frequency[i][0])

In [12]:
#add other unuseful words from high frequency words that seems are out of context
unuseful_words.append('www')
unuseful_words.append('http')
unuseful_words.append('com')
unuseful_words.append('PART')
unuseful_words.append('FULL')
unuseful_words.append('HDR')
#print unuseful words
unuseful_words

['State',
 'Department',
 'UNCLASSIFIED',
 'Date',
 'Case',
 'Doc',
 'From',
 'Sent',
 'Subject',
 'state',
 'The',
 'gov',
 'RELEASE',
 'Message',
 'Original',
 'www',
 'http',
 'com',
 'PART',
 'FULL',
 'HDR']

Update tokenized text after deleting the not useful words

In [13]:
tokennized_text_n=[[s for s in t if s not in unuseful_words] for t in tokennized_text]

Delete sentences(documents) with less than or equal to 5 words since they dont contain any useful information for topic modeling.

In [14]:
tokennized_text_=[]
for t in tokennized_text_n:
    if(len(t)>5):
        tokennized_text_.append(t)

Now we are ready for topic modelling from our documents.

In [20]:
# get statistics about all tokens
dictionary = corpora.Dictionary(tokennized_text_)
#create a corpus of documents
corpus = [dictionary.doc2bow(t) for t in tokennized_text_]
#apply LDA model
lda=models.LdaModel(corpus, num_topics=5,id2word=dictionary)

In [21]:
lda.print_topics()

[(0,
  '0.007*"said" + 0.006*"Obama" + 0.004*"would" + 0.003*"Clinton" + 0.003*"Haiti" + 0.003*"House" + 0.002*"President" + 0.002*"also" + 0.002*"time" + 0.002*"people"'),
 (1,
  '0.005*"would" + 0.004*"government" + 0.004*"Israel" + 0.003*"one" + 0.003*"people" + 0.003*"American" + 0.003*"new" + 0.003*"said" + 0.003*"also" + 0.003*"United"'),
 (2,
  '0.008*"Cheryl" + 0.008*"Mills" + 0.006*"Sullivan" + 0.006*"Jacob" + 0.004*"clintonemail" + 0.004*"would" + 0.003*"MillsCD" + 0.003*"said" + 0.003*"Feb" + 0.003*"Monday"'),
 (3,
  '0.012*"Secretary" + 0.008*"Office" + 0.005*"Cheryl" + 0.004*"Mills" + 0.003*"Room" + 0.003*"President" + 0.003*"MEETING" + 0.003*"route" + 0.003*"ARRIVE" + 0.002*"DEPART"'),
 (4,
  '0.020*"Huma" + 0.018*"Abedin" + 0.014*"call" + 0.012*"clintonemail" + 0.008*"AbedinH" + 0.008*"Sullivan" + 0.006*"Jacob" + 0.005*"hrod" + 0.004*"Sunday" + 0.004*"May"')]

In [17]:
#example with many topics,e.g.15 topics
lda=models.LdaModel(corpus, num_topics=15,id2word=dictionary)
lda.print_topics()

[(0,
  '0.008*"Lissa" + 0.007*"Muscatine" + 0.006*"Israel" + 0.004*"said" + 0.003*"Israeli" + 0.003*"American" + 0.003*"one" + 0.003*"draft" + 0.002*"also" + 0.002*"government"'),
 (1,
  '0.005*"David" + 0.004*"would" + 0.004*"health" + 0.003*"family" + 0.003*"Friends" + 0.003*"Haiti" + 0.003*"time" + 0.002*"care" + 0.002*"letter" + 0.002*"Rio"'),
 (2,
  '0.011*"SES" + 0.011*"Huma" + 0.011*"NEWS" + 0.010*"Abedin" + 0.009*"Reuters" + 0.008*"AbedinH" + 0.007*"O_Shift" + 0.006*"Mahogany" + 0.006*"said" + 0.005*"Cheryl"'),
 (3,
  '0.005*"Germany" + 0.003*"freedom" + 0.003*"disc" + 0.003*"said" + 0.003*"world" + 0.003*"American" + 0.003*"new" + 0.002*"Europe" + 0.002*"history" + 0.002*"Berlin"'),
 (4,
  '0.011*"Secretary" + 0.009*"Office" + 0.005*"Room" + 0.004*"MEETING" + 0.004*"route" + 0.004*"ARRIVE" + 0.004*"DEPART" + 0.003*"India" + 0.003*"Residence" + 0.003*"Private"'),
 (5,
  '0.006*"would" + 0.004*"said" + 0.004*"one" + 0.004*"know" + 0.003*"get" + 0.003*"people" + 0.003*"could" + 0

### Conclusion
There is not any exact decisive rule to decide how many topics would be sufficient. From checking different numbers we come up to conclusion that 5 topics would be good enough. Increasing to more topics does not add very useful topic name content,but as above we show as an example choosing 15 topics also is not illogical.