###Import Libraries

In [36]:
import pandas as pd
import numpy as np
import sqlite3
from nltk.corpus import stopwords
import nltk
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer


###Setup and Review Sqlite Database

In [2]:
con = sqlite3.connect('database.sqlite')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
[x for x in cursor.fetchall()]

[(u'Emails',), (u'Persons',), (u'Aliases',), (u'EmailReceivers',)]

###Read Email Data

In [3]:
emails= pd.read_sql_query("Select * From Emails where ExtractedBodyText!= ''",con)
persons=pd.read_sql_query("Select * From Persons",con)
longemails= pd.read_sql_query("Select * From Emails where length(ExtractedBodyText)>500 and ExtractedBodyText!= ''",con)


In [4]:
print emails.shape
print emails.columns

(6742, 22)
Index([u'Id', u'DocNumber', u'MetadataSubject', u'MetadataTo', u'MetadataFrom',
       u'SenderPersonId', u'MetadataDateSent', u'MetadataDateReleased',
       u'MetadataPdfLink', u'MetadataCaseNumber', u'MetadataDocumentClass',
       u'ExtractedSubject', u'ExtractedTo', u'ExtractedFrom', u'ExtractedCc',
       u'ExtractedDateSent', u'ExtractedCaseNumber', u'ExtractedDocNumber',
       u'ExtractedDateReleased', u'ExtractedReleaseInPartOrFull',
       u'ExtractedBodyText', u'RawText'],
      dtype='object')


###Clean up text of email body

In [5]:
# Uncomment the next line to download stop words if it's not already installed.
#nltk.download()

#print stopwords.words("english") # stop word list from the nltk corpus

####Notes:
* Need to remove any text after "U.S. Department of State","Sent from Verizon"
* Need to remove any strange non-English words
* How do I deal with emails that list schedule for the day?
* After defining topics , determine similarity between emails for clustering. Possibly clustering texts with KMeans
* Need to stem words? Keep a word dictionary for all stem words, they don't make sense otherwise
* Look specifically at emails with **fwd** 
* Assume that pis is pls


In [62]:
p_stemmer = SnowballStemmer("english")
l_stemmer=LancasterStemmer()
def review_to_words(email_body):
    # Function to convert a raw emails to a string of words
    # The input is a single string (a raw email), and 
    # the output is a single string (a preprocessed email)
    #
    # Remove HTML
    #email_text = BeautifulSoup(email_body).get_text() 
    #
    # Remove instances of \n
    email_text=re.sub("\n"," ", email_body)
    #
    # Remove boilerplate U.S. State Department 
    email_text=email_text.split("U.S. Department of State",1)[0]
    #
    # Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", email_text) 
    #
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # Convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    #
    # Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # Remove all words that are shorter than 2 characters
    actual_words = [w for w in meaningful_words if len(w)>2]   
    # 
    # Stem words
    #stemmed_tokens = [l_stemmer.stem(i) for i in actual_words]
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #
    return( " ".join( actual_words ))


In [63]:
review_to_words(emails["ExtractedBodyText"][3])

u'pis print hrod clintonernailcom wednesday september russorv state gov meet right wing extremist behind anti fvluslim film sparked deadly riots meat sent wednesday september subject meet right wing extremist behind anti muslim film sparked deadly riots htte maxbiumenthal com meet right wing extremist behind anti musiim tihn sparked deadly riots sent verizon wireless lte droid'

In [67]:
#review_to_words(emails["ExtractedBodyText"][3])
match = re.findall(r'[\w\.-]+@[\w\.-]+', emails["ExtractedBodyText"][3])
print match
match_new=re.sub("hrod17@clintonernailcom"|"Russorv@state.gov", ' ', emails["ExtractedBodyText"][3])
print match_new
emails["ExtractedBodyText"][3]

[u'hrod17@clintonernailcom', u'Russorv@state.gov']


TypeError: unsupported operand type(s) for |: 'str' and 'str'

In [35]:
# Get the number of emails based on the dataframe column size
num_emails = emails["ExtractedBodyText"].size

# Initialize an empty list to hold the clean reviews
clean_emails = []

# Loop over each email; create an index i that goes from 0 to the length
# of the emails 
for i in xrange( 0, num_emails ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print "Cleaning review %d of %d\n" % ( i+1, num_emails )                                                             
  
    # Call our function for each one, and add the result to the list of
    # clean reviews
    try:
        clean_emails.append( review_to_words( emails["ExtractedBodyText"][i] ) )
    except Exception as e:
        clean_emails.append( review_to_words("I'm a placeholder sentence."))
        print "Execption raised:", e

Cleaning review 1000 of 6742

Cleaning review 2000 of 6742

Cleaning review 3000 of 6742

Cleaning review 4000 of 6742

Cleaning review 5000 of 6742

Cleaning review 6000 of 6742



###Create a Bag-of-Words

In [44]:
clean_emails

[u'thursday march latest syr aid qaddaf sid hrc memo syr aid liby docx hrc memo syr aid liby docx march hil',
 u'thx',
 u'hrod clintonemail com friday march hum abedin latest syr aid qaddaf sid hrc memo syr aid liby docx pis print',
 u'pis print hrod clintonernailcom wednesday septemb russorv stat gov meet right wing extrem behind ant fvluslim film spark dead riot meat sent wednesday septemb subject meet right wing extrem behind ant muslim film spark dead riot htte maxbiumenth com meet right wing extrem behind ant musiim tihn spark dead riot sent verizon wireless lte droid',
 u'hrod clintonemail corn friday march hum abedin latest syr aid qaddaf sid hrc memo syr aid liby docx pis print',
 u'fyi',
 u'wednesday septemb fwd liby liby sept docx send direct sent verizon wireless lte druid',
 u'fyi',
 u'wednesday septemb fwd liby liby sept docx send direct sent verizon wireless lte druid',
 u'fyi',
 u'an mary slaught sunday march jacob mil cheryl reinesp stategov abedin hurtl piec liby nfz a

In [41]:
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = None) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_emails)

In [42]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab[:50]

[u'aaaaaaaabhm', u'aab', u'aaf', u'aar', u'aard', u'aaron', u'aaronovitch', u'aary', u'ab', u'aback', u'abandon', u'abb', u'abba', u'abbaw', u'abbot', u'abbrevy', u'abby', u'abc', u'abclel', u'abcnew', u'abd', u'abdel', u'abdelbaset', u'abdomin', u'abduc', u'abdulfatah', u'abduljalil', u'abdullah', u'abedin', u'abedinfl', u'abedinh', u'aber', u'abet', u'abey', u'abhor', u'abid', u'abingdon', u'abl', u'abm', u'abo', u'aboard', u'abol', u'abolit', u'abomin', u'abort', u'about', u'abr', u'abraham', u'abramoff', u'abramowitz']


In [123]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features.toarray(), axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist)[:100]:
    print tag, count

abandon 46
abba 46
abc 13
abdullah 8
abedin 213
abedinh 16
abid 9
abil 57
abl 123
aboard 7
abort 49
aboul 11
abr 15
abraham 13
abroad 38
absenc 26
absent 12
absolut 43
absorb 7
abu 21
abus 32
academ 28
academi 25
academia 7
acceler 12
accept 109
access 107
accid 8
accommod 17
accompani 14
accomplish 47
accord 226
accordingli 15
account 107
accur 14
accus 80
acheson 10
achiev 118
acknowledg 50
acorn 16
acquir 10
across 142
act 128
action 185
activ 143
activist 72
actor 21
actual 131
ad 177
adam 29
adapt 19
add 117
addit 145
address 200
adequ 20
adjust 21
adl 7
admin 7
administ 14
administr 441
admir 39
admit 36
adopt 48
adult 18
advanc 93
advantag 35
adversari 12
advertis 26
advic 44
advis 193
advisor 42
advisori 12
advoc 58
advocaci 30
af 33
afb 14
affair 154
affect 45
affili 17
affirm 16
afford 25
afghan 245
afghanistan 386
afp 22
afraid 11
africa 98
african 62
aftermath 13
afternoon 107
afterward 18
age 59
agenc 99
agenda 103
agent 22
aggress 35
ago 160
agre 219
agreement 167
agricul