In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from helpers import *
import email
from nltk.tokenize.regexp import RegexpTokenizer
import itertools
import re
from nltk.corpus import stopwords
import string
import nltk
from sklearn.metrics.pairwise import linear_kernel

## Load the Enron dataset

In [None]:
# Read the raw e-mail table from 'emails.csv' (path is relative to the working directory).
# Later cells rely on it having a 'file' and a 'message' column.
emails_df = pd.read_csv('emails.csv')

In [None]:
# Preview the first rows of the raw table
emails_df.head()

In [None]:
# Report the size of the full corpus before sub-sampling
print('number of documents in the Enron dataset are:', len(emails_df), 'we will take a subset of this.')

## Take a subset of the dataset for exploration

In [None]:
# Draw 5000 rows at random (random_state=1 keeps the draw reproducible)
# and renumber them 0..4999 so positional and label indexing agree.
emails_df_sub = emails_df.sample(5000, random_state=1).reset_index(drop=True)

In [None]:
# Look at one raw message (row label 1) to see the header/body layout
emails_df_sub['message'][1]

## Pre-processing the data

We would like to parse the message column into the respective columns within it. 
1. Make a new column for each key in the message column
2. Split the email addresses 
3. Extract user name from file name

In [None]:
# Parse every raw message string into an email message object, then discard
# the raw text column now that its contents live in `messages`
messages = [email.message_from_string(raw_text) for raw_text in emails_df_sub['message']]
emails_df_sub.drop(labels='message', axis='columns', inplace=True)

In [None]:
# Show the (header, value) pairs of the first parsed message
messages[0].items()

In [None]:
# Use the headers of the first message as the template and create one
# column per header, filled with that header's value from every message
keys = messages[0].keys()
for header in keys:
    header_values = [msg[header] for msg in messages]
    emails_df_sub[header] = header_values

In [None]:
# Extract the body text of every message into a 'content' column.
# get_text_from_email is not defined in this notebook — presumably it comes
# from the `helpers` star import; verify.
emails_df_sub['content'] = [get_text_from_email(msg) for msg in messages]

In [None]:
# Run split_email_addresses over both address columns
for address_column in ('From', 'To'):
    emails_df_sub[address_column] = emails_df_sub[address_column].map(split_email_addresses)

In [None]:
# The first '/'-separated component of 'file' becomes the 'user' id
emails_df_sub['user'] = [path.split('/')[0] for path in emails_df_sub['file']]
# The parsed message objects are no longer needed
del messages

In [None]:
# Preview the frame after the headers, content and user columns were added
emails_df_sub.head()

In [None]:
print('shape of the dataframe:', emails_df_sub.shape)
# Print the number of distinct values held by each column
for column_name in emails_df_sub.columns:
    distinct_count = emails_df_sub[column_name].nunique()
    print(column_name, distinct_count)

## Group the emails by the user 

In [None]:
# Build one document per user by concatenating all of that user's e-mail bodies.
# - A single groupby is enough (the chain was previously applied twice to an
#   already-grouped frame, which was a no-op).
# - Bodies are joined with a space so the last word of one e-mail does not fuse
#   with the first word of the next.
# - Missing bodies are skipped instead of breaking the concatenation.
emails_df_sub_grouped = (
    emails_df_sub
    .groupby('user')['content']
    .apply(lambda bodies: ' '.join(bodies.dropna().astype(str)))
    .reset_index()
)

## More pre-processing

Now, we can pre-process the emails by doing the following:
1. removing website urls
2. standardizing words
3. removing punctuation
4. removing all numbers
5. removing stop words
6. removing chat words

In [None]:
# Vocabularies that clean() filters against
stop = stopwords.words('english')  # NLTK English stop words
exclude = set(string.punctuation)  # characters to strip: ASCII punctuation ...
exclude.update({'\n', '\t'})  # ... plus newline and tab
# Lower-cased, de-duplicated, sorted vocabulary of NLTK's nps_chat corpus.
# NOTE(review): presumably requires the corpus to be downloaded beforehand — confirm.
chat_words_lower = sorted(set(w.lower() for w in nltk.corpus.nps_chat.words()))

In [None]:
def clean(doc, stop_words=None, chat_words=None, punctuation=None):
    """Normalise the 'content' field of one record and return the cleaned text.

    Steps, in order:
      1. squeeze any run of a repeated character down to at most two
      2. remove URLs (every token that starts with ``http``)
      3. remove digits
      4. lower-case, split on ANY whitespace, drop stop words and chat words
      5. remove punctuation characters

    Parameters
    ----------
    doc : dict or DataFrame row
        Record holding the text under the key ``'content'``. The cleaned text
        is written back to ``doc['content']``.
    stop_words, chat_words, punctuation : iterable of str, optional
        Vocabularies to filter with. When omitted, the notebook-level
        ``stop``, ``chat_words_lower`` and ``exclude`` are used.

    Returns
    -------
    str
        The cleaned text; ``''`` when the record has no string content.
    """
    content = doc.get('content')
    # Missing / NaN / non-text content is the only expected failure; handle it
    # explicitly. A blanket `except:` would also hide real errors (e.g. a
    # vocabulary that was never loaded) and silently blank every document.
    if not isinstance(content, str):
        doc['content'] = ''
        return ''

    # Sets give constant-time membership tests
    stop_set = set(stop if stop_words is None else stop_words)
    chat_set = set(chat_words_lower if chat_words is None else chat_words)
    punct_set = set(exclude if punctuation is None else punctuation)

    text = ''.join(''.join(run)[:2] for _, run in itertools.groupby(content))
    text = re.sub(r'http\S+', '', text)
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Split on any whitespace: words separated by newlines or tabs must be
    # filtered individually and must not be glued together afterwards.
    tokens = [
        token for token in text.lower().split()
        if token not in stop_set and token not in chat_set
    ]
    text = ''.join(ch for ch in ' '.join(tokens) if ch not in punct_set)

    doc['content'] = text
    return text

In [None]:
# Clean every user's concatenated text; axis=1 hands each row to clean()
emails_df_sub_grouped['content'] = emails_df_sub_grouped.apply(clean, axis=1)

In [None]:
# Preview the cleaned per-user documents
emails_df_sub_grouped.head()

In [None]:
# Hold out 30% of the users for testing.
# The complement is taken on the ORIGINAL index labels and the indices are
# reset only afterwards. Resetting the training index first would turn
# `training.index` into 0..k-1, so the complement would be rows k..n-1 of the
# grouped frame no matter which rows were sampled, leaking training rows
# into the test set.
training = emails_df_sub_grouped.sample(frac=0.7, random_state=0)
testing = emails_df_sub_grouped.drop(training.index).reset_index(drop=True)
training = training.reset_index(drop=True)

In [None]:
# Display the held-out rows
testing

In [None]:
# Convert both frames to plain Python lists with one [user, content] row per user
training_corpus = training.values.tolist()
testing_corpus = testing.values.tolist()

## Applying CountVectorizer

In [None]:
# import
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Instantiate a word-level count vectorizer:
# - ngram_range=(1,2): unigrams and bigrams
# - min_df=0.5: keep only terms that occur in at least 50% of the documents
# - max_df=0.95: drop terms that occur in more than 95% of the documents
# - stop_words='english': scikit-learn's built-in English stop list
vect = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.5, max_df=0.95, stop_words = 'english')

In [None]:
# Learn the vocabulary from the training texts and build the count matrix.
# Each corpus row is a [user, content] pair; only the content is vectorized.
training_texts = [text for _user, text in training_corpus]
tf_matrix = vect.fit_transform(training_texts)

n_documents, n_features = tf_matrix.shape[0], tf_matrix.shape[1]
print('Number of documents:', n_documents, ', number of features:', n_features)

In [None]:
# List the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
list(vect.get_feature_names_out()) if hasattr(vect, 'get_feature_names_out') else vect.get_feature_names()

In [None]:
# Convert the sparse count matrix to a dense array for display only
# (the result is not stored)
tf_matrix.toarray()

In [None]:
# Examine the vocabulary and document-term matrix together:
# one row per training document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when get_feature_names_out() is unavailable (scikit-learn < 1.0).
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
feature_matrix = pd.DataFrame(tf_matrix.toarray(), columns=feature_names)
feature_matrix

## Applying TF_IDF

The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.
    
The formula that is used to compute the tf-idf of term $t$ in document $d$ is
$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, and the idf is computed as
$\text{idf}(t) = \log\left[\frac{n}{\text{df}(t)}\right] + 1$ (if ``smooth_idf=False``),
where $n$ is the total number of documents and $\text{df}(t)$ is the
document frequency; the document frequency is the number of documents
that contain term $t$. The effect of adding "1" to the idf in the equation
above is that terms with zero idf, i.e., terms that occur in all documents
in a training set, will not be entirely ignored.
(Note that the idf formula above differs from the standard
textbook notation that defines the idf as
$\text{idf}(t) = \log\left[\frac{n}{\text{df}(t) + 1}\right]$.)

With the default ``smooth_idf=True``, which the vectorizer below uses, 1 is added to
both the numerator and the denominator to prevent division by zero:
$\text{idf}(t) = \log\left[\frac{1 + n}{1 + \text{df}(t)}\right] + 1$.

In [None]:
# import
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Instantiate a tf-idf vectorizer with the same n-gram and document-frequency
# settings as the count vectorizer above.
# norm='l2' scales every document vector to unit length, so a plain dot
# product between two rows equals their cosine similarity.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.5, max_df=0.95, stop_words = 'english', norm='l2')

In [None]:
# Learn vocabulary and idf weights from the training texts and build the
# tf-idf matrix. Each corpus row is a [user, content] pair; only the content
# is vectorized.
training_texts = [text for _user, text in training_corpus]
tfidf_matrix = tf.fit_transform(training_texts)

n_documents, n_features = tfidf_matrix.shape[0], tfidf_matrix.shape[1]
print('Number of documents:', n_documents, ', number of features:', n_features)

In [None]:
# List the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
list(tf.get_feature_names_out()) if hasattr(tf, 'get_feature_names_out') else tf.get_feature_names()

In [None]:
# Convert the sparse tf-idf matrix to a dense array for display only
# (the result is not stored)
tfidf_matrix.toarray()

In [None]:
# Examine the vocabulary and tf-idf document-term matrix together:
# one row per training document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when get_feature_names_out() is unavailable (scikit-learn < 1.0).
feature_names = (
    tf.get_feature_names_out()
    if hasattr(tf, 'get_feature_names_out')
    else tf.get_feature_names()
)
feature_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
feature_matrix

## Testing Similarity using Cosine

For this part, we take an observation from outside the training set (i.e. from the testing set) and rank the training documents by their cosine similarity to it.

In [None]:
# Row 13 of the held-out corpus ([user, content]) is used as the query document
testing_corpus[13]

In [None]:
# Wrap the single row in a list so it can be iterated like a corpus
new_email = [testing_corpus[13]]

In [None]:
# Vectorize the query with the vocabulary and idf weights already fitted on
# the training set (transform only, no re-fitting)
query_texts = [text for _user, text in new_email]
new = tf.transform(query_texts)

In [None]:
def find_similar(tfidf_matrix, new_movie, top_n = 1000):
    """Rank the rows of ``tfidf_matrix`` by similarity to ``new_movie``.

    The score is the dot product computed by ``linear_kernel``.

    Parameters
    ----------
    tfidf_matrix : matrix of reference documents, one row per document
    new_movie : vectorized query document (single row)
    top_n : maximum number of results to return

    Returns
    -------
    list of (row_index, score) tuples, highest score first
    """
    scores = linear_kernel(new_movie, tfidf_matrix).flatten()
    # argsort is ascending, so reverse it to put the best match first
    best_first = scores.argsort()[::-1]
    return [(row, scores[row]) for row in best_first[0:top_n]]

In [None]:
# Collect one record per training document and build the frame in one go.
# Growing a DataFrame with .append() inside a loop copies the whole frame on
# every iteration, and DataFrame.append was removed in pandas 2.0.
similarity_records = [
    # training_corpus rows are [user, content]; element 0 identifies the person
    {'similarity_score': score, 'person': training_corpus[index][0]}
    for index, score in find_similar(tfidf_matrix, new)
]
# Explicit columns keep the layout stable even when there are no matches
similarity = pd.DataFrame(similarity_records, columns=['similarity_score', 'person'])
similarity.head()

http://www.foxnews.com/story/2004/07/08/fast-facts-key-enron-players.html