In [7]:
import re
import operator
from pprint import pprint
from random import sample
from collections import Counter, Iterable

import numpy as np
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

from gensim.models import Phrases

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [8]:
aliases = pd.read_csv("Aliases.csv")
receivers = pd.read_csv("EmailReceivers.csv")
emails = pd.read_csv("Emails.csv")
persons = pd.read_csv("Persons.csv")

In [9]:
print(aliases.shape)
print(receivers.shape)
print(emails.shape)
print(persons.shape)

(850, 3)
(9306, 3)
(7945, 22)
(513, 2)


In [10]:
aliases.head()

Unnamed: 0,Id,Alias,PersonId
0,1,111th congress,1
1,2,agna usemb kabul afghanistan,2
2,3,ap,3
3,4,asuncion,4
4,5,alec,5


In [11]:
receivers.head()

Unnamed: 0,Id,EmailId,PersonId
0,1,1,80
1,2,2,80
2,3,3,228
3,4,3,80
4,5,4,80


In [12]:
emails.head()

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...
3,4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\nU.S. Department of State\nCase N...
4,5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\nFriday, March 11,...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


In [13]:
persons.head()

Unnamed: 0,Id,Name
0,1,111th Congress
1,2,AGNA USEMB Kabul Afghanistan
2,3,AP
3,4,ASUNCION
4,5,Alec


In [14]:
def sort_dict(d):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [15]:
sort_dict(Counter((" ".join(emails['ExtractedBodyText'].astype(str).tolist())).split(" ")))[:10]

[('the', 25616),
 ('to', 15707),
 ('of', 12753),
 ('and', 12449),
 ('a', 9781),
 ('in', 8903),
 ('that', 5542),
 ('is', 5403),
 ('for', 5278),
 ('on', 4598)]

In [16]:
body_text = emails[~emails['ExtractedBodyText'].isnull()]['ExtractedBodyText'].astype(str).tolist()

### Visualizaton

In [17]:
def plotly_scatter(x, y, labels):
    trace = go.Scatter(
        x=x, y=y,
        mode='markers',
        text=labels,
        hoverinfo='text',
        marker=dict(opacity=0.5, symbol='cross-dot', color='lightseagreen')
    )
    MARGIN = dict(l=0, r=30, b=0, t=30)
    layout = go.Layout(height=350, width=500, hovermode='closest', margin=MARGIN)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, show_link=False)

### Text preparation

In [18]:
stop_words = list(map(lambda x: x.strip(), 
                 open("stopwords.txt", "r").readlines()))

message_base = [x.lower() for x in body_text]

In [20]:
def tokenize(input_docs):
    tokenized_docs = []
    for j, mail in enumerate(input_docs):
        tokenized_sentences = []

        for sentence in mail.split("\n"):
            processed_str = sentence.split()
            tokenized_sentences.append(processed_str)
        tokenized_docs.append(tokenized_sentences)

    return tokenized_docs

tokenized_messages = tokenize(message_base)

In [21]:
re_alpha = re.compile(r'[^a-zA-Z]')


def remove_non_alpha(raw_texts):
    filtered_texts = []
    for raw_text in raw_texts:
        text = []
        for tokenized_sentence in raw_text:
            sentence = " ".join(tokenized_sentence)
            sentence = re_alpha.sub(' ', sentence).strip()
            text.append(sentence.split())

        filtered_texts.append(text)
    return filtered_texts


def remove_short_sentences(raw_texts):
    filtered_texts = []
    for raw_text in raw_texts:
        text = []
        for tokenized_sentence in raw_text:
            if len(tokenized_sentence) > 1:
                text.append(tokenized_sentence)
        if text:
            filtered_texts.append(text)
    return filtered_texts


def remove_dates(raw_texts):
    week_days = ['monday', 'tuesday', 'wednesday', 
                 'thursday', 'friday', 'saturday', 'sunday']
    filtered_texts = []
    for raw_text in raw_texts:
        text = []
        for tokenized_sentence in raw_text:
            if tokenized_sentence:
                first_word = re_alpha.sub(' ', tokenized_sentence[0]).strip()
                if not first_word in week_days:
                    text.append(tokenized_sentence)
        filtered_texts.append(text)
    return filtered_texts


def filter_tokens_by_condition(tokenized_texts, condition, debug=False):
    filtered_texts = []
    for raw_text in tokenized_texts:
        text = []
        for raw_sentence in raw_text:
            sentence = []
            for token in raw_sentence:
                if condition(token):
                    sentence.append(token)
                elif debug:
                    print(token)
            text.append(sentence)
        filtered_texts.append(text)
    return filtered_texts
    

def remove_stopwords(tokenized_texts):
    condition = lambda token: not (token in stop_words)
    return filter_tokens_by_condition(tokenized_texts, condition)


def remove_links(tokenized_texts):
    condition = lambda token: not ("http" in token or "www." in token)
    return filter_tokens_by_condition(tokenized_texts, condition)


def remove_emails(tokenized_texts):
    condition = lambda token: not ("@" in token)
    return filter_tokens_by_condition(tokenized_texts, condition)


def remove_shorts(tokenized_texts):
    condition = lambda token: len(token) > 1
    return filter_tokens_by_condition(tokenized_texts, condition)

In [22]:
filter_pipeline = [remove_emails, 
                    remove_dates,
                    remove_links,
                    remove_non_alpha,
                    remove_stopwords,
                    remove_short_sentences]

filtered_messages = tokenized_messages
for func in filter_pipeline:
    filtered_messages = func(filtered_messages)
    
len(filtered_messages)

5541

In [102]:
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
       
tokens_counter = Counter(list(flatten(filtered_messages)))
sorted(tokens_counter.items(), key=operator.itemgetter(1), reverse=True)[:10]

[('state', 1927),
 ('pm', 1847),
 ('would', 1533),
 ('secretary', 1349),
 ('u', 1255),
 ('obama', 1238),
 ('said', 1234),
 ('one', 1178),
 ('president', 1126),
 ('office', 1113)]

### TF Vectorizer

In [23]:
tf_data = []

for j, mail in enumerate(filtered_messages):
    sentences = []
    for tokenized_sentence in mail:
        sentences.append(" ".join(tokenized_sentence))
    tf_data.append(". ".join(sentences))

In [24]:
vectorizer = TfidfVectorizer(max_features=3000, min_df=3)
X = vectorizer.fit_transform(tf_data)

X.shape

(5541, 3000)

In [31]:
def shrink_string(s, max_len=50):
    if len(s) > max_len:
        return s[:max_len] + "..."
    else:
        return s

In [32]:
samples = sample(list(zip(X.toarray(), tf_data)), 1000)

sample_points = [x[0] for x in samples]
sample_labels = [shrink_string(x[1]) for x in samples]

In [26]:
X_embedded = TSNE(n_components=2).fit_transform(sample_points)

In [33]:
plotly_scatter([x[0] for x in X_embedded], 
         [x[1] for x in X_embedded], sample_labels);

### Doc2Vec

In [34]:
from collections import namedtuple
import re

TaggedSentence = namedtuple('SentimentDocument', 'words tags')

tagged_sentences = []

for j, mail in enumerate(filtered_messages):
    for sentence in mail:
        tagged_sentences.append(TaggedSentence(sentence, 
                                               ["doc_{}".format(j)]))

In [84]:
from gensim.models import Doc2Vec, Word2Vec

model = Doc2Vec(vector_size=300, min_count=6, window=6, workers=4)
model.build_vocab(tagged_sentences)
n_epochs = 6

len(model.wv.vocab.keys())

7076

In [85]:
for epoch in range(n_epochs):
    print("epoch", epoch + 1)
    model.train(tagged_sentences, total_examples=len(tagged_sentences), epochs=n_epochs)

epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6


In [86]:
X = []
for k in range(len(model.docvecs)):
    X.append(model.docvecs["doc_{}".format(str(k))])

In [87]:
samples = sample(list(zip(X, tf_data)), 1000)

sample_points = [x[0] for x in samples]
sample_labels = [shrink_string(x[1]) for x in samples]

In [88]:
X_embedded = TSNE(n_components=2).fit_transform(sample_points)

In [89]:
plotly_scatter([x[0] for x in X_embedded], 
         [x[1] for x in X_embedded], sample_labels);