In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import gensim 
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from matplotlib.pyplot import figure
from matplotlib.figure import Figure
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
import re
from stop_words import get_stop_words
stop_words = get_stop_words('en')
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sb
import random
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

   # Exploratory Analysis and Topic Modelling

*Daan van Kooten, 09-2018*

**This Kernel consists of:**
1. a quick exploratory analysis of the Clinton e-mails
2. Analyzing textbody with TF-IDF to find clusters of mails
3. Latent semantic analysis (LSA), analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms. 

***



## **Exploratory Analysis**

Main topics:
*  How many emails were sent
*  Who received how many emails
*  Who sent how many emails
*  When the emails were sent

**Import and select relevant data**

Select subject, bodytext, receiver and sender information.

In [None]:
# Read the raw e-mail table; the first row of the CSV holds the column names.
emails_csv = '../input/Emails.csv'
data_mail = pd.read_csv(emails_csv, header=0)
data_mail.head(5)

In [None]:
# Split the 'MetadataDateSent' stamp at "T" into a date part and a time part,
# then cut everything from "+" onwards off the time part.
timeStamp_split = data_mail['MetadataDateSent'].str.split("T")
Times = timeStamp_split.str[1]

# The .str accessors already return Series, so no re-wrapping in pd.Series is needed.
data_mail['Dates'] = timeStamp_split.str[0]
data_mail['Times'] = Times.str.split("+").str[0]

# Keep only the columns used by the analysis. The explicit copy makes data_sub
# (and everything filtered from it) independent of data_mail, so later column
# assignments do not raise SettingWithCopyWarning.
data_sub = data_mail[["MetadataSubject", "ExtractedBodyText", "MetadataTo", "MetadataFrom", "Times", "Dates"]].copy()
data_sub.head(5)

In [None]:
# Every row of data_sub is one e-mail.
n_mails = len(data_sub)
print('The total number of emails sent is', n_mails)

In [None]:
# Bar chart of the 20 most frequent recipients.
# - head(20) really yields 20 rows (the previous [0:19] slice yielded 19).
# - `kind` is passed by keyword: newer pandas rejects positional arguments to Series.plot.
figure(figsize=(20, 6))
ax = data_sub['MetadataTo'].value_counts().head(20).plot(kind='bar')
ax.set_title('Top 20 recipients')
ax.set_ylabel('number of emails');

**3. Who sent how many emails**

In [None]:
# Bar chart of the 20 most frequent senders.
# - head(20) really yields 20 rows (the previous [0:19] slice yielded 19).
# - `kind` is passed by keyword: newer pandas rejects positional arguments to Series.plot.
figure(figsize=(20, 6))
ax = data_sub['MetadataFrom'].value_counts().head(20).plot(kind='bar')
ax.set_title('Top 20 senders')
ax.set_ylabel('number of emails');

**4. When are the emails sent and received**

In [None]:
# Number of mails per calendar date, drawn as an interactive plotly line.
# The plot is rendered by plotly, so no matplotlib figure is opened here
# (the previous figure(...) call only produced an empty extra figure).
countDates = data_sub.Dates.groupby([data_sub.Dates]).agg(['count'])
countDates2 = [go.Scatter(x=list(countDates.index), y=countDates['count'])]

layout = dict(title='Count mails per day',
              xaxis=dict(title='date', ticklen=10, zeroline=False),  # x holds one entry per date, not per year
              yaxis=dict(title='number of emails')
             )
fig = dict(data=countDates2, layout=layout)
plotly.offline.iplot(fig)

In [None]:
## **Find similar words in corpus using Word2vec :**

#Word2vec is a two-layer neural net that processes text. Its input is a text corpus and its output is a set of vectors: feature vectors for the words in that corpus. Word2vec is a prediction-based model rather than a frequency-based one: it uses predictive analysis to make a weighted guess of a word co-occurring with its neighbouring words.

In [None]:
# Drop the mails that have no extracted body text and preview the first five bodies.
has_body = data_sub['ExtractedBodyText'].notnull()
data_rel = data_sub[has_body]
print(data_rel['ExtractedBodyText'].head(5))

In [None]:
# Keep only the mails whose body holds more than four whitespace-separated tokens.
tokens_per_mail = data_rel['ExtractedBodyText'].str.split().apply(len)
moreThan4Words = tokens_per_mail > 4
keep_mask = list(moreThan4Words)
data_rel = data_rel[keep_mask]
len(data_rel)

In [None]:
# Tokenise every mail body separately, giving one token list per mail.
# `words`  : gensim's simple_preprocess tokens.
# `words2` : NLTK word_tokenize tokens.
# Previously `words` was built from str(Series), i.e. from the *truncated
# printed representation* of the column, not from the mail texts themselves.
words = [gensim.utils.simple_preprocess(body) for body in data_rel['ExtractedBodyText']]
words2 = [word_tokenize(body) for body in data_rel['ExtractedBodyText']]

In [None]:
 model = gensim.models.Word2Vec(
            words2,
            size=350,
            window=10,
            min_count=15,
            workers=30)
model.train(words, total_examples=len(words), epochs=10)

#model.wv.vocab

**Example: 'Minister'**

In [None]:
#model.wv.most_similar(positive = 'Minister')

# Preprocessing data
* Lower casing
* Punctuation removal
* Stopwords removal
* Remove numbers

In [None]:
# Normalise the body text: lower-case it, then keep only runs of word
# characters joined by single spaces (this drops punctuation).
# assign() returns a new frame instead of writing into a filtered slice, and the
# regex is a raw string so "\w" is not treated as a string escape.
data_rel = data_rel.assign(
    ExtractedBodyText=data_rel["ExtractedBodyText"]
    .str.lower()
    .apply(lambda text: " ".join(re.findall(r'\w+', text)))
)

def remove_stopWords(s):
    '''Return `s` with every whitespace-separated token that appears in the
    module-level `stop_words` collection removed; the remaining tokens are
    joined by single spaces.
    '''
    kept = []
    for token in s.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Remove stop words from every mail body.
data_rel["ExtractedBodyText"] = data_rel["ExtractedBodyText"].apply(remove_stopWords)
# Remove digit runs. regex=True is required: on pandas >= 2.0 str.replace treats
# the pattern as a literal string by default and would leave the numbers in place.
data_rel["ExtractedBodyText"] = data_rel["ExtractedBodyText"].str.replace(r'\d+', '', regex=True)

print(data_rel["ExtractedBodyText"])


In [None]:
# Word cloud over the concatenation of all cleaned mail bodies.
all_mail_text = ' '.join(data_rel["ExtractedBodyText"])
wordcloud2 = WordCloud(
    background_color="white",
    width=550,
    height=500,
    max_words=100,
    max_font_size=50,
).generate(all_mail_text)

plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")
plt.show()

## **Analyzing textbody with TF-IDF to find clusters of mails**

 TF-IDF is short for term frequency–inverse document frequency. It is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
 
*  Firstly I made a quick plot to visualize this matrix. To do this I first needed to make a 3d representation of the DTM (document-term matrix) using PCA. 
* Secondly, I wanted to find out what the top keywords were in all the emails. 
*  Thirdly KMeans is a popular clustering algorithm used in machine learning, where K stands for the number of clusters. I created a KMeans classifier with 8 clusters and 10000 iterations. Because I now knew which emails were assigned to each cluster, I was able to extract the top terms per cluster.

In [None]:
# Tf-idf document-term matrix over unigrams and bigrams. Terms must appear in at
# least 2 documents (min_df) and in at most half of them (max_df).
vect = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.50,
    stop_words='english',
)
X = vect.fit_transform(data_rel["ExtractedBodyText"])

In [None]:
# Project the tf-idf matrix onto its first three principal components.
# toarray() returns a plain ndarray; todense() returns an np.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
X_dense = X.toarray()
pca = PCA(n_components=3)
coords = pca.fit_transform(X_dense)


In [None]:
# 3-D scatter of the documents in the space of the three PCA components.
pc1, pc2, pc3 = coords[:, 0], coords[:, 1], coords[:, 2]
marker_style = {'size': 12, 'opacity': 0.8}
trace1 = go.Scatter3d(x=pc1, y=pc2, z=pc3, mode='markers', marker=marker_style)

data = [trace1]
# No margins: let the plot use the whole output area.
layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

The plot looks rather fancy; however, the scree plot of the PCA shows that only about 1% of the variance is explained by a PCA with only 3 components:

In [None]:
#Explained variance
# Cumulative share of variance captured as PCA components are added.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
def top_tfidf_feats(row, features, top_n=20):
    """Return the `top_n` highest-scoring entries of `row` as a DataFrame.

    `row` is a 1-D sequence of scores and `features[i]` is the name belonging
    to `row[i]`. The result has the columns 'features' and 'score', ordered
    from the highest score downwards.
    """
    descending_ids = np.argsort(row)[::-1]
    best_pairs = []
    for idx in descending_ids[:top_n]:
        best_pairs.append((features[idx], row[idx]))
    return pd.DataFrame(best_pairs, columns=['features', 'score'])
def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the `top_n` highest tf-idf features of the document in row `row_id`.

    The row is taken from `X`, converted with `.toarray()`, squeezed and handed
    to `top_tfidf_feats`.
    """
    dense_row = X[row_id].toarray()
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)

In [None]:
# Term names in column order of X.
# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only get_feature_names(). Use whichever the installed version offers.
if hasattr(vect, 'get_feature_names_out'):
    features = vect.get_feature_names_out()
else:
    features = vect.get_feature_names()
# Ten highest tf-idf features of the document in row 3.
top_feats_in_doc(X, features, 3, 10)

In [None]:
def top_mean_feats(X, features,
 grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the `top_n` features ranked by mean tf-idf score.

    Parameters
    ----------
    X : matrix supporting row indexing and `.toarray()` (the tf-idf matrix).
    features : sequence mapping a column index of `X` to its term.
    grp_ids : row selector for `X`, or None (default) to use every row.
    min_tfidf : scores below this value are treated as 0 before averaging.
    top_n : number of features to return.

    Returns
    -------
    DataFrame with the columns 'features' and 'score' (see `top_tfidf_feats`).
    """
    # `is not None` instead of truthiness: a NumPy index array has no
    # unambiguous truth value.
    if grp_ids is not None:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()
    # Apply the threshold for both branches. It used to sit inside the `else`
    # branch, so `min_tfidf` was silently ignored whenever `grp_ids` was given.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:
# No grp_ids are passed, so the mean tf-idf scores are taken over all rows of X; show the top 30 features.
top_mean_feats(X, features, top_n=30)

In [None]:
# Cluster the tf-idf vectors with KMeans and colour the 3-D PCA scatter by cluster.
n_clusters = 8
# random_state pins the single initialisation (n_init=1), so the cluster
# assignment is the same on every Restart & Run All.
clf = KMeans(n_clusters=n_clusters, max_iter=10000, init='k-means++', n_init=1, random_state=0)
labels = clf.fit_predict(X)

# toarray() returns a plain ndarray; todense() returns an np.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
X_dense = X.toarray()
coords = PCA(n_components=3).fit_transform(X_dense)

trace1 = go.Scatter3d(
    x=coords[:, 0],
    y=coords[:, 1],
    z=coords[:, 2],
    mode='markers',
    marker=dict(
        size=12,
        opacity=0.8,
        color=labels  # one colour per KMeans cluster id
    )
)

data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    """Return one top-features DataFrame per distinct value in `y`.

    For every unique label in `y`, the rows of `X` carrying that label are
    passed to `top_mean_feats`; the label is stored on the resulting frame as
    its `label` attribute. The frames are returned in the order of
    `np.unique(y)`.
    """
    per_cluster = []
    for cluster_id in np.unique(y):
        member_rows = np.where(y == cluster_id)
        cluster_df = top_mean_feats(X, features, member_rows, min_tfidf=min_tfidf, top_n=top_n)
        cluster_df.label = cluster_id
        per_cluster.append(cluster_df)
    return per_cluster

In [None]:
# Five strongest features for each KMeans cluster.
cluster_top_feats = top_feats_per_cluster(X, labels, features, min_tfidf=0.1, top_n=5)
print(cluster_top_feats)


## **LSA**

Latent semantic analysis (LSA) is a technique in natural language processing, in particular distributional semantics, of analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms. LSA assumes that words that are close in meaning will occur in similar pieces of text (the distributional hypothesis). A matrix containing word counts per paragraph (rows represent unique words and columns represent each paragraph) is constructed from a large piece of text and a mathematical technique called singular value decomposition (SVD) is used to reduce the number of rows while preserving the similarity structure among columns. Words are then compared by taking the cosine of the angle between the two vectors (or the dot product between the normalizations of the two vectors) formed by any two rows. Values close to 1 represent very similar words while values close to 0 represent very dissimilar words.

Using LSA, I grouped the mails into topic clusters (`n_topics = 50` in the code below), each described by its most frequent terms. Further analysis of the individual conceptual clusters might be interesting.

In [None]:
#Preprocessing
# Raw term counts (English stop words removed, vocabulary capped at 40000 terms)
# as input for LSA, plus the number of LSA topics to extract.
n_topics = 50
small_count_vectorizer = CountVectorizer(
    max_features=40000,
    stop_words='english',
)
X2 = small_count_vectorizer.fit_transform(data_rel["ExtractedBodyText"])

In [None]:
# LSA = truncated SVD of the count matrix, one component per topic.
# random_state fixes the solver's random initialisation so the topics are
# reproducible across runs.
lsa_model = TruncatedSVD(n_components=n_topics, n_iter=350, random_state=0)

In [None]:
# Define helper functions
def get_keys(topic_matrix):
    '''Return, for every row of `topic_matrix`, the index of its largest entry
    (the predicted topic category of that document) as a list.'''
    n_rows = topic_matrix.shape[0]
    return [topic_matrix[row_id].argmax() for row_id in range(n_rows)]

def keys_to_counts(keys):
    '''Return a tuple (categories, counts): the distinct values in `keys`, in
    order of first appearance, and how often each of them occurs.'''
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))

In [None]:
# Fit the LSA model on the count matrix X2 and keep the transformed matrix it returns.
lsa_topic_matrix = lsa_model.fit_transform(X2)

In [None]:
# Dominant topic per document and the number of documents per topic.
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
# Show only a sample: the full list holds one entry per mail and floods the output.
print(lsa_keys[:20])

In [None]:
# Define helper functions
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer, num_topics=None):
    '''Return one string per topic holding that topic's n most frequent words,
    most frequent first and separated by single spaces.

    Parameters
    ----------
    n : number of words per topic.
    keys : predicted topic id per document (same order as the matrix rows).
    document_term_matrix : document x term count matrix (sparse or dense).
    count_vectorizer : fitted vectorizer; its `vocabulary_` mapping
        (term -> column index) is used to translate columns back to words.
    num_topics : number of topics to report; defaults to the module-level
        `n_topics` when omitted.

    A topic to which no document was assigned yields an empty string (the
    previous implementation crashed on such a topic), and words are returned
    as-is (the previous ASCII round-trip crashed on non-ASCII words).
    '''
    if num_topics is None:
        num_topics = n_topics
    # Column index -> word, built once instead of one inverse_transform per word.
    index_to_word = {idx: word for word, idx in count_vectorizer.vocabulary_.items()}
    keys_arr = np.asarray(keys)

    top_words = []
    for topic in range(num_topics):
        doc_ids = np.flatnonzero(keys_arr == topic)
        if doc_ids.size == 0:
            top_words.append("")
            continue
        # Total count of every term over the documents of this topic.
        term_totals = np.asarray(document_term_matrix[doc_ids].sum(axis=0)).ravel()
        # argsort is ascending: take the last n columns and flip them.
        top_n_word_indices = np.flip(np.argsort(term_totals)[-n:], 0)
        top_words.append(" ".join(index_to_word[index] for index in top_n_word_indices))
    return top_words

In [None]:
# Ten most frequent words of every LSA topic.
top_n_words_lsa = get_top_n_words(10, lsa_keys, X2, small_count_vectorizer)

for topic_id, topic_words in enumerate(top_n_words_lsa):
    print("Topic {}: ".format(topic_id), topic_words)

In [None]:
# Number of documents per dominant LSA topic.
top_3_words = get_top_n_words(3, lsa_keys, X2, small_count_vectorizer)
# Own name for the tick labels: reusing `labels` would overwrite the KMeans
# cluster labels that earlier cells depend on when they are re-run.
lsa_tick_labels = ['Topic {}: \n'.format(i) + top_3_words[i] for i in lsa_categories]

fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lsa_categories, lsa_counts)
ax.set_xticks(lsa_categories)
# The labels were computed but never applied to the axis before.
ax.set_xticklabels(lsa_tick_labels, rotation=90)
ax.set_ylabel('number of emails')
ax.set_title('LSA Topic Category Counts');

In [None]:
def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''Return a list with one centroid vector per topic.

    Parameters
    ----------
    keys : predicted topic id per document.
    two_dim_vectors : array with one vector per document (same order as keys).
    num_topics : number of topics; defaults to the module-level `n_topics`
        when omitted.

    The centroid of a topic is the mean of the vectors of its documents. A
    topic without any document gets an all-NaN centroid (the previous
    implementation raised on such a topic because np.vstack received an
    empty list).
    '''
    if num_topics is None:
        num_topics = n_topics
    keys_arr = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for t in range(num_topics):
        members = vectors[keys_arr == t]
        if len(members) == 0:
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
        else:
            mean_topic_vectors.append(np.mean(members, axis=0))
    return mean_topic_vectors

In [None]:
from sklearn.manifold import TSNE

# Embed the LSA topic matrix in two dimensions for plotting.
tsne_lsa_model = TSNE(n_components=2, perplexity=50, learning_rate=100,
                        n_iter=1500, verbose=1, random_state=0, angle=0.75)
tsne_lsa_vectors = tsne_lsa_model.fit_transform(lsa_topic_matrix)

# bokeh's `figure` is imported under an alias: a plain `figure` import would
# shadow matplotlib's `figure` from the first cell, and re-running the earlier
# bar-chart cells would then fail.
from bokeh.plotting import figure as bokeh_figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()


In [None]:
## create colormap
def rand_Hexcode(n):
    """Yield `n` random colours, each formatted as an upper-case '#RRGGBB' hex string."""
    def channel():
        # One random colour channel value in 0..255 (both ends included).
        return random.randint(0, 255)

    for _ in range(n):
        yield '#%02X%02X%02X' % (channel(), channel(), channel())
        
# Collect n_topics random hex colours from the generator into a NumPy array of unicode strings (dtype 'U25').
colormap = np.fromiter(rand_Hexcode(n_topics), dtype='U25', count=n_topics)

In [None]:
# t-SNE scatter of all documents, coloured by dominant LSA topic, with the
# three most frequent words of every topic written at the topic's centroid.
top_3_words_lsa = get_top_n_words(3, lsa_keys, X2, small_count_vectorizer)
lsa_mean_topic_vectors = get_mean_topic_vectors(lsa_keys, tsne_lsa_vectors)

# One marker per document; the colour is looked up by the document's topic id.
trace1 = go.Scattergl(
    x=np.array(tsne_lsa_vectors[:, 0]),
    y=np.array(tsne_lsa_vectors[:, 1]),
    mode='markers',
    marker=dict(
        size=5,
        color=colormap[lsa_keys]
    )
)

# One labelled marker per topic centroid. A single trace already carries all
# topics, so it is built once (it used to be rebuilt identically n_topics
# times inside a loop whose variable was never used).
centroids = np.array(lsa_mean_topic_vectors)
trace2 = go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers+text',
    text=np.array(top_3_words_lsa[0:n_topics]),
    textfont=dict(
        family='sans serif',
        size=15,
        color=colormap[0:n_topics]
    )
)

data = [trace1, trace2]

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# Topic counts per month.
# Build a new Series of the body texts indexed by send date, instead of
# overwriting the index of the column object taken from data_rel.
reindexed_data = pd.Series(
    data_rel["ExtractedBodyText"].values,
    index=pd.to_datetime(data_rel["Dates"].values),
)
data_range = pd.date_range('2009-07-01', '2010-04-01', freq='1M')

# Select the mails of every month through a month-period comparison.
# Formatting the month-end Timestamp into the indexer (as done before) yields
# a full 'YYYY-MM-DD HH:MM:SS' string and therefore matches that single day only.
mail_months = reindexed_data.index.to_period('M')
yearly_data = []
for month_end in data_range:
    in_month = mail_months == month_end.to_period('M')
    # .values replaces Series.as_matrix(), which no longer exists in current pandas.
    yearly_data.append(reindexed_data[in_month].values)

# Project every month's mails into the fitted LSA topic space.
yearly_topic_matrices = []
for monthly in yearly_data:
    if len(monthly) == 0:
        # No mail in this month: keep an empty matrix instead of calling the model with 0 samples.
        yearly_topic_matrices.append(np.empty((0, n_topics)))
        continue
    document_term_matrix = small_count_vectorizer.transform(monthly)
    yearly_topic_matrices.append(lsa_model.transform(document_term_matrix))

yearly_keys = [get_keys(topic_matrix) for topic_matrix in yearly_topic_matrices]

# Count documents per topic, aligned by topic id: column t always belongs to
# topic t and absent topics count 0. keys_to_counts returns the counts in
# order of first appearance, so using them positionally put them in the wrong columns.
yearly_counts = []
for keys in yearly_keys:
    categories, counts = keys_to_counts(keys)
    count_by_topic = dict(zip(categories, counts))
    yearly_counts.append([count_by_topic.get(t, 0) for t in range(n_topics)])

yearly_topic_counts = pd.DataFrame(yearly_counts, index=data_range)
yearly_topic_counts = yearly_topic_counts.add_prefix('Topic')
print(yearly_topic_counts)

In [None]:
# Heatmap of the monthly topic counts; every column is labelled with the topic
# id and that topic's three most frequent words.
# Each topic string is split once (the whole list used to be re-split for
# every label), and the labels get their own name instead of overwriting `labels`.
heatmap_labels = ['Topic {}: \n '.format(i) + ' '.join(top_n_words_lsa[i].split()[:3]) for i in range(n_topics)]

fig, ax = plt.subplots(figsize=(14,10))
sb.heatmap(yearly_topic_counts, xticklabels=heatmap_labels, ax=ax, annot=True, annot_kws={"size": 12});