In [None]:
#Imports the libraries and read the data files

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import os, sys, email
import gensim
from gensim.models import Doc2Vec
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from string import punctuation
import timeit
from sklearn.cluster import KMeans
from sklearn import metrics
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Wall-clock start time; the last cell subtracts it to report total runtime.
start = timeit.default_timer()

# Load only the first 50,000 rows of the emails CSV (nrows caps the read).
emails_df = pd.read_csv('../input/emails.csv', nrows=50000)

In [None]:
## The code in this and the next cell is taken from the Explore Enron notebook by Zichen Wang
## Helper functions
def get_text_from_email(msg):
    '''Concatenate the payloads of every text/plain part of an email.

    msg: a parsed message object that supports walk().
    Returns a single string made of the payloads of all parts whose content
    type is 'text/plain', joined without a separator in walk() order. The
    result is '' when no such part exists.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Turn a comma-separated address string into a set of addresses.

    Returns a frozenset of the whitespace-stripped pieces of `line`, or None
    when `line` is falsy (None or an empty string).
    '''
    if not line:
        return None
    return frozenset(piece.strip() for piece in line.split(','))

In [None]:
#Data Preparation
# Parse every raw message string into an email message object, then drop the
# raw column so only the parsed fields remain in the frame.
messages = list(map(email.message_from_string, emails_df['message']))
emails_df.drop('message', axis=1, inplace=True)
# The header names of the first message define the columns that are extracted
# from every message; a message lacking a header yields None for that column.
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]
# Parse content from emails
emails_df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
emails_df['user'] = emails_df['file'].map(lambda x:x.split('/')[0])
del messages

# Subject + body is the text that gets vectorised later. A missing Subject
# (None) would turn the whole concatenation into NaN and break the
# string-based cleaning step, so treat it as an empty subject instead.
emails_df['Subjcontent'] = emails_df['Subject'].fillna('') + " " + emails_df['content']
emails_df.head(5)

The functions below perform the following steps:
1. Tokenization 
2. Remove URLs
3. Remove email address
4. Remove tags
5. Remove punctuation
6. Remove stop words
7. Apply Stemming

In [None]:
#Data cleaning
def email_cleaning(text, stopword_set=None):
    '''Lower-case an email, split it on whitespace and keep only plain words.

    text: the raw email text (subject + body) as a string.
    stopword_set: optional collection of words to discard. When omitted, the
        module-level `stop_words` is used, so existing one-argument calls
        behave as before.

    Returns the list of kept tokens in their original order. A token is kept
    only if it is purely alphanumeric, does not contain 'http', and is not a
    stop word.
    '''
    if stopword_set is None:
        stopword_set = stop_words
    # isalnum() already rejects any token containing '@', '<', '>' or other
    # punctuation, so e-mail addresses, tags and URLs with symbols need no
    # separate test; the 'http' test still drops bare tokens such as "https".
    return [
        token
        for token in text.lower().split()
        if 'http' not in token and token.isalnum() and token not in stopword_set
    ]

#Data Pre-processing
def preprocessing(text, stemmer=None):
    '''Strip digits from tokens, stem them and drop one-character leftovers.

    text: an iterable of string tokens (e.g. the output of email_cleaning).
    stemmer: optional object with a stem(word) method. When omitted, the
        module-level `p_stemmer` is used, so existing one-argument calls
        behave as before.

    Returns the list of stemmed tokens whose length is greater than 1.
    '''
    if stemmer is None:
        stemmer = p_stemmer
    # Replace every digit with a space and re-split, so "abc1def" becomes the
    # two tokens "abc" and "def" and purely numeric tokens vanish.
    digit_free_tokens = re.sub(r'\d', ' ', ' '.join(text)).split()
    stemmed_tokens = (stemmer.stem(token) for token in digit_free_tokens)
    # Drop stems of length 0 or 1 (single letters carry no signal here).
    return [stem for stem in stemmed_tokens if len(stem) > 1]

**Create a list of tagged emails.**

In [None]:
# Alias for gensim's TaggedDocument class (a token list plus a list of tags).
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument

# Module-level helpers read by email_cleaning() and preprocessing().
stop_words = set(stopwords.words('english'))
p_stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')  # created here but not used in this cell

all_content = []
texts = []  # not filled in this cell
j = 0  # count of non-empty emails so far; doubles as the next document tag
k = 0  # count of emails seen
for k, em in enumerate(emails_df.Subjcontent, start=1):
    # Data cleaning, then pre-processing
    clean_content = email_cleaning(em)
    processed_email = preprocessing(clean_content)

    # Emails that end up with no tokens get no tag and are left out.
    if not processed_email:
        continue
    all_content.append(LabeledSentence1(processed_email, [j]))
    j = len(all_content)

print("Number of emails processed: ", k)
print("Number of non-empty emails vectors: ", j)

**Print a sample processed email**

In [None]:
# Show one tagged email as a spot check; 278 is an arbitrary position in the
# list and raises IndexError if fewer than 279 non-empty emails were produced.
print(all_content[278])

**Create a model using Doc2Vec and train it**

In [None]:
# Build the Doc2Vec model from the tagged emails.
# NOTE(review): `size` is the keyword older gensim releases use for the vector
# dimensionality; newer releases call it `vector_size` - confirm against the
# installed version before upgrading.
d2v_model = Doc2Vec(
    all_content,
    size=2000,
    window=10,
    min_count=500,
    workers=7,
    dm=1,
    alpha=0.025,
    min_alpha=0.001,
)

In [None]:
# Run 10 further training epochs over the same tagged emails, starting from a
# small learning rate. The final learning rate must not be negative (a
# negative rate moves the vectors against the training objective), so the
# schedule ends at the model's configured min_alpha.
d2v_model.train(
    all_content,
    total_examples=d2v_model.corpus_count,
    epochs=10,
    start_alpha=0.002,
    end_alpha=d2v_model.min_alpha,
)

**Print the emails most similar to the email with tag id 1**

In [None]:
# Show the documents most similar to the document tagged 1
print (d2v_model.docvecs.most_similar(1))
#print(all_content[282])
#print(all_content[1])

**Apply K-means clustering on the model**

In [None]:
# Cluster the document vectors into 4 groups. A fixed random_state makes the
# centroid initialisation, and therefore the cluster ids, reproducible
# between runs and between repeated fits on the same data.
kmeans_model = KMeans(n_clusters=4, init='k-means++', max_iter=100, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so X is the model, not data.
X = kmeans_model.fit(d2v_model.docvecs.doctag_syn0)
# Cluster id of every document as plain Python ints; used to colour the plot.
labels=kmeans_model.labels_.tolist()

In [None]:
# Assign each document to a cluster of the ALREADY fitted model. Refitting
# here (fit_predict) would re-run the randomly initialised clustering and
# could renumber or move the clusters, so the centroids plotted later would
# no longer match the `labels` computed from the first fit.
l = kmeans_model.predict(d2v_model.docvecs.doctag_syn0)
# Fit a 2-component PCA on the document vectors and project them to 2-D; the
# fitted `pca` is kept so the centroids can be projected the same way.
pca = PCA(n_components=2).fit(d2v_model.docvecs.doctag_syn0)
datapoint = pca.transform(d2v_model.docvecs.doctag_syn0)

**Plot the clustering result**

In [None]:
# Open a new figure. The parentheses matter: a bare `plt.figure` only names
# the function and creates nothing.
plt.figure()
# One colour per cluster id; four entries to match n_clusters=4.
label1 = ["#FFFF00", "#008000", "#0000FF", "#800080"]
color = [label1[i] for i in labels]
plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

# Project the cluster centres with the same PCA and overlay them as large
# black triangles.
centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
plt.show()

In [None]:
# Total wall-clock time since `start` was taken in the first cell.
stop = timeit.default_timer()
execution_time = stop - start

print(execution_time) # elapsed time in seconds