# SMART EMAIL CLASSIFIER

## Analyze Data
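The goal of this notebook is to analyze the preprocessed emails: group them with unsupervised models (LDA topic modelling, and K-means on TF-IDF features and on word embeddings) and visualize the resulting clusters.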

### 1) Import libraries

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import random
from gensim import corpora
from gensim.models.ldamulticore import LdaModel as Lda
import csv
from wordcloud import WordCloud
import pyLDAvis.gensim
import matplotlib.gridspec as gridspec
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
# from sklearn.manifold import TSNE
from gensim.models import Word2Vec 
from nltk.cluster import KMeansClusterer
import nltk
from sklearn import cluster



### 2) Functions to load the data

In [None]:
def reading_data(infile):
    """Read a tokenized corpus from a text file.

    Each line of ``infile`` is treated as one document. The trailing newline
    is removed and the line is split on single spaces, so consecutive spaces
    produce empty-string tokens and an empty line produces ``['']``.

    Args:
        infile: Path of the text file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(infile, 'r') as handle:
        return [line.rstrip('\n').split(' ') for line in handle]



### 3) Functions to run the clustering models

In [None]:
def LDA(emails,num_groups,n_sample,plot_LDA,area):
    """Train an LDA topic model on tokenized emails and save its artifacts.

    Files are written under ``../data/analysis/LDA/<area or 'all_areas'>/<num_groups>_groups/``:
    the pickled model, one ``group_<i>_with_freqs.csv`` per topic (the weighted
    terms as printed by the model) and ``all_groups.csv`` (one line of
    space-separated top words per topic). The Jaccard similarity of every pair
    of topics is printed afterwards.

    Args:
        emails: Iterable of documents, each a list of string tokens.
        num_groups: Number of topics to fit.
        n_sample: Number of emails; only used in the pickled model's file name.
        plot_LDA: When truthy, ``plotting_LDA`` is called on the trained model.
        area: Name of the business area, or ``None`` for the whole corpus.

    Returns:
        The trained LDA model.
    """
    # Creating term dictionary of corpus, where each unique term is assigned an index.
    dictionary = corpora.Dictionary(emails)
    # Filter terms which occurs in less than 4 articles & more than 40% of the articles
    # dictionary.filter_extremes(no_below=4, no_above=0.4)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(email) for email in emails]

    # Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
    print('    Running LDA')
    ldamodel = Lda(doc_term_matrix, num_topics=num_groups, id2word = dictionary, passes=50, iterations=500)

    # Create the whole output tree in one call. Unlike a bare ``except: pass``
    # around each mkdir, a genuine failure (e.g. permissions) surfaces here
    # instead of at the first write.
    if area is not None:
        anadir = '../data/analysis/LDA/%s'%area
    else:
        anadir = '../data/analysis/LDA/all_areas'
    groups_dir = anadir+'/%s_groups'%num_groups
    os.makedirs(groups_dir, exist_ok=True)

    # Saving the model
    with open(groups_dir+'/lda_model_%semails.pkl'%n_sample,'wb') as ldafile:
        pickle.dump(ldamodel,ldafile)

    # Save all the words and frequencies for each topic
    clusters = []
    with open(groups_dir+'/all_groups.csv', 'w', newline='') as g:
        for i,topic in enumerate(ldamodel.print_topics(num_topics=num_groups, num_words=100)):
            # topic[1] is the printed topic: weight*"word" terms joined by "+".
            weighted_terms = topic[1].split("+")
            with open(groups_dir+'/group_%s_with_freqs.csv'%i, 'w', newline='') as f:
                f.write(",".join(weighted_terms))
            words_nofreqs = [term.split("*")[1].replace(' ','').replace('"','') for term in weighted_terms]
            topic_line = " ".join(words_nofreqs)
            g.write(topic_line+'\n')
            # Keep the line in memory so the file need not be read back.
            clusters.append(topic_line)

    # Report how much the top words of every pair of topics overlap.
    for i in range(num_groups):
        for j in range(i+1, num_groups):
            print('       Similarity between group %s and %s'%(i+1,j+1),get_jaccard_sim(clusters[i], clusters[j]))

    if plot_LDA:
        print('    Plotting LDA results')
        plotting_LDA(ldamodel,num_groups, doc_term_matrix, dictionary, area)
    return ldamodel

def load_LDA_model(emails,num_groups,n_sample,plot_LDA,area):
    """Load a previously pickled LDA model and optionally plot it.

    The model is read from
    ``../data/analysis/LDA/<area or 'all_areas'>/<num_groups>_groups/lda_model_<n_sample>emails.pkl``.

    Args:
        emails: Iterable of documents (lists of tokens); only used to rebuild
            the dictionary and document-term matrix when plotting.
        num_groups: Number of topics of the stored model (part of the path).
        n_sample: Number of emails the model was trained on (part of the file name).
        plot_LDA: When truthy, ``plotting_LDA`` is called on the loaded model.
        area: Name of the business area, or ``None`` for the whole corpus.

    Returns:
        The unpickled LDA model.
    """
    if area is not None:
        ldafolder = '../data/analysis/LDA/%s'%area
    else:
        ldafolder = '../data/analysis/LDA/all_areas'
    model_path = ldafolder+'/%s_groups/lda_model_%semails.pkl'%(num_groups,n_sample)
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # were produced by this project.
    with open(model_path, 'rb') as ldafile:
        ldamodel = pickle.load(ldafile)
    if plot_LDA:
        dictionary = corpora.Dictionary(emails)
        # Filter terms which occurs in less than 4 articles & more than 40% of the articles
        # dictionary.filter_extremes(no_below=4, no_above=0.4)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrix = [dictionary.doc2bow(email) for email in emails]
        plotting_LDA(ldamodel,num_groups, doc_term_matrix, dictionary, area)
    # The pairwise topic-similarity report printed by LDA() is not repeated here.
    return ldamodel

def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity between the word sets of two texts.

    Both strings are split on whitespace and duplicates are ignored; the
    result is ``|A & B| / |A | B|``.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``. Two texts without any token are treated as
        identical and give ``1.0`` (instead of dividing by zero).
    """
    tokens_a = set(str1.split())
    tokens_b = set(str2.split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Both token sets are empty: define the similarity by convention.
        return 1.0
    return float(len(tokens_a & tokens_b)) / union_size

def Kmeans_TFIDF(emails_clean,n_clusters,plot_Kmeans,area):
    """Cluster emails with mini-batch K-means on their TF-IDF representation.

    Args:
        emails_clean: Iterable of documents, each a list of string tokens.
        n_clusters: Number of clusters to fit.
        plot_Kmeans: When truthy, plot the top TF-IDF terms of each cluster
            and, if ``area`` is given, a 2-D PCA scatter of the emails.
        area: Name of the business area, or ``None`` for the whole corpus.

    Returns:
        None. Results are only printed and plotted.
    """
    print('  Running K-means clustering using TF-IDF')
    # One space-joined string per email. Building the list directly avoids a
    # row-by-row DataFrame.append, which copies the frame on every iteration.
    messages = [' '.join(email) for email in emails_clean]
    vect = TfidfVectorizer(analyzer='word',lowercase = False, max_df=0.3, min_df=2)

    X = vect.fit_transform(messages)
    try:
        features = vect.get_feature_names_out()
    except AttributeError:
        # Older scikit-learn releases only provide get_feature_names().
        features = vect.get_feature_names()
    batch_size = 500
    clf = MiniBatchKMeans(n_clusters=n_clusters, init_size=1000, batch_size=batch_size, max_iter=100)
    # fit_predict fits the model itself, so a separate fit() beforehand is redundant.
    labels = clf.fit_predict(X)
    if plot_Kmeans:
        print('   Plotting results from K-means clustering')
        # Use this function's own cluster count, not a module-level global.
        plot_tfidf_classfeats_h(top_feats_per_cluster(X, labels, features, 0.1, 10),area,n_clusters)
        if area is not None:
            # NOTE: the PCA plot densifies the whole TF-IDF matrix (one point
            # per mail) and can exhaust memory, so it only runs per area.
            plot_Kmeans_PCA(X, clf, labels, features, area, n_clusters)
            
def Kmeans_word2vec(sentences,n_clusters,plot_Kmeans,area):
    """Cluster the vocabulary of a Word2Vec model trained on the emails.

    A Word2Vec model is trained on ``sentences``; its word vectors are then
    clustered twice: with NLTK's cosine-distance K-means (the word-to-cluster
    assignment is printed) and with scikit-learn's K-means (its labels feed
    the plot).

    Args:
        sentences: Iterable of documents, each a list of string tokens.
        n_clusters: Number of clusters to fit.
        plot_Kmeans: When truthy, plot a per-cluster summary bar chart.
        area: Name of the business area, or ``None`` for the whole corpus.

    Returns:
        None. Results are only printed and plotted.
    """
    print('  Running K-means clustering using word embeddings')
    model = Word2Vec(sentences, min_count=1)

    # get vector data
    wv = model.wv
    try:
        words = list(wv.key_to_index)   # gensim >= 4
    except AttributeError:
        words = list(wv.vocab)          # gensim < 4
    X = wv[words]                       # one row per vocabulary word

    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    for word, assigned in zip(words, assigned_clusters):
        print (word + ":" + str(assigned))
    kmeans = cluster.KMeans(n_clusters=n_clusters)
    kmeans.fit(X)
    labels = kmeans.labels_
    if plot_Kmeans:
        print('   Plotting results from K-means clustering')
        # Local import: only this plotting path needs a sparse wrapper.
        from scipy.sparse import csr_matrix
        # The columns of X are embedding dimensions, not vocabulary terms, so
        # the chart ranks dimensions per cluster. The summary helpers call
        # .toarray(), hence the sparse wrapper around the dense matrix.
        features = ['dim_%d'%d for d in range(X.shape[1])]
        plot_tfidf_classfeats_h(top_feats_per_cluster(csr_matrix(X), labels, features, 0.1, 10),area,n_clusters)
        if area is not None:
            # The PCA scatter (plot_Kmeans_PCA) stays disabled here: too much
            # memory to plot all the points.
            print('    Finished plotting')
    


### 4) Functions to visualize the output clusters

In [None]:
def plotting_LDA(ldamodel,num_groups, doc_term_matrix, dictionary,area):
    """Save word clouds and an interactive pyLDAvis page for an LDA model.

    One word cloud per topic (built from its 100 top terms) is drawn on a
    three-column grid and saved as ``wordclouds_<num_groups>groups.pdf``; the
    pyLDAvis visualization is saved as ``lda_<num_groups>groups.html``. Both
    go to ``../figures/<area>`` or to ``../figures`` when ``area`` is ``None``.

    Args:
        ldamodel: Trained LDA model.
        num_groups: Number of topics to draw.
        doc_term_matrix: Bag-of-words corpus passed to pyLDAvis.
        dictionary: Dictionary passed to pyLDAvis.
        area: Name of the business area, or ``None`` for the whole corpus.
    """
    # Grid layout: three word clouds per row, one panel per topic.
    nh = 3
    nv = math.ceil(num_groups/3)
    height = 1+3*nv
    fig = plt.figure(figsize=(18.,height))
    gs = gridspec.GridSpec(nv, nh)
    for t in range(num_groups):
        grid_row, grid_col = divmod(t, nh)
        ax = fig.add_subplot(gs[grid_row, grid_col])
        ax.imshow(WordCloud().fit_words(dict(ldamodel.show_topic(t, 100))))
        ax.axis("off")
        ax.set_title("Topic %s"%t)

    if area is not None:
        figdir = '../figures/%s'%area
    else:
        figdir = '../figures'
    # Creates missing parents too and raises on real errors, instead of
    # silently ignoring every failure.
    os.makedirs(figdir, exist_ok=True)

    plt.savefig(figdir+'/wordclouds_%sgroups.pdf'%num_groups, bbox_inches='tight',dpi=72)
    data = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    pyLDAvis.enable_notebook()
    pyLDAvis.save_html(data, figdir+'/lda_%sgroups.html'%num_groups)



def plot_tfidf_classfeats_h(dfs,area,num_groups):
    """Draw one horizontal bar chart of top features per cluster.

    The figure is saved as ``tfidf_<num_groups>groups.pdf`` in
    ``../figures/<area>`` (or ``../figures`` when ``area`` is ``None``) and
    then shown.

    Args:
        dfs: Non-empty list of DataFrames, one per cluster, each with a
            ``features`` and a ``score`` column. The bar positions are derived
            from the length of the first frame.
        area: Name of the business area, or ``None`` for the whole corpus.
        num_groups: Number of clusters; only used in the output file name.
    """
    if area is not None:
        figdir = '../figures/%s'%area
    else:
        figdir = '../figures'
    # Creates missing parents too and raises on real errors, instead of
    # silently ignoring every failure.
    os.makedirs(figdir, exist_ok=True)

    fig = plt.figure(figsize=(15, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("cluster = %s"%(i+1), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.score, align='center', color='#7530FF')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        ax.set_yticklabels(df.features)
    # The layout is figure-wide, so adjust it once after all panels exist.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.savefig(figdir+'/tfidf_%sgroups.pdf'%num_groups, bbox_inches='tight',dpi=72)
    plt.show()
    
def plot_Kmeans_PCA(X, clf, labels, features, area, num_groups):
    """Plot the clustered documents and cluster centres on two PCA axes.

    The figure is saved as ``../figures/<area>/kmeans_tfidf_<num_groups>groups.pdf``
    and then shown.

    Args:
        X: Sparse document-feature matrix (it is converted to a dense array,
            which can need a lot of memory).
        clf: Fitted clustering model exposing ``cluster_centers_``.
        labels: Integer cluster label of each row of ``X``.
        features: Unused; kept for interface compatibility.
        area: Name of the business area (used in the output path).
        num_groups: Number of clusters; only used in the output file name.
    """
    # toarray() gives a plain ndarray; todense() would return np.matrix.
    X_dense = X.toarray()
    pca = PCA(n_components=2).fit(X_dense)
    coords = pca.transform(X_dense)
    label_colors = ["#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC",
                "#D2CA0D", "#522A64", "#A3DB05", "#FC6514",'#FF3030']
    n_colors = len(label_colors)
    # Cycle through the palette so more than nine clusters do not raise IndexError.
    colors = [label_colors[i % n_colors] for i in labels]

    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    # Plot the cluster centers
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    # One colour per centroid: passing the full palette only matches when
    # there are exactly nine clusters.
    centroid_colors = [label_colors[k % n_colors] for k in range(len(centroids))]
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker='X', s=200, linewidths=2, c=centroid_colors,edgecolors='black')
    figdir = '../figures/%s'%area
    os.makedirs(figdir, exist_ok=True)
    plt.savefig(figdir+'/kmeans_tfidf_%sgroups.pdf'%num_groups, bbox_inches='tight',dpi=72)
    plt.show()

def top_tfidf_feats(row, features, top_n=20):
    """Return the ``top_n`` highest-scoring features of one score vector.

    Args:
        row: 1-D sequence of scores, aligned with ``features``.
        features: Sequence of feature names.
        top_n: Maximum number of features to keep.

    Returns:
        A DataFrame with columns ``features`` and ``score``, sorted by
        decreasing score.
    """
    # Indices sorted by ascending score, then reversed to get best-first.
    best_first = np.argsort(row)[::-1]
    ranked = []
    for idx in best_first[:top_n]:
        ranked.append((features[idx], row[idx]))
    return pd.DataFrame(ranked, columns=['features', 'score'])

def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the top features of a single document.

    Args:
        X: Sparse document-feature matrix.
        features: Sequence of feature names, aligned with the columns of ``X``.
        row_id: Index of the document (row of ``X``).
        top_n: Maximum number of features to keep.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for that row.
    """
    dense_row = X[row_id].toarray()
    # Drop the singleton axes so the scores form a flat vector.
    scores = np.squeeze(dense_row)
    return top_tfidf_feats(scores, features, top_n)

def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean score over a group of rows.

    Scores below ``min_tfidf`` are set to zero before averaging.

    Args:
        X: Document-feature matrix, sparse (anything with ``toarray``) or
            dense array-like.
        features: Sequence of feature names, aligned with the columns of ``X``.
        grp_ids: Row selector applied as ``X[grp_ids]``. ``None`` or an empty
            list/tuple selects all rows.
        min_tfidf: Scores strictly below this value are zeroed.
        top_n: Maximum number of features to keep.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for the column means.
    """
    # An explicit test instead of ``if grp_ids:``, whose truth value raises
    # for a NumPy index array. Empty list/tuple still means "all rows".
    use_all_rows = grp_ids is None or (
        isinstance(grp_ids, (list, tuple)) and len(grp_ids) == 0)
    selected = X if use_all_rows else X[grp_ids]

    if hasattr(selected, 'toarray'):
        D = selected.toarray()
    else:
        # Dense input: copy so the thresholding below never mutates the caller's data.
        D = np.array(selected)

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
        
def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    """Compute the top mean features of every cluster.

    Args:
        X: Document-feature matrix.
        y: Cluster label of each row of ``X``.
        features: Sequence of feature names, aligned with the columns of ``X``.
        min_tfidf: Threshold forwarded to ``top_mean_feats``.
        top_n: Maximum number of features per cluster.

    Returns:
        A list with one DataFrame per distinct label, in the order returned
        by ``np.unique``; each frame has its cluster label stored in the
        ``label`` attribute.
    """
    per_cluster = []
    for cluster_label in np.unique(y):
        member_rows = np.where(y==cluster_label)
        summary = top_mean_feats(X, features, member_rows, min_tfidf=min_tfidf, top_n=top_n)
        # Stored as a plain attribute on the frame, not as a column.
        summary.label = cluster_label
        per_cluster.append(summary)
    return per_cluster

### 5) Main function

In [None]:
if __name__ == '__main__':

    # Run configuration.
    by_area = False          # True: one run per business area; False: one run on the full corpus

    run_LDA = False          # train and save a new LDA model
    load_LDA = False         # load a previously saved LDA model
    plot_LDA = False         # forwarded to LDA() / load_LDA_model()

    run_TFIDF = False        # K-means on TF-IDF features
    run_embeddings = True    # K-means on word embeddings
    plot_Kmeans = True       # forwarded to both K-means functions

    num_groups = 9    # number of clusters for the unsupervised clustering

    # A single None entry stands for "all areas together".
    if by_area:
        areas = ['Trade','Legal','Risk','Finance','Business','Government','Energy','Admin','OperatingOfficer','HR','Logistics']
    else:
        areas = [None]

    for area in areas:
        if by_area:
            print('\nAnalizing area %s'%area)
            infile = "../data/preprocessed/preprocessed_%s_pos.csv"%area
        else:
            infile = "../data/preprocessed/preprocessed_pos.csv"
        emails_clean = reading_data(infile)
        n_sample = len(emails_clean)

        # Each stage is independent and runs only when its flag is set.
        if run_LDA:
            ldamodel = LDA(emails_clean,num_groups,n_sample,plot_LDA,area)
        if load_LDA:
            ldamodel = load_LDA_model(emails_clean,num_groups,n_sample,plot_LDA,area)
        if run_TFIDF:
            Kmeans_TFIDF(emails_clean,num_groups,plot_Kmeans,area)
        if run_embeddings:
            Kmeans_word2vec(emails_clean,num_groups,plot_Kmeans,area)

            