In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import joblib
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the stored message table, keep only the id and text columns, and
# drop rows whose (mid, body) pair is an exact repeat. The index is renumbered
# from zero at the end. joblib.load unpickles its input, so only point it at
# trusted files.
msgs = (
    joblib.load("msgs.joblib")[["mid", "body"]]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [3]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components switched off.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "tagger", "ner"],
)
def cleanup(s):
    """Return the lemmas of ``s`` that survive filtering, joined by single spaces.

    ``s`` is lower-cased and passed through the module-level ``nlp`` pipeline.
    A token contributes its lemma only when the token is purely alphabetic and
    is not a stop word, and the lemma is longer than two characters and is not
    one of the hard-coded excluded terms below.
    """
    excluded = {"pmto", "mmbtu", "tw", "st", "isda", "pm", "cc", "hou", "na", "lon", "ferc", "amto", "ena", "eb"}
    kept = []
    for token in nlp(s.lower()):
        # Token-level filters: alphabetic only, no stop words.
        if not token.is_alpha or token.is_stop:
            continue
        lemma = token.lemma_
        # Lemma-level filters: minimum length and the exclusion list.
        if len(lemma) <= 2 or lemma in excluded:
            continue
        kept.append(lemma)
    return " ".join(kept)

In [4]:
# Replace every body with its cleaned form in one column-wise pass. This avoids
# the per-row `msgs.loc[i]["body"]` chained lookup, which materialises a whole
# row Series for each message before writing back with `.at`.
# NOTE: this overwrites `msgs["body"]`, so re-running the cell cleans and
# filters the already-processed frame again; re-run the load cell first.
msgs["body"] = msgs["body"].apply(cleanup)

# Character length of each cleaned body; bodies that cleaned down to the empty
# string are excluded before the quantiles are computed.
msgs_len = msgs["body"].apply(len)
msgs_len = msgs_len[msgs_len > 0]

# Keep only messages strictly between the 5th and 95th length percentiles
# (both bounds truncated to int), trimming very short and very long bodies.
min_len, max_len = int(msgs_len.quantile(0.05)), int(msgs_len.quantile(0.95))
in_range = (msgs_len > min_len) & (msgs_len < max_len)
msgs = msgs.loc[msgs_len[in_range].index]

In [5]:
# Document-frequency bounds handed to CountVectorizer as max_df / min_df.
mx_d, mn_d = 0.9, 0.01
# ngram_range is documented as a (min_n, max_n) tuple; scikit-learn releases
# that validate constructor parameters reject a list here, so pass a tuple.
vm = CountVectorizer(max_df=mx_d, min_df=mn_d, ngram_range=(1, 1))

# Learn the vocabulary from the cleaned bodies and build the document-term
# count matrix in one step.
d_vec = vm.fit_transform(msgs.body.values.tolist())


In [6]:
# 20-component LDA using the online learning method, with a fixed random_state
# so repeated runs on the same input start from the same seed.
lda = LatentDirichletAllocation(
    n_components=20,
    learning_method="online",
    evaluate_every=-1,
    random_state=0,
)
# Fit on the document-term matrix and keep the transformed output per document.
d_topic = lda.fit_transform(d_vec)

In [7]:
# Persist the fitted vectoriser and the fitted LDA model next to the notebook.
# The second call stays the cell's final expression so its return value is
# still what the cell displays.
joblib.dump(vm, "body_cv.joblib")
joblib.dump(lda, "body_lda.joblib")


['body_lda.joblib']

In [8]:
# Vocabulary terms of the fitted vectoriser, fetched once instead of on every
# loop iteration. get_feature_names() is absent from newer scikit-learn
# releases, so prefer get_feature_names_out() when it exists and fall back
# otherwise. dtype=str keeps the array a NumPy string array in both cases.
if hasattr(vm, "get_feature_names_out"):
    fnames = np.asarray(vm.get_feature_names_out(), dtype=str)
else:
    fnames = np.asarray(vm.get_feature_names(), dtype=str)

# For each LDA component, collect its 50 highest-weighted terms as
# (term, min-max normalised weight rounded to 4 dp, raw weight rounded to 1 dp).
topics = []
for i in range(lda.components_.shape[0]):
    weights = lda.components_[i, :]
    min_, max_ = weights.min(), weights.max()
    # Rescale this component's weights to [0, 1] so components are comparable.
    comp_norm = (weights - min_) / (max_ - min_)
    # Indices of the 50 largest weights, in descending order.
    com_idx = np.argsort(weights)[::-1][0:50]
    topics.append(list(zip(fnames[com_idx], np.round(comp_norm[com_idx], 4), np.round(weights[com_idx], 1))))

In [9]:
# get_feature_names() is absent from newer scikit-learn releases; use
# get_feature_names_out() when available. Coerce to a str-dtype array either
# way: an object-dtype array would be pickled inside the archive and then
# require allow_pickle=True to load.
if hasattr(vm, "get_feature_names_out"):
    feature_names = np.asarray(vm.get_feature_names_out(), dtype=str)
else:
    feature_names = np.asarray(vm.get_feature_names(), dtype=str)

# Bundle the vocabulary, per-message topic output, per-topic top terms and
# message ids into one compressed archive.
# NOTE(review): `topics` mixes strings and floats, so NumPy will presumably
# coerce it to a single string dtype on save — confirm downstream readers
# expect that.
np.savez_compressed(file="mid_topics.npz",
                    feature_names=feature_names,
                    email_topics=d_topic,
                    topics=topics,
                    mid=msgs.mid.values)