## This model is to assign a TOPIC to a given Email
## Two steps:
### 1. unsupervised learning -Topic modeling to label each email with a topic ( use LDA).
### 2. supervised learning - Classification model trained on the labeled data from the step 1(Xgboost)

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import re

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [32]:
# import nltk to clean the emails
from nltk.corpus import stopwords
from nltk import pos_tag
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'gov','com'])
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sudaw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sudaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [4]:
path = '/Funnel Leasing/emails/emails/'
# path_demo = '/Funnel Leasing/emails/emails/00001'

In [5]:
import email

### take a look at an email

In [6]:
# with open(path_demo,'r') as f:
#     a = email.message_from_file(f)

In [7]:
# print(a)

In [8]:
# body = a.get_payload().replace('\n', '') 

In [9]:
# print(body)

In [10]:
# remove the signiture

# clean_body=re.sub(r'(\w*\s)?([B|b]est|[R|r]egards|Have a|[C|c]heers|[S|s]incerely|[T|t]ake care|Looking forward|Fond|Kind|Yours|----.*----)(\s*.*)',"", body)

In [11]:
# clean_body

## STEP 1. TOPIC MODELING

In [12]:
#load all emails and extract the email body
import os
corpus = []
for filename in os.listdir(path):
    with open(os.path.join(path, filename), 'r') as f:
        email_raw = email.message_from_file(f) # load the email
        body = email_raw.get_payload().replace('\n', '') # extract the email body
        body =re.sub(r'(\w*\s)?([B|b]est|[R|r]egards|\
                            Have a|[C|c]heers|[S|s]incerely|[T|t]ake care|\
                            Looking forward|Fond|Kind|Yours|----.*----)(\s*.*)',"", body) # remove the email signiture
        corpus.append(body)


In [13]:
corpus[1]

'Mark A. Cartwright (markc@emx.utexas.edu) wrote:: Well,: : 42 is 101010 binary, and who would forget that its the: answer to the Question of "Life, the Universe, and Everything else.": That is to quote Douglas Adams in a round about way.: : Of course the Question has not yet been discovered...^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^But it WAS discovered (sort of).  The question was "What is 7 times 8?"When Arthur Dent objected that this was, unfortunately, factuallyinaccurate, the effort to discover the question was begun all over.This last effort was, I believe, likely to take far longer thanthe lifespan of the universe, in fact several lifespans of same!'

In [40]:
# a function for Filter for only nouns
def noun_only(x):
    pos_comment = nltk.pos_tag(x)
    filtered =[word[0] for word in pos_comment if word[1] in ['NN']]
    return filtered

In [41]:
# function to clean the text and lemmatize the words
def stem_tokenizer(text):
    clean_corpus = []
    for sentence in text:
        lemmatizer = WordNetLemmatizer()       
        sentence = re.sub(r"[^A-Za-z]", " ", sentence)# remove everthing expect the words
        words = re.sub(r'(^| ).( |$)',' ',sentence) # remove single character word
        words = re.sub(r'(^| ).( |$)',' ',words).lower().split() # remove the remaining single charater
        words = [word for word in words if word not in stop_words] # remove stopwords
        words = [lemmatizer.lemmatize(word) for word in words] # lemmatize the words
        words = noun_only(words) # keep only the nouns
        clean_corpus.append(words)
    return clean_corpus

In [42]:
# clean and lemmatize the text
corpus_clean = stem_tokenizer(corpus)

In [44]:
# import Gensim for LDA
import gensim
import gensim.corpora as corpora


In [49]:
# Create Dictionary for LDA
id2word = corpora.Dictionary(corpus_clean)
id2word.filter_extremes(no_below=10, no_above=0.4)

In [50]:
#create a mapping for each work and its frequency in the docments.
mapping = [id2word.doc2bow(sentence) for sentence in corpus_clean]

In [53]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=mapping,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=2022,
                                           update_every=1,
                                           chunksize=100,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)
#lda_model.save('lda_model.model')

In [54]:
from pprint import pprint
pprint(lda_model.print_topics())

[(0,
  '0.012*"time" + 0.010*"thing" + 0.010*"way" + 0.008*"question" + 0.008*"day" '
  '+ 0.007*"car" + 0.006*"apr" + 0.006*"god" + 0.006*"world" + 0.006*"fact"'),
 (1,
  '0.021*"window" + 0.019*"ax" + 0.014*"problem" + 0.012*"file" + '
  '0.011*"drive" + 0.010*"card" + 0.010*"program" + 0.009*"anyone" + '
  '0.009*"work" + 0.009*"system"'),
 (2,
  '0.013*"system" + 0.013*"space" + 0.008*"number" + 0.008*"information" + '
  '0.008*"government" + 0.007*"chip" + 0.007*"phone" + 0.006*"mail" + '
  '0.006*"work" + 0.006*"list"'),
 (3,
  '0.018*"year" + 0.016*"game" + 0.014*"team" + 0.010*"player" + 0.007*"time" '
  '+ 0.007*"season" + 0.006*"apr" + 0.006*"city" + 0.005*"play" + 0.005*"jew"')]


## topic 0: "TIME"; topic 1: "COMPUTER"; topic 2: "INFORMATION"; topic 3: "SPORTS"

In [62]:
topics = {0: "TIME", 1: "COMPUTER", 2: "INFORMATION", 3: "SPORTS"}

In [55]:
import numpy as np
lda_topics = pd.DataFrame(lda_model.get_document_topics(mapping, minimum_probability=0.0))

In [77]:
corpus_topics = pd.DataFrame(corpus,columns = ['email'])

In [78]:
corpus_topics['topic'] = lda_topics.apply(lambda x: np.array([s[1] for s in x]).argmax(), axis=1)

In [95]:
print(corpus_topics.head())

                                               email  topic
0  Hi ... Recently I found XV for MS-DOS in a sub...      1
1  Mark A. Cartwright (markc@emx.utexas.edu) wrot...      0
2  > Problem 1> > My father told me the following...      3
3  I  have one of the original Powerbook 170's (w...      1
4          My family has never been particularly ...      0


In [80]:
corpus_topics.to_csv('corpus_topics.csv')

## STEP 2. CLASSIFICAITON MODEL

In [81]:
X = corpus_topics['email'] 
y = corpus_topics['topic']

In [82]:
def tokenizer(text):
    
    lemmatizer = WordNetLemmatizer()       
    sentence = re.sub(r"[^A-Za-z]", " ", text)# remove everthing expect the words
    words = re.sub(r'(^| ).( |$)',' ',sentence) # remove single character word
    words = re.sub(r'(^| ).( |$)',' ',words).lower().split() # remove the remaining single charater
    words = [word for word in words if word not in stop_words] # remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words] # lemmatize the words
    words = noun_only(words) # keep only the nouns
    return words

In [83]:
# Use tfidf to account for the significance of each word in the document
tfidf = TfidfVectorizer(stop_words=stop_words,
                        tokenizer=tokenizer,
                        lowercase=True,
                        max_df=0.5,
                        min_df=5
                       )

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)

In [85]:
from xgboost import XGBRegressor

model_xgb = Pipeline([('tfidf', tfidf),
                     ('xgb', XGBRegressor(objective='multi:softmax', n_estimators=500,num_class=4,
                        learning_rate = 0.1, max_depth=4))])
model_xgb.fit(X_train,y_train)




Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.5, min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function tokenizer at 0x00000222F4FA2D30...
                              interaction_constraints='', learning_rate=0.1,
                              max_delta_step=0, max_depth=4, min_child_weight=1,
                              missing=nan, monotone_c

In [87]:

print("training classification_report")
print(classification_report(y_train, model_xgb.predict(X_train)))
print('-----------------------------------------------------------------')
print("testing classification_report")
print(classification_report(y_test, model_xgb.predict(X_test)))

training classification_report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      3188
           1       0.98      0.95      0.96      1956
           2       0.97      0.92      0.94      1714
           3       0.98      0.90      0.94      1061

    accuracy                           0.95      7919
   macro avg       0.96      0.94      0.95      7919
weighted avg       0.95      0.95      0.95      7919

-----------------------------------------------------------------
testing classification_report
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1364
           1       0.86      0.84      0.85       787
           2       0.81      0.65      0.72       747
           3       0.89      0.69      0.78       496

    accuracy                           0.81      3394
   macro avg       0.83      0.77      0.79      3394
weighted avg       0.81      0.81      0.80      3394



In [88]:
import pickle



In [89]:
pickle.dump(model_xgb, open('model_xgb.pkl', 'wb'))