<a href="https://colab.research.google.com/github/shreyas-muralidhara/Enron-Email-Priority-Sorting-Response-Recommendation/blob/main/Response_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Response Recommender System for Enron Email Dataset

CSC791 - Natural Language Processing Spring 2020

Authors - Shreyas Chikkballapur Muralidhara - schikkb  
          Sharath Narayana - snaraya9

In [None]:
# Mount Google Drive inside the Colab runtime so the shared dataset is reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import xlrd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import re
import gensim

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC,LinearSVC

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#### Load the dataset for the MANN-K profile into a dataframe

In [None]:
# Read the labelled email export; the first row of the CSV holds the column names.
email_df = pd.read_csv(
    '/content/drive/Shared drives/NLP Project/test.csv',
    header=0,
    index_col=None,
)

# Strip the commas out of the date strings.
email_df['Date_list'] = email_df['Date_list'].str.replace(',', '')

print(email_df.shape)
label_counts = email_df['Label'].value_counts()
print("The Label Distribution for the Email dataset:\n", label_counts)

(9571, 7)
The Label Distribution for the Email dataset:
 thread    4957
reply     4435
delete     179
Name: Label, dtype: int64


### Preprocessing
  * Removing punctuation  
  * Stemming using the Porter stemmer

In [None]:
# Perform text preprocessing - strip punctuation, then stem with the Porter stemmer.

# One stemmer instance is reused for every word instead of building a new one per token.
stemmer = PorterStemmer()


def _stem_text(text):
    """Return `text` with punctuation removed and every token Porter-stemmed.

    Parameters
    ----------
    text : str or missing value
        Raw field value taken from the email dataframe.

    Returns
    -------
    str
        Space-joined stemmed tokens, or a single-space placeholder when the
        value is missing, so the resulting column always holds strings.
    """
    if pd.isnull(text):
        return ' '
    # Collapse every run of non-alphanumeric characters into a single space.
    cleaned = re.sub(r'[^0-9A-Za-z]+', ' ', str(text))
    return ' '.join(stemmer.stem(word) for word in word_tokenize(cleaned))


# The previous version attached the missing-value `else` to the inner `for`
# loop (a for/else), so every non-null Title received an extra ' ' token and a
# null Title produced an empty string. Both columns now share one code path.
# NOTE(review): 'Title' feeds processed_subject and 'subject_list' feeds
# processed_content, exactly as before - confirm that 'subject_list' really
# holds the email body.
preprocessed_Subject = [_stem_text(title) for title in email_df['Title']]
preprocessed_content = [_stem_text(body) for body in email_df['subject_list']]

email_df['processed_subject'] = preprocessed_Subject
email_df['processed_content'] = preprocessed_content
print(email_df.shape)
print(email_df.columns)

(9571, 9)
Index(['To_list', 'From_list', 'Date_list', 'subject_list', 'Title', 'Label',
       'file', 'processed_subject', 'processed_content'],
      dtype='object')


#### POS tagging
  * Generate POS tags for the **email Subject(preprocessed)** and generate sentence with tags in "word/POStag" format.
  * Generate POS tags for the **email Content(preprocessed)**  and generate sentence with tags in "word/POStag" format.

In [None]:
# Method to compute the POS tags for an input column

def generatePOS(sentence_list):
    """Annotate every sentence with part-of-speech tags.

    Each sentence is tokenized and tagged with NLTK, then rebuilt as a single
    string of space-separated "word/POStag" items.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to tag.

    Returns
    -------
    pandas.DataFrame
        One row per input sentence holding the tagged string.
    """
    def _tag_sentence(sentence):
        # nltk.pos_tag yields (word, tag) pairs; glue each pair with "/".
        tagged_pairs = nltk.pos_tag(nltk.word_tokenize(sentence))
        return ' '.join("/".join((token, tag)) for token, tag in tagged_pairs)

    return pd.DataFrame([_tag_sentence(sentence) for sentence in sentence_list])

  # Store the POS tagged Subject and Content into the dataframe

# Add one POS-tagged column per preprocessed text column.
for pos_column, source_column in (
    ('SubjectPOS', 'processed_subject'),
    ('ContentPOS', 'processed_content'),
):
    email_df[pos_column] = generatePOS(email_df[source_column])

print(email_df.shape)
print(email_df.columns)

(9571, 11)
Index(['To_list', 'From_list', 'Date_list', 'subject_list', 'Title', 'Label',
       'file', 'processed_subject', 'processed_content', 'SubjectPOS',
       'ContentPOS'],
      dtype='object')


#### Model 1 - Weighted Doc2Vec Sentence embedding
  1. Generate sentence vectors for **POS tagged Subject** using Doc2Vec - vector size 200
  2. Generate sentence vectors for **POS tagged Content** using Doc2Vec - vector size 200
  3. Generate sentence vectors for **From** using Doc2Vec - vector size 50
  4. Generate sentence vectors for **To** using Doc2Vec - vector size 50
  5. Generate sentence vectors for **Date** using Doc2Vec - vector size 50
  6. Concatenate the vectors

In [None]:
def _tokenize_field(value):
    """Split one raw field value into whitespace-separated tokens ([] if missing)."""
    if pd.isnull(value):
        return []
    return str(value).split()


def _doc2vec_field_vectors(documents, vector_size, epochs=50):
    """Train a Doc2Vec model on one email field and infer a vector per document."""
    # gensim expects `words` to be a list of tokens; passing the raw string
    # would make it treat every character as a word.
    tagged_docs = [
        TaggedDocument(_tokenize_field(doc), [i]) for i, doc in enumerate(documents)
    ]

    d2vmodel = Doc2Vec(min_count=1, vector_size=vector_size, epochs=epochs)
    d2vmodel.build_vocab(tagged_docs)
    # build_vocab only initialises the weights; the model has to be trained
    # before the inferred vectors carry any information.
    d2vmodel.train(tagged_docs, total_examples=d2vmodel.corpus_count, epochs=d2vmodel.epochs)

    return np.array([d2vmodel.infer_vector(doc.words) for doc in tagged_docs])


def Doc2Vec_Emailvectors(To_list,From_list, email_date, Subject_list, Content_list):
    """Embed every email field with its own Doc2Vec model and concatenate the results.

    Parameters
    ----------
    To_list, From_list, email_date : iterable of str
        Recipient, sender and date fields; each is embedded into 50 dimensions.
    Subject_list, Content_list : iterable of str
        POS-tagged subject and content; each is embedded into 200 dimensions.

    Returns
    -------
    pandas.DataFrame
        One row per email with 550 columns, ordered as
        subject (200), content (200), from (50), to (50), date (50).
        The shape of the combined matrix is printed before returning.
    """
    # (documents, vector size) listed in the column order of the result.
    field_specs = [
        (Subject_list, 200),
        (Content_list, 200),
        (From_list, 50),
        (To_list, 50),
        (email_date, 50),
    ]

    field_vectors = [
        _doc2vec_field_vectors(documents, vector_size)
        for documents, vector_size in field_specs
    ]

    df_EmailContent_vectors = np.concatenate(field_vectors, axis=1)
    print(df_EmailContent_vectors.shape)

    return pd.DataFrame(df_EmailContent_vectors)

# Build the concatenated Doc2Vec feature matrix for every email in the dataset.
df_train_emailCombined_vectors = Doc2Vec_Emailvectors(
    To_list=email_df['To_list'],
    From_list=email_df['From_list'],
    email_date=email_df['Date_list'],
    Subject_list=email_df['SubjectPOS'],
    Content_list=email_df['ContentPOS'],
)

# Stratified 75/25 split into training and validation sets.
email_labels = email_df['Label']
X_train, X_test, Y_train, Y_test = train_test_split(
    df_train_emailCombined_vectors,
    email_labels,
    test_size=0.25,
    shuffle=True,
    stratify=email_labels,
    random_state=0,
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

train_label_counts = Y_train.value_counts()
print("The Label Distribution for the Training dataset:\n", train_label_counts)

(9571, 550)
(7178, 550) (2393, 550) (7178,) (2393,)
The Label Distribution for the Training dataset:
 thread    3718
reply     3326
delete     134
Name: Label, dtype: int64


### Model 1 - Train and Test the Doc2Vec generated vectors using SVM
  Performance against the ground-truth labels is measured using   
  * Accuracy  
  * F1 Score  
  * Classification Metrics  



In [None]:
# Train the SVM on the Doc2Vec features.
clf_doc2vec = SVC(C=1, max_iter=10000)
clf_doc2vec.fit(X_train, Y_train)

# Predict the labels of the held-out emails.
Y_test_pred = clf_doc2vec.predict(X_test)

# Report accuracy, weighted F1 and the per-class breakdown.
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_f1 = f1_score(Y_test, Y_test_pred, labels=None, pos_label=1, average='weighted')
test_report = classification_report(Y_test, Y_test_pred)

print('Classification Model 1 - SVM - Doc2Vec TEST metrics:\nAccuracy -', round(test_accuracy, 4))
print('f1 score -', round(test_f1, 4))
print('Classification Report:\n', test_report)

Classification Model 1 - SVM - Doc2Vec TEST metrics:
Accuracy - 0.6849
f1 score - 0.6623
Classification Report:
               precision    recall  f1-score   support

      delete       1.00      0.04      0.09        45
       reply       0.61      0.96      0.74      1109
      thread       0.89      0.46      0.61      1239

    accuracy                           0.68      2393
   macro avg       0.83      0.49      0.48      2393
weighted avg       0.76      0.68      0.66      2393



### Model 2 - Tf-Idf (Term frequency - Inverse Document frequency)

#### Step 1 - Convert the collection of email corpus to matrix of token counts using Count Vectorizer

#### Step 2 - Transform the count matrix into normalized TF or TF-IDF form.

#### Step 3 - Train and test using the classifier for each sentence

In [None]:
# Merge every email field into one text document per email, skipping missing values.
# SubjectPOS and ContentPOS already hold plain strings, so they are used as-is
# (the former character-by-character ''.join over them reproduced the same string).
email_df['Merged_email'] = email_df[['To_list','From_list','Date_list','SubjectPOS','ContentPOS']].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)

# Split the sentences into training and validation by stratifying the samples.
X_train, X_test, Y_train, Y_test = train_test_split(email_df['Merged_email'], email_df['Label'], test_size=0.25, shuffle = True, stratify = email_df['Label'], random_state=0)
print("The Label Distribution for the Training dataset:\n", Y_train.value_counts())

# Step 1 - Generate the token matrix; the vocabulary is learned from the training split only.
# NOTE(review): with CountVectorizer's default token pattern and lowercasing,
# "word/TAG" items are presumably split into separate "word" and "tag" tokens -
# confirm this is the intended use of the POS tags.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Step 2 - Transform the count matrix to TF-IDF; IDF weights are fitted on the training split only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

The Label Distribution for the Training dataset:
 thread    3718
reply     3326
delete     134
Name: Label, dtype: int64


### Model 2 - Train and Test the TF-IDF generated frequency corpus using Naive Bayes
  Performance against the ground-truth labels is measured using   
  * Accuracy  
  * F1 Score  
  * Classification Metrics  

In [None]:
# Fit the Multinomial Naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

# Predicting the class labels for Test data
Y_test_pred = clf.predict(X_test_tfidf)

# The report label now names the classifier that is actually trained here.
# zero_division=0 scores a class that is never predicted as 0 explicitly,
# instead of relying on the default that emits an undefined-metric warning.
print('Classification Model 2 - Multinomial Naive Bayes Classifier Tf-Idf TEST metrics:\nAccuracy -',round(accuracy_score(Y_test,Y_test_pred),4))
print('f1 score -', round(f1_score(Y_test,Y_test_pred, average='weighted', zero_division=0),4))
print('Classification Report:\n',classification_report(Y_test, Y_test_pred, zero_division=0))

Classification Model 2 - K- Neighbors classifier Classifier Tf-Idf TEST metrics:
Accuracy - 0.5357
f1 score - 0.5148
Classification Report:
               precision    recall  f1-score   support

      delete       0.00      0.00      0.00        45
       reply       0.52      0.35      0.42      1109
      thread       0.54      0.72      0.62      1239

    accuracy                           0.54      2393
   macro avg       0.35      0.36      0.35      2393
weighted avg       0.52      0.54      0.51      2393



  _warn_prf(average, modifier, msg_start, len(result))
