In [0]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

**User Email Classification using Machine Learning Techniques**

In [0]:
#Importing Dependencies
import numpy as np
import pandas as pd
import re
import random
import email
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics 
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [0]:
# Load the e-mail CSV from the mounted Drive folder
emails_csv_path = '/content/drive/My Drive/ALDA_Project/emails.csv'
dataset = pd.read_csv(emails_csv_path)

In [0]:
# Show the first 10 rows of the loaded dataset as a quick sanity check
dataset.head(10)

In [0]:
# Preprocessing: keep only rows whose 'file' path contains the substring 'sent'
is_sent_path = dataset['file'].str.contains('sent')
dataset_sent_mails = dataset[is_sent_path]
print(dataset_sent_mails.head())

In [0]:
# Derive the sender from each file path (everything before the last '/' that is
# followed by 'sent'), drop the raw path column, and list the 15 senders with
# the most sent e-mails.
def _sender_from_path(path):
    return re.search("(.*)/.*sent", path).group(1)

dataset_sent_mails = (
    dataset_sent_mails
    .assign(sender=dataset_sent_mails["file"].map(_sender_from_path).values)
    .drop("file", axis=1)
)
x = dataset_sent_mails["sender"].value_counts().head(15)
print(x)

In [0]:
# Start a zeroed e-mail counter for every distinct sender
temp = dict.fromkeys(dataset_sent_mails['sender'], 0)

In [0]:
# Count sent e-mails per sender from scratch on every run. Incrementing the
# counters left in `temp` by another cell made each re-run of this cell add the
# counts again and inflate the bar heights.
temp = dataset_sent_mails['sender'].value_counts().to_dict()
# Sort (count, sender) pairs in descending order and keep the 15 largest.
new_list = sorted(zip(temp.values(), temp.keys()), reverse=True)[:15]
num, mailer = zip(*new_list)
mail = range(len(new_list))
plt.bar(mail, num, align='center', color='blue', alpha=0.8)
plt.xticks(mail, mailer, rotation='vertical')
# Label the figure so it can be read without the surrounding code.
plt.xlabel('Sender')
plt.ylabel('Number of sent e-mails')
plt.title('Top 15 senders by number of sent e-mails')
plt.show()


In [0]:
# Keep only the 15 senders with the most e-mails and give each an integer
# label starting at 1 (most frequent sender first).
users = dataset_sent_mails["sender"].value_counts().head(15).index.values
mapping = {user: label for label, user in enumerate(users, start=1)}
is_top_sender = dataset_sent_mails.sender.isin(users)
sent_user_dataset = dataset_sent_mails[is_top_sender]
print(sent_user_dataset.shape)

In [0]:
#Function for turning one raw e-mail string into a flat record
def email_preprocessing(email_message):
    """Parse a raw e-mail string into a dict of its headers plus the body text.

    Parameters
    ----------
    email_message : str
        The complete raw message (headers followed by body).

    Returns
    -------
    dict
        One entry per header name found in the message, plus a "content"
        entry holding the concatenated payloads of every text/plain part.
    """
    parsed = email.message_from_string(email_message)

    # Collect the payload of every text/plain part (walk() also visits the
    # root message, so single-part mails are covered).
    plain_parts = [
        part.get_payload()
        for part in parsed.walk()
        if part.get_content_type() == 'text/plain'
    ]

    fields = {header: parsed[header] for header in parsed.keys()}
    fields["content"] = ''.join(plain_parts)
    return fields

#Function for preprocessing of text data
# Cache for the English stop-word set. It is filled on the first call so the
# list is not reloaded and rebuilt for every e-mail.
_ENGLISH_STOPWORDS = None

def content_preprocessing(content):
    """Normalise free text into a space-separated string of lowercase words.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased and split on whitespace, and English stop words are removed.

    Parameters
    ----------
    content : str
        Raw text (e.g. subject plus body of an e-mail).

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing remains).
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    letters_only = re.sub("[^a-zA-Z]", " ", content)
    kept_words = [
        word for word in letters_only.lower().split()
        if word not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(kept_words)

In [0]:
# Parse every selected raw message into a record of headers + body text
parsed_emails = [email_preprocessing(raw) for raw in sent_user_dataset.message]
final_data = pd.DataFrame(parsed_emails)
final_data.head(5)

In [0]:
#We mainly need the content of the emails for classification. It can be scaled to include other features as well
# Join subject and body column-wise. A missing Subject (NaN) is treated as
# empty text instead of making the join fail on a float, and this avoids a
# slow row-by-row apply.
email_text = (
    final_data['Subject'].fillna('').astype(str)
    + ' '
    + final_data['content'].fillna('').astype(str)
)
final_data = pd.DataFrame({"content": [content_preprocessing(text) for text in email_text]})
# Translate sender -> integer label with Series.map so the label column is
# numeric. DataFrame.replace on an object column relies on implicit
# downcasting, which newer pandas releases deprecate.
final_data = final_data.assign(user_number=sent_user_dataset["sender"].map(mapping).values)
final_data.head()

In [0]:
#Splitting the data into training and testing
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# Turn the text into TF-IDF vectors: fit on the training text only, then
# apply the fitted vectorizer to the test text.
vect = TfidfVectorizer(use_idf=True, sublinear_tf=True)
X_train, X_test = vect.fit_transform(X_train), vect.transform(X_test)

In [0]:
#Performing feature selection and finding out the best model for our classification task
# A fitted logistic regression supplies the coefficients SelectFromModel uses
# to decide which features to keep.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)
classifier_str = ["LinearSVC", "SGDClassifier"]
classifier_main = [LinearSVC, SGDClassifier]
# Distinct loop names: reusing `classifier_str` as the loop variable would
# overwrite the list of names with its last element.
for classifier_name, classifier in zip(classifier_str, classifier_main):
    cv_accuracy = cross_val_score(classifier(), X_train_new, y_train, cv=3).mean()
    print("Classifier: " + str(classifier_name) + ",  Cross validation Accuracy: " + str(cv_accuracy))

In [0]:
#Mapping for preprocessing all users
# Every sender gets an integer label starting at 1, most frequent sender first.
users = dataset_sent_mails["sender"].value_counts().index.values
mapping = {user: label for label, user in enumerate(users, start=1)}
sent_user_dataset = dataset_sent_mails
parsed_emails = [email_preprocessing(raw) for raw in sent_user_dataset.message]
final_data = pd.DataFrame(parsed_emails)

In [0]:
# Join subject and body column-wise. A missing Subject (NaN) is treated as
# empty text instead of making the join fail on a float, and this avoids a
# slow row-by-row apply.
email_text = (
    final_data['Subject'].fillna('').astype(str)
    + ' '
    + final_data['content'].fillna('').astype(str)
)
final_data = pd.DataFrame({"content": [content_preprocessing(text) for text in email_text]})
# Translate sender -> integer label with Series.map so the label column is
# numeric. DataFrame.replace on an object column relies on implicit
# downcasting, which newer pandas releases deprecate.
final_data = final_data.assign(user_number=sent_user_dataset["sender"].map(mapping).values)
final_data.head()

In [0]:
# Compare classifiers on word 2- and 3-gram TF-IDF features.
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect = TfidfVectorizer(ngram_range = (2,3),sublinear_tf=True, use_idf=True)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
# A fitted logistic regression supplies the coefficients used for feature selection.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)
for classifier in [LinearSVC, SGDClassifier, RandomForestClassifier, MultinomialNB]:
    cv_accuracy = cross_val_score(classifier(), X_train_new, y_train, cv=3).mean()
    # Print the classifier name with its score, in the same format as the
    # earlier comparison cell, so each number can be attributed.
    print("Classifier: " + classifier.__name__ + ",  Cross validation Accuracy: " + str(cv_accuracy))

In [0]:
#Testing the best on test data
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
# A fitted logistic regression supplies the coefficients used for feature selection.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)
classifier = LinearSVC(C = 1, max_iter=500)
classifier.fit(X_train_new, y_train)
# Apply the same feature selection to the test matrix before predicting.
X_test_ = model.transform(X_test)
y_predicted = classifier.predict(X_test_)
print("Accuracy: ",metrics.accuracy_score(y_test, y_predicted))

In [0]:
# Test-set accuracy of an SGDClassifier trained on the selected features
classifier = SGDClassifier().fit(X_train_new, y_train)
X_test_ = model.transform(X_test)
y_predicted = classifier.predict(X_test_)
accuracy = metrics.accuracy_score(y_test, y_predicted)
print("Accuracy: ", accuracy)

In [0]:
# Test-set accuracy of a RandomForestClassifier trained on the selected features
classifier = RandomForestClassifier().fit(X_train_new, y_train)
X_test_ = model.transform(X_test)
y_predicted = classifier.predict(X_test_)
accuracy = metrics.accuracy_score(y_test, y_predicted)
print("Accuracy: ", accuracy)

In [0]:
# Test-set accuracy of a MultinomialNB model trained on the selected features
classifier = MultinomialNB().fit(X_train_new, y_train)
X_test_ = model.transform(X_test)
y_predicted = classifier.predict(X_test_)
accuracy = metrics.accuracy_score(y_test, y_predicted)
print("Accuracy: ", accuracy)

In [0]:
# Experiment: project X_train / X_test onto 120 TruncatedSVD components
# (dimensionality reduction) and train a LinearSVC on the reduced features.
tsvd = TruncatedSVD(n_components=120)
X_train_pca, X_test_pca = tsvd.fit_transform(X_train), tsvd.transform(X_test)

clf = LinearSVC().fit(X_train_pca, y_train)

svd_predictions = clf.predict(X_test_pca)
print("Accuracy: ", metrics.accuracy_score(y_test, svd_predictions))

In [0]:
# Compare classifiers on raw term counts (CountVectorizer) instead of TF-IDF.
count_vect = CountVectorizer()
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)
# A fitted logistic regression supplies the coefficients used for feature selection.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)
for classifier in [LinearSVC, SGDClassifier, RandomForestClassifier, MultinomialNB]:
    cv_accuracy = cross_val_score(classifier(), X_train_new, y_train, cv=3).mean()
    # Print the classifier name with its score, in the same format as the
    # earlier comparison cell, so each number can be attributed.
    print("Classifier: " + classifier.__name__ + ",  Cross validation Accuracy: " + str(cv_accuracy))

In [0]:
#Performing Grid Search
from sklearn.model_selection import GridSearchCV
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
# A fitted logistic regression supplies the coefficients used for feature selection.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)

# With LinearSVC's default squared-hinge loss, penalty='l1' is only supported
# in the primal formulation (dual=False). The grid is split in two so the l1
# candidates are fitted with dual=False instead of failing, while the l2
# candidates keep the estimator's default settings.
C_values = [0.1, 0.3, 0.5, 1]
max_iter_values = [500, 1000, 2000, 3000]
parameters = [
    {'C': C_values, 'max_iter': max_iter_values, 'penalty': ['l2']},
    {'C': C_values, 'max_iter': max_iter_values, 'penalty': ['l1'], 'dual': [False]},
]
clf = GridSearchCV(LinearSVC(), parameters, cv = 3)
clf.fit(X_train_new, y_train)

In [0]:
# Mean cross-validated score of the best parameter combination found by the grid search
clf.best_score_

In [0]:
# Show the grid-search results as a table, best-ranked candidates first,
# instead of dumping the raw dict of arrays.
pd.DataFrame(clf.cv_results_).sort_values("rank_test_score")