In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


**User Email Classification using Machine Learning Techniques**

In [1]:
#Importing Dependencies
import numpy as np
import pandas as pd
import re
import random
import email
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics 
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
#Reading the dataset
dataset = pd.read_csv('/content/drive/My Drive/ALDA_Project/emails.csv')

In [5]:
#Printing out the dataset 
dataset.head(10)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
5,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...
6,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...
7,allen-p/_sent_mail/1004.,Message-ID: <17189699.1075863688308.JavaMail.e...
8,allen-p/_sent_mail/101.,Message-ID: <20641191.1075855687472.JavaMail.e...
9,allen-p/_sent_mail/102.,Message-ID: <30795301.1075855687494.JavaMail.e...


In [6]:
#Preprocessig dataset
dataset_sent_mails = dataset[dataset['file'].str.contains('sent')]
print(dataset_sent_mails.head())

                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [7]:
#Selecting top 15 users who has most number of sent emails
dataset_sent_mails = dataset_sent_mails.assign(sender=dataset_sent_mails["file"].map(lambda x: re.search("(.*)/.*sent", x).group(1)).values)
dataset_sent_mails.drop("file", axis=1, inplace=True)
print(dataset_sent_mails["sender"].value_counts().head(15))


mann-k           8926
kaminski-v       8644
dasovich-j       5366
germany-c        5128
shackleton-s     4407
jones-t          4123
bass-e           3030
lenhart-m        2759
beck-s           2674
symes-k          2649
scott-s          2602
taylor-m         2409
love-p           2371
arnold-j         2353
perlingiere-d    2352
Name: sender, dtype: int64


In [8]:
users = dataset_sent_mails["sender"].value_counts().head(15).index.values
mapping = {}
for i, user in enumerate(users, start = 1):
  mapping[user] = i
sent_user_dataset = dataset_sent_mails[dataset_sent_mails.sender.isin(users)]
print(sent_user_dataset.shape)

(59793, 2)


In [0]:
#Function for preprocessing emails for creating a proper structured dataset 
def email_preprocessing(email_message):
    msg = email.message_from_string(email_message)
    
    email_content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            email_content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(email_content)
    # msg["content"] = ''.join(email_content)
    return result

#Function for preprocessing of text data
def content_preprocessing(content):
    content = re.sub("[^a-zA-Z]"," ", content)
    words = content.lower().split()
    stops = set(stopwords.words("english"))
    words = [w for w in words if not w in stops]

    return ' '.join(words)

In [10]:
final_data = pd.DataFrame(list(map(email_preprocessing, sent_user_dataset.message)))
final_data.head(5)

Unnamed: 0,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,Cc,Bcc
0,<33025919.1075857594206.JavaMail.evans@thyme>,"Wed, 13 Dec 2000 13:09:00 -0800 (PST)",john.arnold@enron.com,slafontaine@globalp.com,re:spreads,1.0,text/plain; charset=us-ascii,7bit,John Arnold,slafontaine@globalp.com @ ENRON,,,\John_Arnold_Dec2000\Notes Folders\'sent mail,Arnold-J,Jarnold.nsf,saw a lot of the bulls sell summer against len...,,
1,<19235579.1075857594400.JavaMail.evans@thyme>,"Mon, 11 Dec 2000 08:51:00 -0800 (PST)",john.arnold@enron.com,slafontaine@globalp.com,re:summer inverses,1.0,text/plain; charset=us-ascii,7bit,John Arnold,slafontaine@globalp.com @ ENRON,,,\John_Arnold_Dec2000\Notes Folders\'sent mail,Arnold-J,Jarnold.nsf,amazing how with cash futures at $1 and the ba...,,
2,<19835539.1075857596349.JavaMail.evans@thyme>,"Tue, 17 Oct 2000 11:56:00 -0700 (PDT)",john.arnold@enron.com,jennifer.fraser@enron.com,Re: congrats,1.0,text/plain; charset=us-ascii,7bit,John Arnold,Jennifer Fraser,,,\John_Arnold_Dec2000\Notes Folders\'sent mail,Arnold-J,Jarnold.nsf,We both thank you\n\n\n \n\t\n\t\n\tFrom: J...,,
3,<12626409.1075857596370.JavaMail.evans@thyme>,"Tue, 17 Oct 2000 10:36:00 -0700 (PDT)",john.arnold@enron.com,jenwhite7@zdnetonebox.com,Re: Hi,1.0,text/plain; charset=us-ascii,7bit,John Arnold,"""Jennifer White"" <jenwhite7@zdnetonebox.com> @...",,,\John_Arnold_Dec2000\Notes Folders\'sent mail,Arnold-J,Jarnold.nsf,"So, what is it? And by the way, don't start ...",,
4,<13844738.1075857596392.JavaMail.evans@thyme>,"Tue, 17 Oct 2000 10:33:00 -0700 (PDT)",john.arnold@enron.com,msagel@home.com,Re: Thursday meeting,1.0,text/plain; charset=us-ascii,7bit,John Arnold,"""Mark Sagel"" <msagel@home.com> @ ENRON",,,\John_Arnold_Dec2000\Notes Folders\'sent mail,Arnold-J,Jarnold.nsf,"sure, stop by and we'll arrange a place to mee...",,


In [11]:
#We mainly need the content of the emails for classification. It can be scaled to include other features as well
final_data = pd.DataFrame(list(map(content_preprocessing, final_data[['Subject', 'content']].apply(lambda x: ' '.join(x), axis=1))), columns = ["content"])
# final_data.head()
final_data = final_data.assign(user_number= sent_user_dataset["sender"].values)
final_data = final_data.replace({'user_number': mapping})
final_data.head()

Unnamed: 0,content,user_number
0,spreads saw lot bulls sell summer length front...,14
1,summer inverses amazing cash futures back piec...,14
2,congrats thank jennifer fraser pm john arnold ...,14
3,hi way start excuses expected full gourmet coo...,14
4,thursday meeting sure stop arrange place meet ...,14


In [0]:
#Splitting the data into training and testing
X = final_data.content.values
y = final_data.user_number.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [0]:
#Processing the text data and creating TFId vector of each text
vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [17]:
#Performing feature selection and finding out the best model for our classfication task
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_new = model.transform(X_train)
classifier_str = ["LinearSVC", "SGDClassifier"]
classifier_main = [LinearSVC, SGDClassifier]
for classifier_str,classifier in zip(classifier_str, classifier_main):
    print("Classifier: " + str(classifier_str) +",  Cross validation Accuracy: " + str(cross_val_score(classifier(), X_train_new, y_train, cv=3).mean()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classifier: LinearSVC,  Cross validation Accuracy: 0.9451853935728116
Classifier: SGDClassifier,  Cross validation Accuracy: 0.938035693653616


In [0]:
#Mapping for preprocessing all users

# users = dataset_sent_mails["sender"].value_counts().index.values
# mapping = {}
# for i, user in enumerate(users, start = 1):
  
#   mapping[user] = i
# sent_user_dataset = dataset_sent_mails
# final_data = pd.DataFrame(list(map(email_preprocessing, sent_user_dataset.message)))

In [0]:
# final_data = pd.DataFrame(list(map(content_preprocessing, final_data[['Subject', 'content']].apply(lambda x: ' '.join(x), axis=1))), columns = ["content"])
# # final_data.head()
# final_data = final_data.assign(user_number= sent_user_dataset["sender"].values)
# final_data = final_data.replace({'user_number': mapping})
# final_data.head()

In [0]:
# X = final_data.content.values
# y = final_data.user_number.values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
# X_train = vect.fit_transform(X_train)
# X_test = vect.transform(X_test)
# clf = LogisticRegression()
# clf.fit(X_train, y_train)
# model = SelectFromModel(clf, prefit=True)
# X_train_new = model.transform(X_train)
# for classifier in [LinearSVC, SGDClassifier, RandomForestClassifier]:
#     print(cross_val_score(classifier(), X_train_new, y_train, cv=3).mean())

In [21]:
#Testing the best on test data
classifier = LinearSVC()
classifier.fit(X_train_new, y_train)
X_test_ = model.transform(X_test)
y_predicted = classifier.predict(X_test_)
print("Accuracy: ",metrics.accuracy_score(y_test, y_predicted))

Accuracy:  0.9572706748055857


In [23]:
#Experimenting on truncatedSVD for feature selection
tsvd = TruncatedSVD(n_components = 120)
X_train_pca = tsvd.fit_transform(X_train)
X_test_pca = tsvd.transform(X_test)

clf = LinearSVC()
clf.fit(X_train_pca, y_train)

print("Accuracy: ",metrics.accuracy_score(y_test, clf.predict(X_test_pca)))

Accuracy:  0.9029183042060372
