# NLP Project - E-mail categorization

**Authors:**

Adela Krylova (a.krylova@innopolis.university),

Roman Makarov (o.makarov@innopolis.university)


In [1]:
import string
import nltk
import pandas as pd
import os
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#Creating big dataset

In [2]:
!gdown 1OwDvw2LMMINx0RL3bUWaGgKg-Cl2nOhe

Downloading...
From: https://drive.google.com/uc?id=1OwDvw2LMMINx0RL3bUWaGgKg-Cl2nOhe
To: /content/emails.zip
  0% 0.00/1.95M [00:00<?, ?B/s]100% 1.95M/1.95M [00:00<00:00, 142MB/s]


In [3]:
!unzip emails.zip

Archive:  emails.zip
  inflating: spam_ham_dataset.csv    


In [4]:
!gdown 1qhma_k8Mr7QRcj9AWBRu1ZyNV3Scrd2F

Downloading...
From: https://drive.google.com/uc?id=1qhma_k8Mr7QRcj9AWBRu1ZyNV3Scrd2F
To: /content/calendar.zip
  0% 0.00/257k [00:00<?, ?B/s]100% 257k/257k [00:00<00:00, 99.8MB/s]


In [5]:
%%capture
!unzip calendar

In [6]:
"""
    Labels:

    ham:      0
    spam:     1
    calendar: 2
    meeting:  3
    working:  4

"""

'\n    Labels:\n\n    ham:      0\n    spam:     1\n    calendar: 2\n    meeting:  3\n    working:  4\n\n'

In [7]:
new_df = []

In [8]:
cal_dir = 'dataset/calendar'

for filename in os.listdir(cal_dir):
    if filename == 'desktop.ini':
        continue
    
    f = os.path.join(cal_dir, filename)
    new_df.append({'text': ' '.join(open(f, 'r').readlines()[15:]), 'label_num': 2})


meet_dir = 'dataset/meetings'

for filename in os.listdir(meet_dir):
    f = os.path.join(meet_dir, filename)
    new_df.append({'text': ' '.join(open(f, 'r').readlines()[15:]), 'label_num': 3})

In [9]:
new_df = pd.DataFrame(new_df)
new_df.head()

Unnamed: 0,text,label_num
0,Set up with Patti x-39106 & Brenda x-31914,2
1,\n Shenton Cell: 925/963-3385\n Office: 925-39...,2
2,617.306.8084,2
3,\n You can reach Andrew @ 212-318-2295. Also s...,2
4,this guy works for Dimichele i think?,2


In [10]:
messages = pd.read_csv("spam_ham_dataset.csv", encoding="latin-1")
messages = messages.drop(['Unnamed: 0', 'label'], axis=1)

new_df = pd.concat([messages, new_df])

new_df.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [11]:
!gdown 1Ej9AAef7xLi5xhMo5BN-qPQGeYHwTZ6O

Downloading...
From: https://drive.google.com/uc?id=1Ej9AAef7xLi5xhMo5BN-qPQGeYHwTZ6O
To: /content/working.zip
100% 375M/375M [00:11<00:00, 32.2MB/s]


In [12]:
!unzip working

Archive:  working.zip
  inflating: emails.csv              


In [13]:
messages = pd.read_csv("emails.csv")

In [14]:
messages = messages.drop(['file'], axis=1)

In [15]:
messages.head()

Unnamed: 0,message
0,Message-ID: <18782981.1075855378110.JavaMail.e...
1,Message-ID: <15464986.1075855378456.JavaMail.e...
2,Message-ID: <24216240.1075855687451.JavaMail.e...
3,Message-ID: <13505866.1075863688222.JavaMail.e...
4,Message-ID: <30922949.1075863688243.JavaMail.e...


In [16]:
print(messages['message'].iloc[1])

Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the

In [17]:
new_data = []
count = 1000

for f in messages.iterrows():
    if count == 0:
        break
    
    new_data.append({'text': ' '.join(f[1].message.split('\n')[15:]), 'label_num': 4})
    count -= 1

In [18]:
print(new_data[1])

{'text': " Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.  As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.    My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time. ", 'label_num': 4}


In [19]:
new_df = pd.concat([pd.DataFrame(new_data), new_df])
new_df.head()

Unnamed: 0,text,label_num
0,Here is our forecast,4
1,Traveling to have a business meeting takes th...,4
2,test successful. way to go!!!,4
3,"Randy, Can you send me a schedule of the sa...",4
4,Let's shoot for Tuesday at 11:45.,4


In [20]:
messages = pd.read_csv("spam_ham_dataset.csv", encoding="latin-1")

messages = messages.drop(['Unnamed: 0', 'label'], axis=1)

messages = pd.concat([new_df, messages])
messages.head()

Unnamed: 0,text,label_num
0,Here is our forecast,4
1,Traveling to have a business meeting takes th...,4
2,test successful. way to go!!!,4
3,"Randy, Can you send me a schedule of the sa...",4
4,Let's shoot for Tuesday at 11:45.,4


In [21]:
messages['label_num'].value_counts()

0    7344
1    2998
4    1000
3     345
2     128
Name: label_num, dtype: int64

# Preparing NLP model

In [22]:
nltk.download('stopwords')
nltk.download('punkt')

def text_preprocess(message):
    # Remove punctuation
    nopunc = [char for char in message if char not in string.punctuation]

    # Join the characters again
    nopunc = "".join(nopunc)
    nopunc = nopunc.lower()

    # Remove any stopwords and non-alphabetic characters
    nostop = [
        word
        for word in nopunc.split()
        if word.lower() not in stopwords.words("english") and word.isalpha()
    ]

    return nostop

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
messages["text"] = messages["text"].apply(text_preprocess)
messages["text"] = messages["text"].agg(lambda x: " ".join(map(str, x)))

messages.head()

Unnamed: 0,text,label_num
0,forecast,4
1,traveling business meeting takes fun trip espe...,4
2,test successful way go,4
3,randy send schedule salary level everyone sche...,4
4,lets shoot tuesday,4


In [24]:
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(messages["text"])

messages_bow = bow_transformer.transform(messages["text"])
print(f"Shape of sparse matrix: {messages_bow.shape}")

Shape of sparse matrix: (11815, 49302)


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(11815, 49302)


In [26]:
msg_train, msg_test, label_train, label_test = train_test_split(
    messages_tfidf, messages['label_num'], test_size=0.2, stratify=messages['label_num'], 
    random_state=42
)

#Training and testing NLP model

In [27]:
from sklearn.svm import SVC

clf = SVC(C=10000, kernel='rbf')

clf.fit(msg_train, label_train)

In [28]:
predict_test = clf.predict(msg_test)

print(f"Accuracy: {metrics.accuracy_score(label_test, predict_test):0.3f}")

Accuracy: 0.981


#Saving the model

In [29]:
import pickle

filename = "email_categorizer.pickle"

pickle.dump(clf, open(filename, "wb"))

#Loading the model and using it for an example

In [39]:
clf = pickle.load(open(filename, "rb"))

In [None]:
"""
    Labels:

    ham:      0
    spam:     1
    calendar: 2
    meeting:  3
    working:  4

"""

In [40]:
def test_one(sentences, clf_):
    msg_test = pd.DataFrame(sentences, columns=['var1'])
    
    msg_test["var1"] = msg_test["var1"].apply(text_preprocess)
    msg_test["var1"] = msg_test["var1"].agg(lambda x: " ".join(map(str, x)))

    prediction = clf_.predict(
        tfidf_transformer.transform(bow_transformer.transform(msg_test["var1"].values))
    )

    return ['spam' if x == 1 else 'ham' if x == 0 else 'calendar' if x == 2 else 'meeting' if x == 3 else 'working' for x in prediction]

In [38]:
import random

example = messages.iloc[random.randint(0, len(messages) - 1)]
example.text, example.label_num

('subject mtbe upsets mtbe plant running production tuesday may excess mb', 0)

In [42]:
test_one([example.text], clf)

['ham']