### load the required packages

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
stop_words = stopwords.words('english')


#### data collection

In [49]:
# function to load emails from a directory
def load_file_from_directort(directory):
    import os
    data = []
    for file_name in os.listdir(directory):
        with open(directory + "/" + file_name, 'r', encoding="utf-8", errors='ignore') as file:
            # collect the email data into the data
            data.append(file.read())
    return data

In [50]:
# get all spam emails
spam_emails = load_file_from_directort('/tmp/enron1/spam')

# get all ham emails
ham_emails = load_file_from_directort('/tmp/enron1/ham')

#### collect the spam and ham words

In [51]:
def process_email(email):
    # collect all the words here
    all_words = []
    
    # remove the punctuation marks and special symbols
    symbols = ['~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', ':', ';',
                         '-', '_', '+', "'", '"', ',', '.', '[', ']', '<', '>']
    
    tmp_email = email
    
    # I love !India.~
    # I love !India.
    # I love India.
    # I love India
    for symbol in symbols:
        # replace all the symbols with ''
        tmp_email = tmp_email.replace(symbol, '')
        
    
    # split the emails into collection of words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(tmp_email)
    
    # remove the numbers
    numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    
    for word in words:
        tmp_word = word
        for number in numbers:
            # replace the numbers with ''
            tmp_word = tmp_word.replace(number, '')
        
        # 50 => ''
        # the minimum length of every word must be > 2
        if len(tmp_word) > 2:
            all_words.append(word)
    
    # convert every word to lowercase
    all_words = [w.lower() for w in all_words]
    
    # lemmatize the word
    all_words = [lemmatizer.lemmatize(w) for w in all_words]
    
    # remove the stop words
    all_words = [w for w in all_words if w not in stop_words]
    
    # return the words an its count
    import collections
    return collections.Counter(all_words).most_common()

In [55]:
# collect all the spam word
spam_words = [process_email(email) for email in spam_emails]

# collect all the ham word
ham_words = [process_email(email) for email in ham_emails]

#### collect the words with the class 

In [56]:
# table 
# word -> class/label
def create_row(word, label):
    dictionary = dict(word)
    
    # add the label
    dictionary['email_type'] = label
    
    return dictionary

In [57]:
# 1: spam
spam_words = [create_row(word, 1) for word in spam_words]

# 0: ham
ham_words = [create_row(word, 0) for word in ham_words]

In [58]:
# merge the spam and ham words into one table
all_words = spam_words + ham_words

import random

# shuffle the data
random.shuffle(all_words)

#### classification

In [60]:
df = pd.DataFrame(all_words)

In [63]:
print(df.head())

   gisb  contract  gas  please  enron  final  doc  intrastate  interstate  \
0  11.0      10.0  6.0     6.0    6.0    6.0  6.0         5.0         5.0   
1   0.0       0.0  2.0     0.0    0.0    0.0  0.0         0.0         0.0   
2   0.0       1.0  1.0     4.0    0.0    0.0  0.0         0.0         0.0   
3   0.0       0.0  0.0     1.0    0.0    0.0  0.0         0.0         0.0   
4   0.0       0.0  0.0     0.0    0.0    0.0  0.0         0.0         0.0   

   anthony  ...  germicide  rickettsia  afro  bloodhound  infantryman  \
0      4.0  ...        0.0         0.0   0.0         0.0          0.0   
1      0.0  ...        0.0         0.0   0.0         0.0          0.0   
2      0.0  ...        0.0         0.0   0.0         0.0          0.0   
3      0.0  ...        0.0         0.0   0.0         0.0          0.0   
4      0.0  ...        0.0         0.0   0.0         0.0          0.0   

   detention  tinkle  portico  raceway  restorative  
0        0.0     0.0      0.0      0.0      

### data cleansing process

In [62]:
# fill the missing values
df = df.fillna(0)

In [64]:
# decide x and y
x = df.drop('email_type', axis=1)
y = df['email_type']

In [65]:
# split the data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=123456)

### model building

In [66]:
from sklearn.linear_model import LogisticRegressionCV

model = LogisticRegressionCV(max_iter=1000)
model.fit(x_train, y_train)

LogisticRegressionCV(max_iter=1000)

In [67]:
# predict all the values
y_prediction = model.predict(x_test)

### Evaluation

In [69]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrixmatrix(y_test, y_prediction)
print(cm)

[[732  14]
 [  6 283]]


In [70]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       746
           1       0.95      0.98      0.97       289

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035

