In [29]:
import pandas as pd
import email,re, nltk,os
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

import numpy as np
wordlists = set(nltk.corpus.words.words())

In [2]:
dataset=pd.read_csv('data/train/labels.csv')

In [3]:
main_path= 'data/'


In [12]:
def Preprocess(file_path):
    print(file_path)
    email_body,reciever = "",""
    
    emails = email.message_from_string(open(file_path, encoding = 'ISO-8859-1').read())
    
    reciever=emails['To']
    # Get E-mail body
    if emails.is_multipart():
        for payload in emails.get_payload():
            # if payload.is_multipart() then.. 
            email_body = payload.get_payload()
    else:
        email_body = emails.get_payload()
    
    # If body has no value
    if type(email_body) is not str:
        # If body has no value then make it an empty string
        if not email_body:
            email_body = ""
        # If body was multipart then the payload will be 
        # returned as an array with the first index as 
        # the actual message in the mail
        elif type(email_body) is list:
            email_body = email_body[0].as_string()
    
    if type(reciever) is not str:
        # If body has no value then make it an empty string
        if not reciever:
            reciever = ""
    # Remove numbers and all special characters except space
    email_body = re.sub(r'[^a-zA-Z]', ' ', email_body).lower()
    reciever = re.sub(r'[^a-zA-Z]', ' ', reciever).lower()
    
    #Remove foreign words
    email_body = " ".join(w for w in nltk.wordpunct_tokenize(email_body) if w.lower() in wordlists or not w.isalpha())
    
    # Remove single letter words like 'b', 'j', etc..
    email_cleaned = re.sub(r"\b[a-zA-Z]\b", "", email_body)
    reciever = re.sub(r"\b[a-zA-Z]\b", "", reciever)
    clear_output()
    return (email_cleaned,reciever)

In [13]:
dat = dataset.apply(lambda row: Preprocess(main_path+row['Id']), axis=1).apply(pd.Series)
dataset['email'],dataset['email_recipients']=dat[0],dat[1]

In [14]:
X_train, X_validation, y_train, y_validation = train_test_split(dataset, dataset[['Label']], test_size=0.20, random_state = 20)

In [16]:

def train( X_train, y_train,word_percentage):
    frequent_spam_words = None
    frequent_ham_words = None
    
       
    # Get all words in spam and ham emails
    spam_words = X_train['email'][y_train['Label'] == 'spam'].str.cat().split()
    ham_words = X_train['email'][y_train['Label'] == 'ham'].str.cat().split()
    
    length_recipients=X_train['email_recipients'].str.len()
    uniq_words = len(set(X_train['email'].str.cat().split()))

    # % of the most frequent spam and words
    frequent_spam_words = Counter(spam_words).most_common(round(uniq_words*(word_percentage/100)))
    frequent_spam_words = [w[0] for w in frequent_spam_words]
        
    frequent_ham_words = Counter(ham_words).most_common(round(uniq_words*(word_percentage/100)))
    frequent_ham_words = [w[0] for w in frequent_ham_words]
        
    return (frequent_spam_words,frequent_ham_words,length_recipients)

   

In [40]:
 def predict( X_vals,spam_words,ham_words,recipients):
    prediction = []
        
            
    size = X_vals.shape[0]
    s = 0
    for mail,reciever in zip(X_vals.email,recipients):
        spam_count = 0
        ham_count = 0
        length_reciever=len(str(reciever))       
        unique_words_in_email = set(mail.split())
                
        # Calculate number of spam and ham matches with test set words
        for word in unique_words_in_email:
            spam_count += 1 if word in spam_words else 0
            ham_count += 1 if word in ham_words else 0
                
        # If there are more spam matches than ham, then predict as spam, otherwise predict as ham.
        #if reciever length is less than 40 then add as spam or else ham
        if(spam_count > ham_count) and length_reciever <40:
            prediction.append('spam')
        else:
            prediction.append('ham')
                
        clear_output()
        s += 1
        print(str(s)+" out of "+str(size)+"("+str(100*s/size)+"%)")
                
      
        
    return prediction        

In [18]:
words=train(X_train, y_train,word_percentage=7)

In [26]:
predictions = predict(X_validation,words[0],words[1],words[2])

16582 out of 16582(100.0%)


In [27]:
cnf_matrix = confusion_matrix(y_validation, predictions)

FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)
FPR = FP/(FP+TN)

print("Accuracy Score:", accuracy_score(y_validation, predictions))
print("Precision (Macro | Micro):", precision_score(y_validation, predictions, average='macro'), " | ", precision_score(y_validation, predictions, average='micro'))
print("False Positive Rate (FPR):", FPR[0], " | ", FPR[1])



Accuracy Score: 0.8073814980098902
Precision (Macro | Micro): 0.8290914465734385  |  0.8073814980098902
False Positive Rate (FPR): 0.30538922155688625  |  0.040634291377601585


Test

In [31]:
# test paths
path = main_path + 'test'
file_paths = pd.Series(['test/' + filename + '/' + fname for filename in os.listdir(path) for fname in os.listdir(main_path + 'test/' + filename)])

In [44]:
# test emails using paths
test_data = file_paths.apply(lambda row: Preprocess(main_path + row)).apply(pd.Series)
test_data.columns = ['email', 'email_recipients']
test_data.head()

Unnamed: 0,0,1
0,hi tonight we are rolling out new report curr...,lambie chris chris lambie enron com
1,mark am working with the east power desk to p...,taylor mark legal mark taylor enron com
2,mark and is ready to bill us for the oil but t...,weldon charles charles weldon enron com...
3,per eric moon attached you will find the slide...,garberding michael michael garberding enro...
4,return path from full name message id date...,baughman jr don don baughman enron com ...


In [35]:
# Train with all data
total_train=train(dataset, dataset[['Label']],word_percentage=5)

In [48]:
# Predictions for Test Data
predictions = predict(test_data,total_train[0],total_train[1],total_train[2])

9283 out of 9283(100.0%)


In [49]:
# Create the submission file for Kaggle
submission = pd.DataFrame({
    'Id': file_paths,
    'Label': predictions
})

submission.to_csv('submission.csv', index = False)