In [100]:
import pandas as pd
import email,nltk,re,os,numpy as np
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
from scipy.sparse import hstack

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [101]:
stemmer = nltk.stem.PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))
wordlists = set(nltk.corpus.words.words())

In [102]:
dataset=pd.read_csv('data/train/labels.csv')

In [103]:
main_path= 'data/'


In [104]:
def Preprocess(file_path):
    print(file_path)
   
    
    emails = email.message_from_string(open(file_path, encoding = 'ISO-8859-1').read())
    email_subject=emails['Subject']
    
    # Get E-mail body
    if emails.is_multipart():
        for payload in emails.get_payload():
            # if payload.is_multipart() then.. 
            email_body = payload.get_payload()
    else:
        email_body = emails.get_payload()
    
    # If body has no value
    if type(email_body) is not str:
        # If body has no value then make it an empty string
        if not email_body:
            email_body = ""
        # If body was multipart then the payload will be 
        # returned as an array with the first index as 
        # the actual message in the mail
        elif type(email_body) is list:
            email_body = email_body[0].as_string()
            
    if type(email_subject) is not str:
        # If body has no value then make it an empty string
        if not email_subject:
            email_subject = ""
    #count number of special characters
    email_subject_count = len(email_subject) - len( re.findall('[\w]', email_subject) )
    
    # Remove numbers and all special characters except space
    email_body = re.sub(r'[^a-zA-Z]', ' ', email_body).lower()
    
    #Remove foreign words
    email_body = " ".join(w for w in nltk.wordpunct_tokenize(email_body) if w.lower() in wordlists or not w.isalpha())
    
    email_ = [word for word in email_body.split() if word not in stopwords]
    email_stemmed = " ".join([stemmer.stem(word) for word in email_])
    # Remove single letter words like 'b', 'j', etc..
    email_cleaned = re.sub(r"\b[a-zA-Z]\b", "", email_body)
    email_body_cleaned = " ".join(email_cleaned.split())
    
    email_body_lenth=len(email_body_cleaned)
    
    email_return_path=emails['Return-Path']
    if email_return_path == None:
        email_return_path=1
    else:
        email_return_path=0
    clear_output()
    return (email_cleaned,email_return_path,email_subject_count,email_body_lenth)

In [105]:
dat = dataset.apply(lambda row: Preprocess(main_path+row['Id']), axis=1).apply(pd.Series)

In [106]:
dataset['email'],dataset['email_return_path'],dataset['email_subject_count'],dataset['email_body_lenth']=dat[0],dat[1],dat[2],dat[3]

In [8]:
dataset['email'] = dataset['email'].replace(np.nan, '', regex=True) 

In [9]:
dataset.head()

Unnamed: 0,Id,Label,email,email_return_path,email_subject_count,email_body_lenth
0,train/000/000,ham,user id original message from sent june to sub...,0,13,9069
1,train/000/002,ham,these new original message from sent june am...,0,3,343
2,train/000/003,ham,we are currently trading under spot with el fo...,0,5,914
3,train/000/004,ham,and attached is for new master physical with...,0,2,228
4,train/000/005,ham,below is copy of my communication with regard...,0,6,698


In [10]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(dataset, dataset['Label'], test_size=0.20, random_state = 20)

In [124]:
# Create the Vectorizer vocabulary
vectorizer = CountVectorizer()
vectorizer.fit(dataset['email'])

# len(vectorizer.vocabulary_)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [12]:
# Count Vectors
X_train_vector = vectorizer.transform(X_train['email'])
X_val_vector = vectorizer.transform(X_val['email'])

# print(X_train_vec.shape, X_val_vec.shape)

In [13]:
# TF Vector
tf_transformer = TfidfTransformer(norm='l1', use_idf=False)

X_train_vector_tf = tf_transformer.fit_transform(X_train_vector)
X_val_vector_tf = tf_transformer.fit_transform(X_val_vector)

# print(X_train_vec_tf.shape, X_val_vec_tf.shape)

In [14]:
# TF-IDF Vector
tfidf_transformer = TfidfTransformer(norm='l1', use_idf=True)

X_train_vector_tfidf = tfidf_transformer.fit_transform(X_train_vector)
X_val_vector_tfidf = tfidf_transformer.fit_transform(X_val_vector)

# print(X_train_vec_tfidf.shape, X_val_vec_tfidf.shape)

#### Classify with train,validation data-Naive-Bayes

In [16]:
# Naive-Bayes Classifier
classifier = MultinomialNB()

# Naive-Bayes with Count
classifier.fit(X_train_vector, y_train)
predictions = classifier.predict(X_val_vector)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with Count')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with Count
Accuracy Score: 0.8671450970932336
Precision: 0.9287388654477262
False Positive Rate (FPR): 0.08608240124592949


In [17]:
# Naive-Bayes with Term Frequency
classifier.fit(X_train_vector_tf, y_train)
predictions = classifier.predict(X_val_vector_tf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with TF')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with TF
Accuracy Score: 0.8900012061271259
Precision: 0.8433122155795485
False Positive Rate (FPR): 0.24861956675633584


In [18]:
# Naive-Bayes with Term Frequency-Inverse Document Frequency
classifier.fit(X_train_vector_tfidf, y_train)
predictions = classifier.predict(X_val_vector_tfidf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()
Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with TF-IDF')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with TF-IDF
Accuracy Score: 0.907610662163792
Precision: 0.8666115854218306
False Positive Rate (FPR): 0.20571994903015717


#### Classify with train,validation data-SVM

In [19]:
# Support Vector Machine Classifier
classifier = LinearSVC(random_state = 0)

# SVM with Count
classifier.fit(X_train_vector, y_train)
predictions = classifier.predict(X_val_vector)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with Count')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with Count
Accuracy Score: 0.9542877819322156
Precision: 0.9430565388894508
False Positive Rate (FPR): 0.07971117089055642




In [20]:
# SVM with Term Frequency
classifier.fit(X_train_vector_tf, y_train)
predictions = classifier.predict(X_val_vector_tf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with TF')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with TF
Accuracy Score: 0.9419852852490652
Precision: 0.9565681357379149
False Positive Rate (FPR): 0.057624238991929776


In [21]:
# SVM with Term Frequency-Inverse Document Frequency
classifier.fit(X_train_vector_tfidf, y_train)
predictions = classifier.predict(X_val_vector_tfidf)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with TF-IDF')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with TF-IDF
Accuracy Score: 0.9476540827403208
Precision: 0.9344179973887717
False Positive Rate (FPR): 0.09245363160130256


#### Vectorizer with new features

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit(dataset['email'])
# len(vectorizer.vocabulary_)

In [None]:
data_vec = vectorizer.transform(dataset['email'])
# data_vec.shape

#### With 1 Feature

In [50]:
data_vec1=hstack((data_vec,dataset[['email_return_path']]))

In [51]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data_vec1, dataset['Label'], test_size=0.20, random_state = 20)

#### Classifier

In [52]:
# Naive-Bayes
classifier = MultinomialNB().fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with feature1')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with feature1
Accuracy Score: 0.8979616451573996
Precision: 0.9327656751078182
False Positive Rate (FPR): 0.08608240124592949


In [53]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with feature1')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with feature1
Accuracy Score: 0.9876371969605596
Precision: 0.9899011150852094
False Positive Rate (FPR): 0.01359195809146255




In [54]:
# Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Decision Tree with feature1')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

Decision Tree with feature1
Accuracy Score: 0.9887227113737788
Precision: 0.9892010903753408
False Positive Rate (FPR): 0.014583038368965028


#### With feature 2

In [55]:
data_vec2=hstack((data_vec,dataset[['email_subject_count']]))

In [56]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data_vec2, dataset['Label'], test_size=0.20, random_state = 20)

In [57]:
# Naive-Bayes
classifier = MultinomialNB().fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with feature2')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with feature2
Accuracy Score: 0.8808346399710529
Precision: 0.9041037179899282
False Positive Rate (FPR): 0.12671669262353108


In [58]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with feature2')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with feature2
Accuracy Score: 0.9617054637558798
Precision: 0.9655208551666318
False Positive Rate (FPR): 0.04658077304261645




In [59]:
# Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Decision Tree with feature2')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

Decision Tree with feature2
Accuracy Score: 0.9457845856953323
Precision: 0.947188213322266
False Positive Rate (FPR): 0.07206569446410874


#### With feature 3

In [66]:
data_vec3=hstack((data_vec,dataset[['email_body_lenth']]))

In [67]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data_vec3, dataset['Label'], test_size=0.20, random_state = 20)

In [68]:
# Naive-Bayes
classifier = MultinomialNB().fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with feature3')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with feature3
Accuracy Score: 0.8672657098058135
Precision: 0.9288560712611346
False Positive Rate (FPR): 0.08594081834914342


In [69]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with feature3')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with feature3
Accuracy Score: 0.9337836207936316
Precision: 0.9449434640177533
False Positive Rate (FPR): 0.07376468922554155




In [70]:
# Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Decision Tree with feature3')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

Decision Tree with feature3
Accuracy Score: 0.9429501869497044
Precision: 0.956642164695856
False Positive Rate (FPR): 0.057624238991929776


#### With all Features

In [139]:
data_vec_full=hstack((data_vec,dataset[['email_subject_count','email_body_lenth','email_return_path']]))

In [72]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(data_vec_full, dataset['Label'], test_size=0.20, random_state = 20)

In [73]:
# Naive-Bayes
classifier = MultinomialNB().fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Naive-Bayes with all feature')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)


Naive-Bayes with all feature
Accuracy Score: 0.8968761307441805
Precision: 0.9070155321588659
False Positive Rate (FPR): 0.12629194393317286


In [74]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('SVM with all feature')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

SVM with all feature
Accuracy Score: 0.9334820890121819
Precision: 0.9917036690815612
False Positive Rate (FPR): 0.010052385671810845




In [75]:
# Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_val)
tn, fp, fn, tp = metrics.confusion_matrix(y_val, predictions).ravel()

Accuracy=(tp + tn) / (tp + tn + fp + fn)
Precision=(tp / (tp + fp))
FPR=(fp / (fp + tn))

print('Decision Tree with all feature')
print("Accuracy Score:",Accuracy)
print("Precision:",Precision)
print("False Positive Rate (FPR):",FPR)

Decision Tree with all feature
Accuracy Score: 0.9881799541671692
Precision: 0.9891908909644244
False Positive Rate (FPR): 0.014583038368965028


#### Train SVM with full train dataset

In [140]:
# Linear SVM
classifier = LinearSVC(random_state = 0).fit(data_vec_full, dataset['Label'])

In [108]:
# test paths
path = main_path + 'test'
file_paths = pd.Series(['test/' + filename + '/' + fname for filename in os.listdir(path) for fname in os.listdir(main_path + 'test/' + filename)])

In [109]:
# test emails using paths
test_data = file_paths.apply(lambda row: Preprocess(main_path + row)).apply(pd.Series)
test_data.columns = ['email', 'email_return_path','email_subject_count','email_body_lenth']

In [110]:
test_data.head()

Unnamed: 0,email,email_return_path,email_subject_count,email_body_lenth
0,hi tonight we are rolling out new report curr...,0,3,409
1,mark am working with the east power desk to p...,0,2,372
2,mark and is ready to bill us for the oil but t...,0,2,354
3,per eric moon attached you will find the slide...,0,0,72
4,return path from full name message id date...,1,11,1902


In [141]:
test_data['email'] = test_data['email'].replace(np.nan, '', regex=True) 

test_data_vector = vectorizer.transform(test_data['email'])

test_data_vector_full = hstack((test_data_vector, test_data[['email_return_path','email_subject_count','email_body_lenth']].to_numpy()))

In [142]:
test_data['Label'] = classifier.predict(test_data_vector_full)

In [143]:
submission = pd.DataFrame({
    'Id': file_paths,
    'Label': test_data['Label']
})

submission.to_csv('submission.csv', index = False)