In [81]:
import numpy as np
import pandas as pd
import os
import cleantext
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import xgboost

# Use the fully qualified option name: the bare 'max_columns' pattern can match
# more than one registered option, in which case pandas refuses to set it.
pd.set_option('display.max_columns', 200)

In [6]:
# Folders holding the raw e-mails, one message per file.
path_spam = 'datasets/spam/spam/'
path_eham = 'datasets/spam/easy_ham/'

# os.listdir returns entries in arbitrary order. Sorting keeps the e-mail order
# (and therefore the row order of every table built from it) the same across
# sessions, which matters when rows saved to disk are later matched
# positionally against e-mails that are read again.
spam_files = sorted(os.listdir(path_spam))
eham_files = sorted(os.listdir(path_eham))

Loading all e-mails into the spam and ham lists:

In [7]:
def _read_emails(folder, file_names):
    """Read every file in ``file_names`` (relative to ``folder``) as UTF-8 text.

    Files that cannot be decoded as UTF-8 or cannot be opened are skipped, as
    before, but only those specific failures are tolerated and the number of
    skipped files is reported instead of being silently discarded.

    Returns:
        list[str]: the contents of the readable files, in ``file_names`` order.
    """
    emails, skipped = [], 0
    for name in file_names:
        try:
            with open(folder + name, 'r', encoding="utf8") as handle:
                emails.append(handle.read())
        except (UnicodeDecodeError, OSError):
            # Best effort: a message that is not valid UTF-8 (or is unreadable)
            # is left out of the corpus.
            skipped += 1
    if skipped:
        print('Skipped %i unreadable file(s) in %s' % (skipped, folder))
    return emails


spam_list = _read_emails(path_spam, spam_files)
eham_list = _read_emails(path_eham, eham_files)

Looking at a few emails to propose a pipeline:

In [4]:
for i in range(3):
    raw_email = spam_list[i]

    # Keep the text from the first blank line onwards (presumably the
    # header/body separator) and normalise the case.
    body = raw_email[raw_email.find('\n\n'):].lower()

    # Swap URLs, e-mail addresses and numbers for fixed placeholder tokens.
    body = cleantext.replace_urls(body, ' URL ')
    body = cleantext.replace_emails(body, ' EMAIL ')
    body = cleantext.replace_numbers(body, ' NUMBER ')

    # em_0 keeps its name because the timing cell that follows reads it.
    em_0 = body.split()

    print('E-mail %i:'%i)
    print('number of words in the email:', len(em_0))
    print('Number of unique words in the email:', len(set(em_0)), '\n')

E-mail 0:
number of words in the email: 382
Number of unique words in the email: 241 

E-mail 1:
number of words in the email: 90
Number of unique words in the email: 64 

E-mail 2:
number of words in the email: 77
Number of unique words in the email: 54 



In [289]:
# Time two ways of counting the distinct tokens in em_0 (the token list left
# over from the loop above): a built-in set versus np.unique.
%timeit len(list(set(em_0)))
%timeit np.unique(em_0).shape[0]

2.13 µs ± 216 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
34.6 µs ± 4.89 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [5]:
def first_cleaning(em):
    """Turn one raw e-mail string into a list of lower-cased tokens.

    The text before the first blank line is dropped, the remainder is
    lower-cased, passed through ``cleantext.replace_urls``,
    ``cleantext.replace_emails`` and ``cleantext.replace_numbers`` with
    ' URL ', ' EMAIL ' and ' NUMBER ' as replacement strings, and finally
    split on whitespace.

    Args:
        em (str): raw e-mail text.

    Returns:
        list[str]: the whitespace-separated tokens of the cleaned text.
    """
    first = em.find('\n\n')
    # str.find returns -1 when there is no blank line; slicing with -1 would
    # keep only the last character, so the whole text is kept in that case.
    if first != -1:
        em = em[first:]
    em = em.lower()

    em = cleantext.replace_urls(em, ' URL ')
    em = cleantext.replace_emails(em, ' EMAIL ')
    em = cleantext.replace_numbers(em, ' NUMBER ')
    return em.split()

def unique_list(em_list):
    """Return the distinct tokens of one raw e-mail.

    The e-mail is tokenised with ``first_cleaning`` and de-duplicated through
    a set, so the order of the returned list is not meaningful.
    """
    return list(set(first_cleaning(em_list)))

In [6]:
all_unique_words, all_words = [],[]

for email in spam_list:
    # Clean each e-mail once and reuse the tokens for both lists (the same
    # values as calling unique_list(email) and first_cleaning(email) separately).
    tokens = first_cleaning(email)
    all_unique_words += list(set(tokens))
    all_words += tokens

print(len(all_unique_words))
print(len(all_words))
print(len(all_words) / len(all_unique_words))

81279
186848
2.2988471806985813


In [7]:
m = len(spam_list)

# One token list per spam e-mail, in the same order as spam_list.
spam_list_splitted = [first_cleaning(message) for message in spam_list]

In [374]:
# Column labels are given as a list: a set is unordered and is rejected by
# the DataFrame constructor in recent pandas versions.
df_unique = pd.DataFrame(all_unique_words, columns=['Words']).drop_duplicates()
print(df_unique.shape)
df_unique.head()

(24302, 1)


Unnamed: 0,Words
0,new
1,buying
2,%
3,savings</b>
4,"style=3d""color:"


In [365]:
# df_unique was de-duplicated when it was built, so each word is expected to
# appear here with a count of 1.
word_counts = df_unique.value_counts().to_dict()
list(word_counts.items())[:3]

[(('\x1b$b!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y!z!y\x1b(b',), 1),
 (('nm,',), 1),
 (('nnw771illnmiiv7mfg6pv1lpmvzrh5gaj3jgnz4yoltaddfj7lqo6v8bi0ymj63mdrz9evjjzztb',),
  1)]

In [8]:
# One row per token occurrence. Column labels are given as a list: a set is
# unordered and is rejected by the DataFrame constructor in recent pandas versions.
df_all = pd.DataFrame(all_words, columns=['Words'])
df_all.shape

(186848, 1)

In [9]:
# Corpus-wide frequency of every word, broadcast back onto each occurrence row.
counts = df_all['Words'].value_counts()
df_all = df_all.assign(counts=df_all['Words'].map(counts))

In [10]:
df_all.head()

Unnamed: 0,Words,counts
0,<!doctype,27
1,html,59
2,public,41
3,"""-//w3c//dtd",28
4,html,59


In [11]:
df_all[df_all['Words'] == 'NUMBER'].head()

Unnamed: 0,Words,counts
5,NUMBER,18111
11,NUMBER,18111
17,NUMBER,18111
37,NUMBER,18111
64,NUMBER,18111


In [12]:
# 'counts' is derived from 'Words' alone, so dropping duplicated rows leaves
# exactly one row per distinct word (indexed by its first occurrence).
df_all_unique = df_all.drop_duplicates()
print(df_all_unique.shape)
df_all_unique

(24302, 2)


Unnamed: 0,Words,counts
0,<!doctype,27
1,html,59
2,public,41
3,"""-//w3c//dtd",28
5,NUMBER,18111
...,...,...
186807,mailings.<br>,1
186827,parties.,1
186829,you.</font></td>,1
186831,"align=3d""right""><a",1


In [13]:
# Discard the words that occur only once in the corpus.
seen_more_than_once = df_all_unique['counts'].gt(1)
df_all_cleaned = df_all_unique[seen_more_than_once]
df_all_cleaned.shape

(9704, 2)

In [29]:
vocabulary = df_all_cleaned['Words'].values
df = pd.DataFrame([], columns=vocabulary)

print(df.shape)

# Build all rows first and create the frame once: appending with df.loc row by
# row reallocates the frame on every e-mail, and testing membership against a
# token *list* rescans the e-mail for every vocabulary word.
rows = []
for tokens in tqdm(spam_list_splitted):
    present = set(tokens)
    rows.append([int(word in present) for word in vocabulary])

df = pd.DataFrame(rows, columns=vocabulary)

(0, 9704)


100%|██████████| 417/417 [00:43<00:00,  9.70it/s]


In [30]:
df.to_csv('datasets/spam/spam_df.csv')

In [35]:
df.head()

Unnamed: 0,<!doctype,html,public,"""-//w3c//dtd",NUMBER,"transitional//en"">",<html><head>,<meta,"content=3d""text/html;",charset=3dwindows-,"""","content=3d""mshtml",name=3dgenerator></head>,inserted,by,calypso,-->,<table,border=3d0,cellpadding=3d0,"style=3d""color:",black;,"none""","width=3d""","%"">",<tbody>,<tr>,<td,colspan=3d3>,<hr,color=3dblack,noshade,size=3d1>,end,--><font,color=3d#,color=3d#ff0000,"face=3d""copperplate",gothic,"bold""",size=3d5,"ptsize=3d""",""">",up,to,%,on,life,spend,more,than,you,have,<center><font,"size=3d""",quote,savings,<center>,<p,<p></p>,bordercolor=3d#,cellspacing=3d0,wi=,dth=3d650>,cellpadding=3d5,colspan=3d2,"%""><b><font",face=3dverdana,g,your,family's,financial,security,is,very,buying,insurance,simple,and,affordable.,we,provide,free,access,=,the,best,companies,lowest,align=3dmiddle,"style=3d""padding-left:",5px;,padding-right:,"5px""","%""><font","fast,",y,money!,let,us,...,tx</font><br>,rd.</p>,airport<br>,street</p>,mo</font><br>,<p>nov.,drive</p>,ca</font><br>,"color=3d""#cc3333"">san","color=3d""yellow"">",<br><center><b>get,dvds!,"bgcolor=3d""white""","""><center>click","href=3d""http://=",details!</a>,"bgcolor=3d""#ccff33""",<td><center><center><font,"color=3d""6633cc""><br>",<br>porno,from!<br><br>,<i>very,"offer</i>.""<br><br>",dvds,"free,<br>",with<a,"dex.html"">",commitment!</a>,anywhere</b>.<br>,<i>no,catches</i>,gimmicks</i>.,<br>you,"shipping,<br>",<b>absolutely,free</b>!<br><br>,peak,our<a,ull,catalog!</a>,"bgcolor=3d""yellow""",<td><br>,"color=3d""blue"">","""><b>high",titles,as:</b><br><br><c=,enter>,oral,cumshots,</b><br></a>,<b></center>description:</b>,shots!,jiz,face=,!<br>,mouth?<br<br,"""><b>dozens",as:</b><br><br><=,center>,penetrations,penetrations<br>,porn!<br<br,"""><b>from","""sexiest",innocent,"blondes""",collections:</b><=,br><br><center>,audition,tapes</b><br></a>,"cute,","innocent,",br>to,<i>screaming,goddess</i>,beggin',"tight,",<br>wet,pussies,asses!<=,br<br,"face=""times,",color=green,</font><
/center></table></tbody></tr></td>,lien,acre,parcel,"align=""middle""><font",><b>for,</font></b></tbody><tr></table>,>*</font></td>,"departments,",manuel,oko,lucio,harper,harper's,%),"/""><img",.114.135/
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Creating an automatic preprocessing pipeline:

In [12]:
def first_cleaning(em):
    """Turn one raw e-mail string into a list of lower-cased tokens.

    The text before the first blank line is dropped, the remainder is
    lower-cased, passed through ``cleantext.replace_urls``,
    ``cleantext.replace_emails`` and ``cleantext.replace_numbers`` with
    ' URL ', ' EMAIL ' and ' NUMBER ' as replacement strings, and finally
    split on whitespace.

    Args:
        em (str): raw e-mail text.

    Returns:
        list[str]: the whitespace-separated tokens of the cleaned text.
    """
    first = em.find('\n\n')
    # str.find returns -1 when there is no blank line; slicing with -1 would
    # keep only the last character, so the whole text is kept in that case.
    if first != -1:
        em = em[first:]
    em = em.lower()

    em = cleantext.replace_urls(em, ' URL ')
    em = cleantext.replace_emails(em, ' EMAIL ')
    em = cleantext.replace_numbers(em, ' NUMBER ')
    return em.split()

In [92]:
def split_email(email_list):
    """Tokenise every e-mail in ``email_list`` with ``first_cleaning``.

    Returns a list with one token list per e-mail, in the input order.
    """
    return [first_cleaning(message) for message in email_list]

In [93]:
def all_words_func(email_list):
    """Concatenate the ``first_cleaning`` tokens of every e-mail into one flat list.

    Tokens keep their order of appearance and repeated words are kept.
    """
    return [word for message in email_list for word in first_cleaning(message)]

In [94]:
def cleaning_df(all_words):
    """Build the vocabulary table from a flat list of tokens.

    Args:
        all_words (list[str]): every token occurrence in the corpus.

    Returns:
        pandas.DataFrame: columns 'Words' and 'counts', one row per distinct
        word that occurs more than once, indexed by the position of the
        word's first occurrence in ``all_words``.
    """
    # Column labels are given as a list: a set is unordered and is rejected by
    # the DataFrame constructor in recent pandas versions.
    df_all = pd.DataFrame(all_words, columns=['Words'])
    counts = df_all['Words'].value_counts()
    df_all['counts'] = df_all['Words'].map(counts)

    # 'counts' depends only on 'Words', so this keeps one row per distinct word.
    df_all_unique = df_all.drop_duplicates()

    # Words seen a single time are dropped from the vocabulary.
    df_all_cleaned = df_all_unique[df_all_unique['counts'] > 1]

    return df_all_cleaned

In [108]:
def make_df(spam_list_splitted, eham_list_splitted, df_all_cleaned):
    """Build the word-presence table for all e-mails.

    Args:
        spam_list_splitted (list[list[str]]): token lists of the spam e-mails.
        eham_list_splitted (list[list[str]]): token lists of the ham e-mails.
        df_all_cleaned (pandas.DataFrame): vocabulary table; its 'Words'
            column provides the feature columns, in order.

    Returns:
        pandas.DataFrame: one row per e-mail (spam rows first, then ham), one
        0/1 column per vocabulary word telling whether the word occurs in the
        e-mail, plus a final 'Target_y' column (1 = spam, 0 = not spam).
        Values are stored as int8.
    """
    vocabulary = list(df_all_cleaned['Words'].values)

    # word -> every column position holding that word, so each e-mail only
    # touches the columns of the words it contains instead of scanning its
    # token list once per vocabulary word.
    columns_of = {}
    for position, word in enumerate(vocabulary):
        columns_of.setdefault(word, []).append(position)

    # Preallocate the whole table; growing a DataFrame row by row with df.loc
    # reallocates it on every insertion.
    n_rows = len(spam_list_splitted) + len(eham_list_splitted)
    target_col = len(vocabulary)
    matrix = np.zeros((n_rows, target_col + 1), dtype=np.int8)

    row = 0
    # label 1 = spam, label 0 = not spam; spam rows come first.
    for label, emails in ((1, spam_list_splitted), (0, eham_list_splitted)):
        for tokens in tqdm(emails):
            for word in set(tokens):
                hits = columns_of.get(word)
                if hits is not None:
                    matrix[row, hits] = 1
            matrix[row, target_col] = label
            row += 1

    return pd.DataFrame(matrix, columns=vocabulary + ['Target_y'])

In [96]:
def preprocessing_pipeline(spam_list_, eham_list_):
    """Run the full preprocessing: raw e-mails in, word-presence table out.

    Args:
        spam_list_ (list[str]): raw spam e-mails.
        eham_list_ (list[str]): raw ham e-mails.

    Returns:
        The table produced by ``make_df`` for the tokenised e-mails and the
        vocabulary that ``cleaning_df`` derives from all their tokens.
    """
    spam_list_splitted = split_email(spam_list_)
    eham_list_splitted = split_email(eham_list_)

    # Flatten the token lists that were just computed instead of cleaning the
    # whole corpus a second time; this yields the same words, in the same
    # order, as all_words_func(spam_list_ + eham_list_).
    all_words = [word
                 for tokens in spam_list_splitted + eham_list_splitted
                 for word in tokens]

    df_all_cleaned = cleaning_df(all_words)
    return make_df(spam_list_splitted, eham_list_splitted, df_all_cleaned)

In [109]:
df = preprocessing_pipeline(spam_list, eham_list) #~23 minutes preprocessing

100%|██████████| 417/417 [02:09<00:00,  3.23it/s]
100%|██████████| 2410/2410 [20:38<00:00,  1.95it/s] 


In [110]:
df.to_csv('datasets/spam/df.csv')

In [112]:
df.shape

(2827, 28925)

In [141]:
data = df.to_numpy(dtype=np.int32)

In [144]:
# Every column but the last is a feature; the last column is the label.
X, y = data[:, :-1], data[:, -1]

In [145]:
print(X.shape, y.shape)

(2827, 28924) (2827,)


In [146]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (the baseline below reports 15% spam in the test set) — consider
# whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=2)

In [198]:
# Trivial baselines: predict one and the same class for every test sample.
n_test = len(y_test)
y_ones = np.ones(n_test)
y_zeros = np.zeros(n_test)

print('Accuracy if all samples are considered SPAM: %.2f'%accuracy_score(y_test, y_ones))
print('Accuracy if all samples are considered no-SPAM: %.2f'%accuracy_score(y_test, y_zeros))
print('F1 score if all samples are considered no-SPAM: %.2f'%f1_score(y_test, y_zeros))

Accuracy if all samples are considered SPAM: 0.15
Accuracy if all samples are considered no-SPAM: 0.85
F1 score if all samples are considered no-SPAM: 0.00


Random Forest

In [147]:
# Seed the forest so cross-validation, grid search and test scores can be
# reproduced (the train/test split is already seeded with the same value).
rf_clf = RandomForestClassifier(random_state=2)

In [149]:
cross_val_score(rf_clf, X_train, y_train, cv=3)

array([0.98408488, 0.9801061 , 0.97609562])

In [152]:
rf_clf.fit(X_train, y_train)
rf_clf.score(X_test, y_test)

0.9858657243816255

In [153]:
y_rf_pred = rf_clf.predict(X_test)

In [154]:
f1_score(y_test, y_rf_pred)

0.9512195121951219

In [155]:
confusion_matrix(y_test, y_rf_pred)

array([[480,   1],
       [  7,  78]], dtype=int64)

In [167]:
param_grid = [{'n_estimators': [100, 250, 300, 400], 'max_features': ['sqrt', 4, 10, 20], 'max_depth': [None, 10, 100]}]

In [168]:
grid_search = GridSearchCV(rf_clf, param_grid, cv=3)
grid_search.fit(X_train, y_train) #~22 minutes

In [169]:
grid_search.best_params_

{'max_depth': 100, 'max_features': 'sqrt', 'n_estimators': 100}

In [170]:
rf_best_clf = grid_search.best_estimator_

In [171]:
rf_best_clf.score(X_test, y_test)

0.9876325088339223

In [172]:
y_rf_best_pred = rf_best_clf.predict(X_test)
f1_score(y_test, y_rf_best_pred)

0.9575757575757576

XGBoost

In [187]:
xg_clf = xgboost.XGBClassifier()

In [188]:
cross_val_score(xg_clf, X_train, y_train, cv=3)

array([0.98938992, 0.9933687 , 0.98937583])

In [199]:
xg_clf.fit(X_train, y_train)

In [200]:
xg_clf.score(X_test, y_test)

0.9893992932862191

In [201]:
y_xg_pred = xg_clf.predict(X_test)

In [203]:
f1_score(y_test, y_xg_pred)

0.963855421686747

In [202]:
confusion_matrix(y_test, y_xg_pred)

array([[480,   1],
       [  5,  80]], dtype=int64)

Reading the dataset, including the email size, and tuning XGBoost hyperparameters:

In [3]:
data_csv = pd.read_csv('datasets/spam/df.csv')

In [4]:
data_csv.head()

Unnamed: 0.1,Unnamed: 0,<!doctype,html,public,"""-//w3c//dtd",NUMBER,"transitional//en"">",<html><head>,<meta,"content=3d""text/html;",charset=3dwindows-,"""","content=3d""mshtml",name=3dgenerator></head>,inserted,by,calypso,-->,<table,border=3d0,cellpadding=3d0,"style=3d""color:",black;,display:,"none""","width=3d""","%"">",<tbody>,<tr>,<td,colspan=3d3>,<hr,color=3dblack,noshade,size=3d1>,end,--><font,color=3d#,color=3d#ff0000,"face=3d""copperplate",gothic,"bold""",size=3d5,"ptsize=3d""",""">",up,to,%,on,life,spend,more,than,you,have,to?,<center><font,"size=3d""",quote,savings,<center>,<p,<p></p>,bordercolor=3d#,cellspacing=3d0,wi=,dth=3d650>,cellpadding=3d5,colspan=3d2,"%""><b><font",face=3dverdana,g,your,family's,financial,security,is,very,important.,buying,insurance,simple,and,affordable.,we,provide,free,access,=,the,best,companies,lowest,align=3dmiddle,"style=3d""padding-left:",5px;,padding-right:,"5px""","%""><font","fast,",...,d'ya,future-proof,devices&#,accessibility[,w3c,accessibility,1pttrans,robots,intercepted,camworld[,ageing,-05t03:,*society:*,crimes,museo,mcfarlane,flight.,gallery,-06t02:,-05t21:,attend.,camping,seth,curling,mullah,-06t23:,ada[,southwest,miami,tick,photographer,wedding,_michael,barrish_:,;&#,-06t18:,treatments,plastic,-07t03:,ubi,virtools,iain,assume),video_ts,menu.,watch.,ctrl,"]"",",cygwin[,tabs,search[,_emacs,backed-up,(contains,delphi,*simon,burglar,alarm.,mt-search,"""alttemplate"".",-08t16:,scissors,scissors?,wedge,"block,",admiral's,"""going",-08t12:,evolvable,parsers,entity-encoded,extensibility,"namespaces,",right&#,lego,shower,wastebasket,bass,-09t03:,spies,conference:*,"*""why",reversable?,"""stop","iis""",outcode,inexperienced,"""live",guil:,ros:,loop&#,off-by-,runner.,runner,"mile,",mile.,-10t03:,-09t22:,molecules,Target_y
0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [16]:
# Token count of every e-mail, spam first and then ham.
# Assumes both lists are in the same order as the rows of df.csv — the row
# counts printed below are the only check made here.
X_size = [len(first_cleaning(email)) for email in spam_list + eham_list]

m = len(X_size)
print(m, data_csv.shape[0])

2827 2827


In [42]:
df = data_csv.copy()

In [43]:
df.insert(df.shape[1]-1, 'Email_size', X_size)

In [49]:
df.iloc[:,-2:].head()

Unnamed: 0,Email_size,Target_y
0,382,1
1,90,1
2,77,1
3,429,1
4,75,1


In [52]:
df_numpy = df.to_numpy()

# Column 0 is left out of the features (it is the 'Unnamed: 0' column that
# came back from the CSV); the last column is the Target_y label.
X = df_numpy[:, 1:-1]
y = df_numpy[:, -1]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=2)

In [66]:
xgb_clf = xgboost.XGBClassifier()

In [71]:
param_grid = {'eta':[0.1,0.3], 'max_depth':[3,6]}

grid_search = GridSearchCV(xgb_clf, param_grid, cv=5, verbose=3, return_train_score=True)

In [72]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END eta=0.1, max_depth=3;, score=(train=0.997, test=0.987) total time= 1.0min
[CV 2/5] END eta=0.1, max_depth=3;, score=(train=0.996, test=0.993) total time=  57.7s
[CV 3/5] END eta=0.1, max_depth=3;, score=(train=0.996, test=0.987) total time=  57.3s
[CV 4/5] END eta=0.1, max_depth=3;, score=(train=0.997, test=0.987) total time=  52.9s
[CV 5/5] END eta=0.1, max_depth=3;, score=(train=0.996, test=0.989) total time= 1.0min
[CV 1/5] END eta=0.1, max_depth=6;, score=(train=0.999, test=0.985) total time= 1.6min
[CV 2/5] END eta=0.1, max_depth=6;, score=(train=0.999, test=0.996) total time= 1.8min
[CV 3/5] END eta=0.1, max_depth=6;, score=(train=0.999, test=0.991) total time= 1.7min
[CV 4/5] END eta=0.1, max_depth=6;, score=(train=0.999, test=0.987) total time= 1.4min
[CV 5/5] END eta=0.1, max_depth=6;, score=(train=1.000, test=0.991) total time= 1.5min
[CV 1/5] END eta=0.3, max_depth=3;, score=(train=0.999, test=0.987) to

In [73]:
grid_search.best_params_

{'eta': 0.3, 'max_depth': 3}

In [76]:
xgb_best_clf = grid_search.best_estimator_

In [77]:
xgb_best_clf.score(X_test, y_test)

0.9946996466431095

In [78]:
y_xg_pred = xgb_best_clf.predict(X_test)

In [79]:
f1_score(y_test, y_xg_pred)

0.9820359281437125

In [80]:
confusion_matrix(y_test, y_xg_pred)

array([[481,   0],
       [  3,  82]], dtype=int64)

In [82]:
precision_score(y_test, y_xg_pred)

1.0

In [83]:
recall_score(y_test, y_xg_pred)

0.9647058823529412

Saving the model:

In [84]:
# Persist the tuned classifier to disk.
# NOTE(review): XGBoost chooses the on-disk format from the file extension —
# confirm that '.txt' produces the format expected by whatever loads this
# model later.
xgb_best_clf.save_model('datasets/spam/xgb_model.txt')

Splitting the database into pieces:

In [104]:
df_csv = pd.read_csv('datasets/spam/df.csv')

In [106]:
# NOTE(review): the shape is read from `df` (the frame that had Email_size
# inserted), not from the `df_csv` loaded in the previous cell — confirm
# which frame is meant to be split below.
m,n = df.shape
print(m,n)

2827 28927


In [120]:
# Write df out as (up to) five consecutive row chunks, each file named after
# the index of its first row.
n_pieces = 5
step = max(1, m // n_pieces)  # never 0, so frames with fewer than 5 rows still work
os.makedirs('datasets/spam/df', exist_ok=True)

starts = list(range(0, min(m, step * n_pieces), step))
for j in starts:
    # The last chunk runs to the end of the frame: stopping at a multiple of
    # `step` would drop the remainder rows (or the whole final chunk when m is
    # an exact multiple of 5).
    i = m if j == starts[-1] else j + step
    df_new = df[j:i]
    df_new.to_csv('datasets/spam/df/df_%i.csv'%j, index=False)