In [1]:
import nltk
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")


In [2]:
#to explore this package a little bit
#nltk.download()

tokenize: it will just take a sentence and split it into words which will make python to easier to understand it.

nltk.corpus: NLTK corpus is a massive dump of all kinds of natural language data sets

In [3]:
#stopwords
from nltk.corpus import stopwords
stopwords.words("english")[0:10] #for english language stopwords, and we're printing only first 10 stopwords
#stopwords are the words which occur a lot but dont contribute much in analysis

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### Dealing with text data

In [4]:
rawData= pd.read_csv('C:/Users/Prachi Dhamale/Documents/spam.csv',encoding="ISO-8859-1")

In [5]:
rawData.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
rawData.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
df=rawData.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1)

In [8]:
df.shape

(5572, 2)

In [9]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [10]:
df.groupby("v1").size()

v1
ham     4825
spam     747
dtype: int64

### Machine Learning Pipeline

1.raw text - model cant distinguish words

2.Tokenize - tell the model what to look at

3.Clean text - remove stop words/punctuation,stemming,etc

 4.vectorize - convert to numeric form, one row for text message and one column for per word, so basically it place 1 when the word occurs in text message

### 1.Removing the punctuation

In [11]:
import string #string package has a list of punctuation in it

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
"I like nlp." == "I like nlp"

False

In [14]:
def remove_punct(text):
    text_nopunct = []
    for char in text:
        if char not in string.punctuation:
            text_nopunct.append(char)
    return "".join(text_nopunct)

#text_nonpunct = "".join([char for char in text if char not in string.puctuation])

In [15]:
df["body_text_clean"]= df["v2"].apply(lambda x: remove_punct(x))

In [16]:
df.head()

Unnamed: 0,v1,v2,body_text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


### 2.Tokenize:

In [17]:
import re

def tokenize(text):
    tokens = re.split('\W+',text)
    return tokens

In [18]:
df["body_text_tokenized"]= df["body_text_clean"].apply(lambda x: tokenize(x.lower()))

### 3.Remove Stopwords

In [19]:
stopwords= nltk.corpus.stopwords.words('english')

In [20]:
def remove_stopwords(tokenized_list):
    text=[word for word in tokenized_list if word not in stopwords]
    return text

In [21]:
df["body_text_nostop"]= df["body_text_tokenized"].apply(lambda x: remove_stopwords(x))

In [22]:
df.head()

Unnamed: 0,v1,v2,body_text_clean,body_text_tokenized,body_text_nostop
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


### Stemming

Stemming: Process of reducing inflected (or sometimes derived)words from their word stem or root. (removing sufix or prefix)

Why? reduce the corpus of words model is exposed to and explicitly correlates the words with same meanings

Test out Porter stemmer

In [23]:
ps = nltk.PorterStemmer()

In [24]:
print(ps.stem('grows'))
print(ps.stem('growing'))
print(ps.stem('grow'))

grow
grow
grow


In [25]:
print(ps.stem('run'))
print(ps.stem('running'))
print(ps.stem('runner'))

run
run
runner


In [26]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [27]:
df["body_text_stemmed"]= df["body_text_nostop"].apply(lambda x: stemming(x))

In [28]:
df.head()

Unnamed: 0,v1,v2,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."


### Lemmatizer:

Stemming vs Lemmatizer

In [29]:
wn = nltk.WordNetLemmatizer()

In [30]:
#stemming
print(ps.stem('meanning'))
print(ps.stem('meanness'))

mean
mean


In [31]:
#lemmatizer
print(wn.lemmatize('meanning'))
print(wn.lemmatize('meanness'))

meanning
meanness


Lemmatizer was able to distinguish between "meanning" and "meanness"

In [32]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [33]:
df["body_text_lemmatized"]= df["body_text_nostop"].apply(lambda x: lemmatizing(x))

In [34]:
df.head()

Unnamed: 0,v1,v2,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"


In [35]:
df.shape

(5572, 7)

### Vectorizing 

1. Count Vectorizer

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
vectorizer = CountVectorizer(lowercase=False)
X_counts = vectorizer.fit_transform(str(i) for i in df["body_text_lemmatized"])
X_counts.shape 

(5572, 8829)

In [38]:
vectorizer.get_feature_names()

['008704050406',
 '0089my',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '0578',
 '06',
 '060505',
 '061104',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073'

so this matrics shape denotes that you have 5572 text messages(rows) and 8829 unique words (columns)

In [39]:
x_counts_df=pd.DataFrame(X_counts.toarray())

In [40]:
my_columns=vectorizer.get_feature_names()
x_counts_df.columns = my_columns

In [41]:
x_counts_df.head()

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,ìï,ìïll,ûthanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Vectorizing Raw Data: N-grams

N Grams: Creates a document-term matrix where counts still occupy the cell but instead of the columns representing single terms, they represent all combinations of adjacent words of lenght n in your text

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
ngram_vectorizer = CountVectorizer(lowercase=False,ngram_range=(2,2))
X_counts = ngram_vectorizer.fit_transform(str(i) for i in df["body_text_lemmatized"])
X_counts.shape 

(5572, 31500)

In [44]:
ngram_vectorizer.get_feature_names()

['008704050406 sp',
 '0089my last',
 '0121 2025050',
 '01223585236 xx',
 '01223585334 cum',
 '0125698789 ring',
 '02 user',
 '020603 2nd',
 '0207 153',
 '02072069400 bx',
 '02073162414 cost',
 '02085076972 reply',
 '020903 2nd',
 '021 3680',
 '021 3680offer',
 '050703 tcsbcm4235wc1n3xx',
 '06 good',
 '07046744435 arrange',
 '07090298926 reschedule',
 '07099833605 reschedule',
 '07123456789 87077',
 '0721072 find',
 '07732584351 rodger',
 '07734396839 ibh',
 '07742676969 show',
 '07753741225 show',
 '0776xxxxxxx uve',
 '077xxx 2000',
 '07801543489 guaranteed',
 '07808 xxxxxx',
 '07808247860 show',
 '07808726822 awarded',
 '07815296484 show',
 '0784987 show',
 '0789xxxxxxx today',
 '0796xxxxxx today',
 '07973788240 show',
 '07xxxxxxxxx 2000',
 '07xxxxxxxxx show',
 '0800 0721072',
 '0800 169',
 '0800 18',
 '0800 195',
 '0800 1956669',
 '0800 505060',
 '0800 542',
 '08000407165 18',
 '08000776320 reply',
 '08000839402 2stoptx',
 '08000839402 2stoptxt',
 '08000839402 call',
 '08000839402 ca

In [45]:
x_counts_df=pd.DataFrame(X_counts.toarray())

In [46]:
my_columns=ngram_vectorizer.get_feature_names()
x_counts_df.columns = my_columns

In [47]:
x_counts_df.head()

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,ûò address,ûò entertaining,ûò even,ûò favour,ûò getting,ûò hope,ûò indeed,ûò limping,ûò sound,ûówell done
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


so this matrics shape denotes that you have 5572 text messages(rows) and 31500 unique cobinition of two words (columns)

### TFIDF

TFIDF: w(i,j)= tf(i,j)*log(N/(df(i)))
    
    tf(i,j)=number of times i occurs in j divided by total number of terms in j
    df(i)= number of documents containing i
    N= total number of documents

rarer the word is the higher the value of weight going to be

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
tfidf_vectorizer = TfidfVectorizer(lowercase=False)
X_counts = tfidf_vectorizer.fit_transform(str(i) for i in df["body_text_lemmatized"])
X_counts.shape 

(5572, 8829)

In [50]:
tfidf_vectorizer.get_feature_names()

['008704050406',
 '0089my',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '0578',
 '06',
 '060505',
 '061104',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073'

In [51]:
x_counts_df=pd.DataFrame(X_counts.toarray())

In [52]:
my_columns=tfidf_vectorizer.get_feature_names()
x_counts_df.columns = my_columns

In [53]:
x_counts_df.head()

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,ìï,ìïll,ûthanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
df.head()

Unnamed: 0,v1,v2,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"


### Random Forest Classifier 

In [55]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix  
from sklearn.model_selection import train_test_split

In [56]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in columns. 
df["v1"]= label_encoder.fit_transform(df["v1"])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(x_counts_df,df["v1"],test_size=0.2)

In [58]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)

In [59]:
model=rf.fit(X_train,y_train)

In [60]:
sorted(zip(model.feature_importances_, X_train.columns),reverse=True)[0:10]

[(0.03780461160266209, 'free'),
 (0.03733625427141856, 'call'),
 (0.03619974187498542, 'service'),
 (0.033208541249215066, 'txt'),
 (0.03265293723616012, 'mobile'),
 (0.02037359654459649, 'prize'),
 (0.019106504966506274, 'claim'),
 (0.018131117702049074, '16'),
 (0.01610313052717539, 'stop'),
 (0.014452256715403249, 'reply')]

In [87]:
y_pred = model.predict(X_test)

In [88]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))

Accuracy: 0.9246636771300448
Recall: 0.46496815286624205
Precision: 1.0


100% precision means when our model is detecting that given text is spam then it 100% of the time correct that the message is spam(all mail in your spam are actually spam), 49.36% recall means 49.36 of all spam that has came into email was properly placed into spam folder (not a good result), accuracy 92.82% means all the messages you've received are correctly identified as spam or not spam 

### Random Forest classifier with startified k fold cross validation and Parameter tunning:

In [63]:
from sklearn.model_selection import StratifiedKFold

In [76]:
def train_RF(n_est, depth):
    fold=StratifiedKFold(n_splits=10,shuffle=True,random_state=1994)
    i=1
    for train_index, test_index in fold.split(X_train,y_train):
        accuracy=[]
        precision=[]
        recall=[]
        fscore=[]
        X_trainn, X_testt = X_train.iloc[train_index], X_train.iloc[test_index]
        y_trainn, y_testt = y_train.iloc[train_index], y_train.iloc[test_index]
        m=RandomForestClassifier(n_estimators = n_est, max_depth= depth,criterion = 'entropy')
        m.fit(X_trainn,y_trainn)
        y_pred = m.predict(X_testt)
        print("Accuracy: ",accuracy_score(y_testt,y_pred))
        print("Precision: ",precision_score(y_testt,y_pred))
        print("Recall: ",recall_score(y_testt,y_pred))
        print('F1 score:', f1_score(y_testt, y_pred))
        accuracy.append(accuracy_score(y_testt,y_pred))
        precision.append(precision_score(y_testt,y_pred))
        recall.append(recall_score(y_testt,y_pred))
        fscore.append(f1_score(y_testt, y_pred))
        print("-"*36)
    print("final accuracy:",np.mean(accuracy))
    print("final precision:",np.mean(precision))
    print("final Recall:",np.mean(recall))
    print("final fscore:",np.mean(fscore))
    print("-"*36)
    print("Est: {} / Depth: {} ------f1: {} / Accuracy: {} / Precision: {} / Recall: {} ".format(n_est,depth,round(np.mean(fscore),3),round(np.mean(accuracy),3),round(np.mean(precision),3),round(np.mean(recall),3)))
    print("-"*100)

In [77]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_RF(n_est, depth)

Accuracy:  0.899103139013453
Precision:  1.0
Recall:  0.23728813559322035
F1 score: 0.3835616438356164
------------------------------------
Accuracy:  0.8878923766816144
Precision:  1.0
Recall:  0.15254237288135594
F1 score: 0.2647058823529412
------------------------------------
Accuracy:  0.8878923766816144
Precision:  1.0
Recall:  0.15254237288135594
F1 score: 0.2647058823529412
------------------------------------
Accuracy:  0.899103139013453
Precision:  1.0
Recall:  0.23728813559322035
F1 score: 0.3835616438356164
------------------------------------
Accuracy:  0.8878923766816144
Precision:  1.0
Recall:  0.15254237288135594
F1 score: 0.2647058823529412
------------------------------------
Accuracy:  0.905829596412556
Precision:  1.0
Recall:  0.288135593220339
F1 score: 0.4473684210526315
------------------------------------
Accuracy:  0.899103139013453
Precision:  1.0
Recall:  0.23728813559322035
F1 score: 0.3835616438356164
------------------------------------
Accuracy:  0.894382

Accuracy:  0.8943820224719101
Precision:  1.0
Recall:  0.2033898305084746
F1 score: 0.33802816901408456
------------------------------------
Accuracy:  0.8876404494382022
Precision:  1.0
Recall:  0.15254237288135594
F1 score: 0.2647058823529412
------------------------------------
final accuracy: 0.8876404494382022
final precision: 1.0
final Recall: 0.15254237288135594
final fscore: 0.2647058823529412
------------------------------------
Est: 50 / Depth: 10 ------f1: 0.265 / Accuracy: 0.888 / Precision: 1.0 / Recall: 0.153 
----------------------------------------------------------------------------------------------------
Accuracy:  0.9417040358744395
Precision:  1.0
Recall:  0.559322033898305
F1 score: 0.717391304347826
------------------------------------
Accuracy:  0.9327354260089686
Precision:  1.0
Recall:  0.4915254237288136
F1 score: 0.6590909090909091
------------------------------------
Accuracy:  0.9327354260089686
Precision:  1.0
Recall:  0.4915254237288136
F1 score: 0.65909

Accuracy:  0.9349775784753364
Precision:  1.0
Recall:  0.5084745762711864
F1 score: 0.6741573033707865
------------------------------------
Accuracy:  0.9327354260089686
Precision:  1.0
Recall:  0.4915254237288136
F1 score: 0.6590909090909091
------------------------------------
Accuracy:  0.9348314606741573
Precision:  1.0
Recall:  0.5084745762711864
F1 score: 0.6741573033707865
------------------------------------
Accuracy:  0.9438202247191011
Precision:  1.0
Recall:  0.576271186440678
F1 score: 0.7311827956989247
------------------------------------
Accuracy:  0.9258426966292135
Precision:  1.0
Recall:  0.4406779661016949
F1 score: 0.611764705882353
------------------------------------
final accuracy: 0.9258426966292135
final precision: 1.0
final Recall: 0.4406779661016949
final fscore: 0.611764705882353
------------------------------------
Est: 100 / Depth: 20 ------f1: 0.612 / Accuracy: 0.926 / Precision: 1.0 / Recall: 0.441 
-------------------------------------------------------

looking at the result we can say Est: 50 / Depth: None we're getting better results than other models i.e. f1 score: 0.865 / Accuracy: 0.969 / Precision: 1.0 / Recall: 0.763  , so this are the best parameters for our model.

In [89]:
# Random Forest Classifier
fold=StratifiedKFold(n_splits=10,shuffle=True,random_state=1994)
i=1
for train_index, test_index in fold.split(X_train,y_train):
    accuracy=[]
    precision=[]
    recall=[]
    fscore=[]
        
    X_trainn, X_testt = X_train.iloc[train_index], X_train.iloc[test_index]
    y_trainn, y_testt = y_train.iloc[train_index], y_train.iloc[test_index]
    m=RandomForestClassifier(n_estimators = 50, criterion = 'entropy')
    m.fit(X_trainn,y_trainn)
    preds=m.predict_proba(X_testt)[:,-1]
    y_pred = m.predict(X_testt)
    accuracy.append(accuracy_score(y_testt,y_pred))
    precision.append(precision_score(y_testt,y_pred))
    recall.append(recall_score(y_testt,y_pred))
    fscore.append(f1_score(y_testt, y_pred))
print("final accuracy on train:",np.mean(accuracy))
print("final precision on train:",np.mean(precision))
print("final Recall on train:",np.mean(recall))

final accuracy on train: 0.9752808988764045
final precision on train: 1.0
final Recall on train: 0.8135593220338984


In [90]:
y_pred = m.predict(X_test)

print('Accuracy on test:', accuracy_score(y_test, y_pred))
print('Precision on test:', precision_score(y_test, y_pred))
print('Recall on test:', recall_score(y_test, y_pred))

Accuracy on test: 0.9659192825112107
Precision on test: 1.0
Recall on test: 0.7579617834394905


100% precision means when our model is detecting that given text is spam then it is 100% correct that the message is spam(all mail in your spam box are actually spam ), 75.79% recall means 75.79% of all spam that has came into email was properly placed into spam box (better result than previous model), accuracy 96.50 means all the messages you've received, almost 97% of them are correctly identified as spam or not spam  