### Importing Libraries and Data

In [1]:
import nltk
#nltk.download()


In [2]:
import pandas as pd
dataNlp = pd.read_csv('SMSSpamCollection.tsv', sep='\t', names=['label','body_text'], header=None)
dataNlp.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
dataNlp['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

### Preprocessing Data

In [4]:
### Q1. Preprocess the data so that stopwords are removed
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words('english')
import string
import re
from nltk.tokenize import word_tokenize
wnl = nltk.WordNetLemmatizer()

def processData(txt):
    dataNlp['noPunct'] = txt.apply(lambda x: "".join([char.lower() for char in x if char not in string.punctuation]))
    #dataNlp['tokenized'] = dataNlp.noPunct.apply(lambda x: re.split('\W+', x))
    dataNlp['tokenized'] = dataNlp.noPunct.apply(lambda x: word_tokenize(x))
    dataNlp['noStopwords'] = dataNlp.tokenized.apply(lambda x: [word for word in x if word not in stopword])
    dataNlp['lemmatized'] = dataNlp.noStopwords.apply(lambda x: [wnl.lemmatize(word) for word in x])
    return dataNlp.lemmatized
    
processData(dataNlp.body_text)
dataNlp['body_len'] = dataNlp['body_text'].apply(lambda x: len(x) - x.count(" "))
dataNlp.head()

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,body_text,noPunct,tokenized,noStopwords,lemmatized,body_len
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, searching, right, word, thank, breather,...",160
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",128
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",49
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...",62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]",28


In [5]:
### Q2. Write down a function that can count percentage of punctuation marks in the text

def countPunct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

dataNlp['punct%'] = dataNlp.body_text.apply(lambda x: countPunct(x))
dataNlp.head()

Unnamed: 0,label,body_text,noPunct,tokenized,noStopwords,lemmatized,body_len,punct%
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, searching, right, word, thank, breather,...",160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",49,4.1
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...",62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]",28,7.1


In [6]:
cleanNlp = dataNlp[['label','body_text','body_len','punct%']]
cleanNlp.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


### Split into train/test

In [7]:
cleanNlp.drop(['body_text'],axis=1)

Unnamed: 0,label,body_len,punct%
0,ham,160,2.5
1,spam,128,4.7
2,ham,49,4.1
3,ham,62,3.2
4,ham,28,7.1
...,...,...,...
5563,spam,131,6.1
5564,ham,29,3.4
5565,ham,48,14.6
5566,ham,100,1.0


In [8]:
dataNlp.columns

Index(['label', 'body_text', 'noPunct', 'tokenized', 'noStopwords',
       'lemmatized', 'body_len', 'punct%'],
      dtype='object')

In [9]:
cleanNlp

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
...,...,...,...,...
5563,spam,This is the 2nd time we have tried 2 contact u...,131,6.1
5564,ham,Will ü b going to esplanade fr home?,29,3.4
5565,ham,"Pity, * was in mood for that. So...any other s...",48,14.6
5566,ham,The guy did some bitching but I acted like i'd...,100,1.0


In [10]:
## Q4. For the test and train datasets create a TfIdfVectorizer
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    #noStop = [word for word in tokens if word not in stopword]
    #lem = [wnl.lemmatize(word) for word in noStop]
    lem = [ps.stem(word) for word in tokens if word not in stopword]
    return lem

from sklearn.feature_extraction.text import TfidfVectorizer
tfVect = TfidfVectorizer(analyzer=clean_text)
xtfTrain = tfVect.fit_transform(cleanNlp.body_text)
#xtfTest = tfVect.fit_transform(X_test)
X = pd.DataFrame(xtfTrain.toarray(), columns = tfVect.get_feature_names())
X

Unnamed: 0,Unnamed: 1,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335215,0.0,0.0
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [15]:
X4=pd.concat([cleanNlp[['label','body_len','punct%']], X], axis=1)
X4

Unnamed: 0,label,body_len,punct%,Unnamed: 4,0,008704050406,0089mi,0121,01223585236,01223585334,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,ham,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,spam,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,ham,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,ham,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,ham,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,spam,131,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5564,ham,29,3.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335215,0.0,0.0
5565,ham,48,14.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5566,ham,100,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [16]:
### Q3. Split the whole data set into training and test datasets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X4.drop(['label'],axis=1), X4.label, test_size=0.2, random_state=0) 

In [22]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
param = {'n_estimators': [10, 15],# 30],
        'max_depth': [30, 40]}#, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search

In [23]:
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,1.848871,0.052885,0.1795,0.002275,40,15,"{'max_depth': 40, 'n_estimators': 15}",0.964126,0.969731,0.961798,0.970787,0.960674,0.965424,0.004115,1
2,2.253623,0.882352,0.185099,0.009771,40,10,"{'max_depth': 40, 'n_estimators': 10}",0.959641,0.970852,0.965169,0.962921,0.965169,0.964751,0.003664,2
1,1.960176,0.040513,0.194725,0.007475,30,15,"{'max_depth': 30, 'n_estimators': 15}",0.959641,0.960762,0.959551,0.960674,0.964045,0.960934,0.001634,3
0,1.841823,0.052184,0.220372,0.040212,30,10,"{'max_depth': 30, 'n_estimators': 10}",0.956278,0.964126,0.958427,0.961798,0.962921,0.960709,0.00292,4



### Vectorize text

In [None]:
## Q4. For the test and train datasets create a TfIdfVectorizer
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    #noStop = [word for word in tokens if word not in stopword]
    #lem = [wnl.lemmatize(word) for word in noStop]
    lem = [ps.stem(word) for word in tokens if word not in stopword]
    return lem

from sklearn.feature_extraction.text import TfidfVectorizer
tfVect = TfidfVectorizer(analyzer=clean_text)
xtfTrain = tfVect.fit_transform(X_train)
xtfTest = tfVect.fit_transform(X_test)
xtfTrain.shape

In [None]:
xtfTrainDf = pd.concat([dataNlp['body_len'], dataNlp['punct%'], pd.DataFrame(xtfTrain.toarray())], axis=1)

In [None]:
xtfTestDf = pd.concat([dataNlp['body_len'], dataNlp['punct%'], pd.DataFrame(xtfTest.toarray())], axis=1)

In [None]:
xtfTestDf.head()

### Final evaluation of models

In [None]:
## Q5. Create a Ensemble classifier that can predict if the given Text is a Spam or a Ham 

In [None]:
## Q6. Evaluate the performance of your model using confusion matrix