### Importing Libraries and Data

In [1]:
import nltk
#nltk.download()


In [2]:
import pandas as pd
dataNlp = pd.read_csv('SMSSpamCollection.tsv', sep='\t', names=['label','body_text'], header=None)
dataNlp.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
dataNlp['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

### Preprocessing Data

In [4]:
### Q1. Preprocess the data so that stopwords are removed
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words('english')
import string
import re
from nltk.tokenize import word_tokenize
wnl = nltk.WordNetLemmatizer()

def processData(txt):
    dataNlp['noPunct'] = txt.apply(lambda x: "".join([char.lower() for char in x if char not in string.punctuation]))
    #dataNlp['tokenized'] = dataNlp.noPunct.apply(lambda x: re.split('\W+', x))
    dataNlp['tokenized'] = dataNlp.noPunct.apply(lambda x: word_tokenize(x))
    dataNlp['noStopwords'] = dataNlp.tokenized.apply(lambda x: [word for word in x if word not in stopword])
    dataNlp['lemmatized'] = dataNlp.noStopwords.apply(lambda x: [wnl.lemmatize(word) for word in x])
    return dataNlp.lemmatized
    
processData(dataNlp.body_text)
dataNlp['body_len'] = dataNlp['body_text'].apply(lambda x: len(x) - x.count(" "))
dataNlp.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\comaq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\comaq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\comaq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,body_text,noPunct,tokenized,noStopwords,lemmatized,body_len
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, searching, right, word, thank, breather,...",160
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",128
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",49
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...",62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]",28


In [5]:
### Q2. Write down a function that can count percentage of punctuation marks in the text

def countPunct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

dataNlp['punct%'] = dataNlp.body_text.apply(lambda x: countPunct(x))
dataNlp.head()

Unnamed: 0,label,body_text,noPunct,tokenized,noStopwords,lemmatized,body_len,punct%
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, searching, right, word, thank, breather,...",160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",49,4.1
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...",62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]",28,7.1


In [6]:
cleanNlp = dataNlp[['label','lemmatized']]
cleanNlp.head()

Unnamed: 0,label,lemmatized
0,ham,"[ive, searching, right, word, thank, breather,..."
1,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
2,ham,"[nah, dont, think, go, usf, life, around, though]"
3,ham,"[even, brother, like, speak, treat, like, aid,..."
4,ham,"[date, sunday]"


### Split into train/test

In [7]:
### Q3. Split the whole data set into training and test datasets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataNlp.body_text, dataNlp.label, test_size=0.2, random_state=0) 


### Vectorize text

In [8]:
## Q4. For the test and train datasets create a TfIdfVectorizer
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    #noStop = [word for word in tokens if word not in stopword]
    #lem = [wnl.lemmatize(word) for word in noStop]
    lem = [ps.stem(word) for word in tokens if word not in stopword]
    return lem

from sklearn.feature_extraction.text import TfidfVectorizer
tfVect = TfidfVectorizer(analyzer=clean_text)
xtfTrain = tfVect.fit_transform(X_train)
xtfTest = tfVect.fit_transform(X_test)
xtfTrain.shape

(4454, 7129)

In [9]:
xtfTrainDf = pd.concat([dataNlp['body_len'], dataNlp['punct%'], pd.DataFrame(xtfTrain.toarray())], axis=1)

In [10]:
xtfTestDf = pd.concat([dataNlp['body_len'], dataNlp['punct%'], pd.DataFrame(xtfTest.toarray())], axis=1)

In [11]:
xtfTestDf.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,3167,3168,3169,3170,3171,3172,3173,3174,3175,3176
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.141908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Final evaluation of models

In [None]:
## Q5. Create a Ensemble classifier that can predict if the given Text is a Spam or a Ham 

In [13]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(xtfTrainDf, dataNlp['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [12]:
## Q6. Evaluate the performance of your model using confusion matrix