In [1]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
NLP model selection
'''

__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [2]:
import pandas as pd
import re
import nltk
import string
%matplotlib inline

In [6]:
col_name=['label', 'body_text']
data = pd.read_csv('../../SMSSpamCollection.tsv', sep='\t', names=col_name)
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [7]:
stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()

In [8]:
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(float(count)/(len(text)-text.count(' ')), 3)*100

In [9]:
data['body_len']=data['body_text'].apply(lambda x:len(x)-x.count(' '))
data['punct%'] =data['body_text'].apply(lambda x:count_punct(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [10]:
data['body_len']=data['body_text'].apply(lambda x:len(x)-x.count(' '))
data['punct%'] =data['body_text'].apply(lambda x:count_punct(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In this we first split the train, test then applying vectorizer

In [11]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test=train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

In [19]:
def clean_text(text):
    text=''.join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W', text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit=tfidf_vect.fit(X_train['body_text'])
tfidf_train=tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test=tfidf_vect_fit.transform(X_test['body_text'])


In [26]:
X_train_vect=pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect=pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)
X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7108,7109,7110,7111,7112,7113,7114,7115,7116,7117
0,100,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45,15.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,42.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,121,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [27]:
rf=RandomForestClassifier(n_jobs=-1, n_estimators=150, max_depth=None)

start=time.time()
rf_model=rf.fit(X_train_vect, y_train)
end=time.time()
fit_time=(end-start)

start=time.time()
y_pred=rf_model.predict(X_test_vect)
end=time.time()
pred_time=(end-start)

precision, recall, fscore, support=score(y_test, y_pred, pos_label='spam', average='binary')
accuracy=(y_pred==y_test).sum()/len(y_pred)
print('fit time: {} / pred time: {} /precision: {} / Recall:{}/ Accuracy:{}'.format(
    round(fit_time,3), round(pred_time,3), round(precision, 3), round(recall, 3), round(accuracy,3)))

fit time: 7.968 / pred time: 0.336 /precision: 1.0 / Recall:0.846/ Accuracy:0.0


In [28]:
gb=GradientBoostingClassifier( n_estimators=150, max_depth=11)

start=time.time()
gb_model=gb.fit(X_train_vect, y_train)
end=time.time()
fit_time=(end-start)

start=time.time()
y_pred=gb_model.predict(X_test_vect)
end=time.time()
pred_time=(end-start)

precision, recall, fscore, support=score(y_test, y_pred, pos_label='spam', average='binary')
accuracy=(y_pred==y_test).sum()/len(y_pred)
print('fit time: {} / pred time: {} /precision: {} / Recall:{}/ Accuracy:{}'.format(
    round(fit_time,3), round(pred_time,3), round(precision, 3), round(recall, 3), round(accuracy,3)))

fit time: 326.991 / pred time: 0.156 /precision: 0.948 / Recall:0.858/ Accuracy:0.0
