In [1]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Gradient boosting with NLP
'''

__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [3]:
import pandas as pd
import re
import nltk
import string
%matplotlib inline

In [2]:
# Column labels are supplied explicitly, so pandas reads every line of the
# tab-separated file as data rather than treating the first line as a header.
col_name = ['label', 'body_text']
data = pd.read_csv(
    '../../data/SMSSpamCollection.tsv',
    sep='\t',
    names=col_name,
)
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
# English stop-word list used by the text cleaner to drop tokens.
stopwords = nltk.corpus.stopwords.words('english')
# Shared Porter stemmer instance used by the text cleaner.
ps = nltk.PorterStemmer()

In [4]:
def count_punct(text):
    """Return the share of non-space characters in ``text`` that are punctuation.

    Parameters
    ----------
    text : str
        Message body to inspect.

    Returns
    -------
    float
        ``punctuation count / non-space character count``, rounded to three
        decimals and then multiplied by 100. Returns ``0.0`` when ``text``
        contains no non-space characters (empty or all spaces), because the
        ratio is undefined there.
    """
    non_space_len = len(text) - text.count(' ')
    if non_space_len == 0:
        # Guard against ZeroDivisionError on empty / space-only messages.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    # float() keeps this a true division on Python 2 as well.
    return round(float(count) / non_space_len, 3) * 100

In [5]:
# Engineered feature 1: message length with spaces excluded.
data['body_len'] = data['body_text'].apply(lambda body: len(body) - body.count(' '))
# Engineered feature 2: punctuation share of each message, via count_punct.
data['punct%'] = data['body_text'].apply(count_punct)
data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [6]:
def clean_text(text):
    """Turn a raw message into a list of stemmed, stop-word-free tokens.

    Steps: drop every character found in ``string.punctuation``, lower-case
    the remainder, split on runs of non-word characters, discard empty tokens
    and entries of the module-level ``stopwords`` list, then stem each
    remaining token with the module-level ``ps`` stemmer.

    Parameters
    ----------
    text : str
        Raw message body.

    Returns
    -------
    list
        Stemmed tokens in their original order.
    """
    no_punct = ''.join(char.lower() for char in text if char not in string.punctuation)
    # A raw string avoids the invalid-escape warning, and the "+" collapses
    # consecutive separators so they do not produce empty-string tokens.
    tokens = re.split(r'\W+', no_punct)
    # `word and ...` also drops the empty tokens left by leading/trailing whitespace.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]
   

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# clean_text is passed as the analyzer, so the vectorizer uses the token list
# it returns for each message.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
# Learn the vocabulary / IDF weights and build the document-term matrix in one step.
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

In [8]:
# Place the two engineered columns next to the dense TF-IDF matrix.
X_features = pd.concat(
    [data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())],
    axis=1,
)
# The TF-IDF frame carries integer column labels while the engineered columns
# are strings; recent scikit-learn releases refuse to fit on a frame whose
# labels mix types, so every label is normalised to str.
X_features.columns = X_features.columns.astype(str)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8091,8092,8093,8094,8095,8096,8097,8098,8099,8100
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.ensemble import GradientBoostingClassifier

In [10]:
# Exploratory: list the attribute and method names exposed by the class.
print(dir(GradientBoostingClassifier))

['_SUPPORTED_LOSS', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__getstate__', '__hash__', '__init__', '__iter__', '__len__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_initialized', '_check_params', '_clear_state', '_decision_function', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_init_decision_function', '_init_state', '_is_initialized', '_make_estimator', '_resize_state', '_staged_decision_function', '_validate_estimator', '_validate_y', 'apply', 'decision_function', 'feature_importances_', 'fit', 'get_params', 'n_features', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params', 'staged_decision_function', 'staged_predict', 'staged_predict_proba']


In [11]:
# Printing a default-constructed estimator shows its default hyper-parameters.
print(GradientBoostingClassifier())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


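Here `learning_rate=0.1` shrinks the contribution of each tree, i.e. it controls how quickly the ensemble moves toward the optimum (not how fast the code executes). There is a trade-off between `learning_rate` and `n_estimators`.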

# Grid Search

In [12]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# itself reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42)

In [14]:
def train_GB(est, depth, lr):
    """Fit a gradient-boosting classifier and print its hold-out metrics.

    Trains on the module-level ``X_train`` / ``y_train`` and evaluates on the
    module-level ``X_test`` / ``y_test``, treating ``'spam'`` as the positive
    class.

    Parameters
    ----------
    est : int
        Passed to the classifier as ``n_estimators``.
    depth : int
        Passed to the classifier as ``max_depth``.
    lr : float
        Passed to the classifier as ``learning_rate``.

    Returns
    -------
    None
        One summary line (settings, precision, recall, accuracy) is printed.
    """
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    # F-score and support are returned as well but are not reported here.
    precision, recall, _, _ = score(y_test, y_pred, pos_label='spam', average='binary')
    # mean() of the boolean match vector is the accuracy and avoids the
    # int / int division that truncates to 0 on Python 2.
    accuracy = (y_pred == y_test).mean()
    # Report the function's own arguments (not a global loop variable), and
    # include the learning rate so runs that differ only in lr can be told apart.
    print('Est:{} / Depth:{} / LR:{} ---- precision: {} / Recall:{}/ Accuracy:{}'.format(
        est, depth, lr, round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
# Exhaustive manual grid: 3 estimator counts x 4 depths x 3 learning rates,
# i.e. 36 calls to train_GB, each of which fits and evaluates one model.
for n_est in [50,100,150]:
    for depth in [3,7,11,15]:
        for lr in [0.01,0.1,1]:
            train_GB(n_est, depth, lr)

# Grid Search CV

Combining grid search with cross-validation makes the model evaluation more robust.

We also check which vectorizer works better, TF-IDF (`TfidfVectorizer`)

or raw counts (`CountVectorizer`), using `GridSearchCV`.


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of every message, tokenised by clean_text.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_features = pd.concat(
    [data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())],
    axis=1,
)
# The vectorizer columns are labelled with integers and the engineered ones
# with strings; recent scikit-learn releases reject mixed label types, so
# every label is converted to str.
X_tfidf_features.columns = X_tfidf_features.columns.astype(str)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term-count representation of every message, tokenised by clean_text.
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
# Build the frame from X_count (not X_tfidf), so the count-based feature set
# really contains the CountVectorizer output and the later comparison between
# the two vectorizers is meaningful.
X_count_features = pd.concat(
    [data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())],
    axis=1,
)
# The vectorizer columns are labelled with integers and the engineered ones
# with strings; recent scikit-learn releases reject mixed label types, so
# every label is converted to str.
X_count_features.columns = X_count_features.columns.astype(str)

In [17]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 5-fold cross-validated grid search over the TF-IDF feature set:
# 3 estimator counts x 3 depths x 1 learning rate, using all available cores.
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150, 300],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1],
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_features, data['label'])
# Show the five best parameter combinations by mean test score.
pd.DataFrame(gs_fit.cv_results_).sort_values(by='mean_test_score', ascending=False).head(5)

In [None]:
# 5-fold cross-validated grid search over the count-vectorizer feature set:
# 3 estimator counts x 3 depths x 1 learning rate, using all available cores.
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150, 300],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1],
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_features, data['label'])
# Show the five best parameter combinations by mean test score.
pd.DataFrame(gs_fit.cv_results_).sort_values(by='mean_test_score', ascending=False).head(5)