In [1]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Random Forest Model
'''

__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [2]:
import pandas as pd
import re
import nltk
import string
%matplotlib inline

In [3]:
col_name=['label', 'body_text']
data = pd.read_csv('../../data/SMSSpamCollection.tsv', sep='\t', names=col_name)
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [4]:
stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()

In [5]:
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(float(count)/(len(text)-text.count(' ')), 3)*100

In [6]:
data['body_len']=data['body_text'].apply(lambda x:len(x)-x.count(' '))
data['punct%'] =data['body_text'].apply(lambda x:count_punct(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [9]:
def clean_text(text):
    text=''.join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W', text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text
   

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['body_text'])

In [11]:
X_features=pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Model building

In [16]:
from sklearn.ensemble import RandomForestClassifier
(RandomForestClassifier())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
print(dir(RandomForestClassifier))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_estimator_type', '_get_param_names', '_make_estimator', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']


RF with cross validation

In [18]:
from sklearn.model_selection import KFold, cross_val_score

In [19]:
rf=RandomForestClassifier(n_jobs=-1)
kfold=KFold(n_splits=5)
cross_val_score(rf, X_features, data['label'], cv=kfold, scoring='accuracy', n_jobs=-1)

array([ 0.96588869,  0.96947935,  0.97217235,  0.96226415,  0.96585804])

RF with holdout set

In [24]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test=train_test_split(X_features, data['label'], test_size=0.2)

In [26]:
rf=RandomForestClassifier(n_jobs=-1, n_estimators=50, max_depth=20)
rf_model=rf.fit(X_train, y_train)

In [28]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.049339399848869231, 'body_len'),
 (0.044535186752572023, 7353),
 (0.0421307878317164, 4799),
 (0.036175444133139584, 3135),
 (0.027556941739472839, 2032),
 (0.022189830668703888, 6288),
 (0.01873246487300528, 5991),
 (0.01583623995796745, 1804),
 (0.01535315261182919, 5727),
 (0.014632474227928684, 6749),
 (0.013863047100451968, 7464),
 (0.01317112173968152, 354),
 (0.01316629830191905, 5081),
 (0.012298263638021532, 3446),
 (0.0118661504039338, 7030),
 (0.011050271886957898, 0),
 (0.010857848409310644, 1361),
 (0.010698098238936771, 392),
 (0.010559153537368466, 7593),
 (0.010407989608483034, 1882),
 (0.0098873061516185964, 2082),
 (0.0098849259648414161, 2096),
 (0.0094071013999592161, 2620),
 (0.0093425490801285621, 2172),
 (0.0091383802690012863, 294),
 (0.0087286205732194592, 'punct%'),
 (0.008025908967688131, 397),
 (0.0079123787388168283, 7221),
 (0.007394498849739195, 5878),
 (0.0073787298870239479, 375),
 (0.0072571993879945321, 295),
 (0.0068885531702146952, 690),
 (0.006

In [29]:
y_pred=rf_model.predict(X_test)
precision, recall, fscore, support=score(y_test, y_pred, pos_label='spam', average='binary')

In [30]:
accuracy=(y_pred==y_test).sum()/len(y_pred)

In [32]:
print('precision: {} / Recall:{}/ Accuracy:{}'.format(
    round(precision, 3), round(recall, 3), round(accuracy,3)))

precision: 1.0 / Recall:0.65/ Accuracy:0.95


precision =1 means whatever  we predicted as spam are actually spam

Recall =0.65 means 65% of total spam are figured out rest are comes to y
inbox which means model is not that great

# Grid Search

To optimize the hyper parameters of RF further, we use grid search

In [33]:
# to optimize no of tree (n_estimators), depth of tree(max_depth)
def train_RF(n_est, depth):
    rf=RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model=rf.fit(X_train, y_train)
    y_pred=rf_model.predict(X_test)
    precision, recall, fscore, support=score(y_test, y_pred, pos_label='spam', average='binary')
    accuracy=(y_pred==y_test).sum()/len(y_pred)
    print('Est:{} / Depth:{} ---- precision: {} / Recall:{}/ Accuracy:{}'.format(
        n_est, depth, round(precision, 3), round(recall, 3), round(accuracy,3)))

In [34]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_RF(n_est,depth)   

Est:10 / Depth:10 ---- precision: 1.0 / Recall:0.231/ Accuracy:0.89
Est:10 / Depth:20 ---- precision: 1.0 / Recall:0.606/ Accuracy:0.943
Est:10 / Depth:30 ---- precision: 0.992 / Recall:0.731/ Accuracy:0.961
Est:10 / Depth:None ---- precision: 0.977 / Recall:0.812/ Accuracy:0.97
Est:50 / Depth:10 ---- precision: 1.0 / Recall:0.256/ Accuracy:0.893
Est:50 / Depth:20 ---- precision: 1.0 / Recall:0.644/ Accuracy:0.949
Est:50 / Depth:30 ---- precision: 1.0 / Recall:0.725/ Accuracy:0.961
Est:50 / Depth:None ---- precision: 0.993 / Recall:0.862/ Accuracy:0.979
Est:100 / Depth:10 ---- precision: 1.0 / Recall:0.225/ Accuracy:0.889
Est:100 / Depth:20 ---- precision: 1.0 / Recall:0.662/ Accuracy:0.952
Est:100 / Depth:30 ---- precision: 1.0 / Recall:0.738/ Accuracy:0.962
Est:100 / Depth:None ---- precision: 0.993 / Recall:0.838/ Accuracy:0.976


As we increase max depth Recall rate is increaing significantly but est doesn't have that much impact.
In general as we increase max_depth value ,model will be better. 

# GridSearchCV
Combining GridSeach with cross validation to make model more robust

we also check which vectorizer method work better TFIDF

or count_vectorizer  using gridsearch CV

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['body_text'])
X_tfidf_features=pd.concat([data['body_len'], data['punct%'], 
                            pd.DataFrame(X_tfidf.toarray())], axis=1)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count=count_vect.fit_transform(data['body_text'])
X_count_features=pd.concat([data['body_len'], data['punct%'], 
                            pd.DataFrame(X_tfidf.toarray())], axis=1)

In [39]:
from sklearn.model_selection import GridSearchCV

In [None]:
rf=RandomForestClassifier()
param={'n_estimators':[10, 150,300],
      'max_depth':[30,60,90, None]}
gs=GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit=gs.fit(X_tfidf_features, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

In [None]:
rf=RandomForestClassifier()
param={'n_estimators':[10, 150,300],
      'max_depth':[30,60,90, None]}
gs=GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit=gs.fit(X_tfidf_count, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]