# Building Machine Learning Classifiers: Random Forest on a holdout test set

### Read in & clean text

In [15]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

#TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

#Count Vectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore RandomForestClassifier through Holdout Set

In [16]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, GridSearchCV

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, 
                            n_estimators=50,
                            max_depth=20
                           )
rf_model = rf.fit(X_train, y_train)

In [5]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.05199014537757954, 1803),
 (0.04090862832384518, 'body_len'),
 (0.03366259982109352, 3134),
 (0.02317670205873484, 7350),
 (0.022976561813771886, 4796),
 (0.021577449153920965, 5724),
 (0.017855200989950297, 2031),
 (0.017608404529609704, 6746),
 (0.017399681400886158, 7782),
 (0.016451760352450998, 2171),
 (0.01550982909218497, 1361),
 (0.014165421141110328, 397),
 (0.013944865658424272, 295),
 (0.012491788420677528, 690),
 (0.010939566006462571, 392),
 (0.010861086701957824, 5988),
 (0.010834128792908564, 436),
 (0.010626087980274514, 7543),
 (0.010613676886963361, 2299),
 (0.010583847087089872, 6285),
 (0.010553973015936908, 5453),
 (0.010042824688850573, 3443),
 (0.00945980953128109, 354),
 (0.009332151670672934, 294),
 (0.008971483030548576, 7218),
 (0.008664347120273343, 611),
 (0.00860909047881913, 5078),
 (0.008513668570590233, 7590),
 (0.008277837484125502, 1941),
 (0.008189672997723265, 4269),
 (0.007609300380908793, 7461),
 (0.0073615669869203605, 7027),
 (0.006994545572

In [6]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

In [7]:
print ('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                          round(recall, 3),
                                                          round((y_pred==y_test).sum()/len(y_pred), 3)
                                                          ))

Precision: 1.0 / Recall: 0.525 / Accuracy: 0.933


Implementing Grid Search Manually

In [13]:
def train_RF(n_est, depth):
    rf1 = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf1_model = rf1.fit(X_train, y_train)
    y_pred = rf1_model.predict(X_test)
    precision, recall, fscore, support= score(y_test, y_pred, pos_label='spam', average='binary')
    print ('Est.: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, 
        depth, 
        round(precision, 3),
        round(recall, 3),
        round((y_pred==y_test).sum()/len(y_pred), 3)
    ))

In [14]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est.: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.228 / Accuracy: 0.89
Est.: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.589 / Accuracy: 0.942
Est.: 10 / Depth: 30 ---- Precision: 1.0 / Recall: 0.671 / Accuracy: 0.953
Est.: 10 / Depth: None ---- Precision: 0.992 / Recall: 0.791 / Accuracy: 0.969
Est.: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.209 / Accuracy: 0.888
Est.: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.551 / Accuracy: 0.936
Est.: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.658 / Accuracy: 0.952
Est.: 50 / Depth: None ---- Precision: 1.0 / Recall: 0.772 / Accuracy: 0.968
Est.: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.215 / Accuracy: 0.889
Est.: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.532 / Accuracy: 0.934
Est.: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.671 / Accuracy: 0.953
Est.: 100 / Depth: None ---- Precision: 1.0 / Recall: 0.791 / Accuracy: 0.97


Applying GridSearchCV for hyperparameter tuning

In [19]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
         'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
11,34.605725,0.459516,0.973954,1.0,,300,"{'max_depth': None, 'n_estimators': 300}",1,0.976682,1.0,...,0.977538,1.0,0.968553,1.0,0.973046,1.0,4.894188,0.116247,0.003171,0.0
8,37.25083,0.606872,0.973594,0.999326,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",2,0.979372,0.999551,...,0.972147,0.999102,0.968553,0.999551,0.971249,0.999102,1.64517,0.077617,0.003891,0.000201
7,21.215421,0.39424,0.973415,0.999237,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",3,0.980269,0.999326,...,0.972147,0.999326,0.97035,0.999326,0.97035,0.999102,0.617557,0.039848,0.00368,0.00011
5,33.029916,0.620599,0.972337,0.993982,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",4,0.978475,0.994385,...,0.971249,0.995061,0.967655,0.993938,0.969452,0.993938,0.876449,0.132181,0.003884,0.000808
10,21.307609,0.470395,0.972337,1.0,,150,"{'max_depth': None, 'n_estimators': 150}",4,0.977578,1.0,...,0.973944,1.0,0.966757,1.0,0.971249,1.0,0.631862,0.075433,0.003534,0.0


In [20]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
         'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
8,40.642415,0.668842,0.975031,0.999326,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",1,0.979372,0.999326,...,0.975741,0.999102,0.969452,0.999775,0.975741,0.998877,2.420369,0.071214,0.003195,0.000318
7,22.887506,0.527546,0.973954,0.999282,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",2,0.978475,0.999551,...,0.973944,0.999551,0.966757,0.998877,0.973046,0.999102,1.76857,0.202324,0.004145,0.000262
11,45.44195,0.519678,0.973954,1.0,,300,"{'max_depth': None, 'n_estimators': 300}",2,0.976682,1.0,...,0.975741,1.0,0.966757,1.0,0.972147,1.0,9.475327,0.146532,0.004142,0.0
10,25.808788,0.633799,0.973594,1.0,,150,"{'max_depth': None, 'n_estimators': 150}",4,0.976682,1.0,...,0.977538,1.0,0.968553,1.0,0.972147,1.0,0.541772,0.204343,0.003252,0.0
4,15.324336,0.340244,0.971978,0.993623,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",5,0.975785,0.994609,...,0.971249,0.993714,0.967655,0.993489,0.97035,0.993938,0.389972,0.054211,0.002986,0.000732


In [21]:
from sklearn.ensemble import GradientBoostingClassifier

** Build our own Grid-Search ** 

In [27]:
def train_GB(n_est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support= score(y_test, y_pred, pos_label='spam', average='binary')
    print ('Est.: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, 
        max_depth, 
        lr,
        round(precision, 3),
        round(recall, 3),
        round((y_pred==y_test).sum()/len(y_pred), 3)
    ))

In [28]:
for n_est in [50, 100, 150]:
    for max_depth in [3,7,11,15]:
        for lr in [0.1, 1]:
            train_GB(n_est, depth, lr)

Est.: 50 / Depth: None / LR: 0.1 ---- Precision: 0.906 / Recall: 0.851 / Accuracy: 0.969
Est.: 50 / Depth: None / LR: 1 ---- Precision: 0.901 / Recall: 0.858 / Accuracy: 0.969


KeyboardInterrupt: 