# Building Machine Learning Classifiers: Random Forest

In [1]:
from __future__ import division

import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

### Read the data file and prepare columns

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMS_Spam_Collection.tsv", sep='\t')
data.columns = ['label', 'body_text']

### Punctuation percentage in body

In [3]:
def count_punct(text):
    count = len(re.findall(r"[" + string.punctuation + "]", text))
    return round(count/(len(text.strip())), 3)*100

### Clean the text

In [4]:
def clean_text(text):
    text = unicode(text, errors='ignore')
    text = re.sub(r"[" + string.punctuation + "]", '', text).lower()
    tokens = nltk.word_tokenize(text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return ''.join(text)

### Prepare TF-IDF vector and feature set

In [5]:
data['body_len'] = data['body_text'].apply(lambda x: len(x.strip()))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Prepare the important feature columns by concatenating on columns
X_Tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_features = pd.concat([data['body_len'], data['punct%'], X_Tfidf_df], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,27,28,29,30,31,32,33,34,35,36
0,155,3.9,0.357434,0.354501,0.331104,0.0,0.072811,0.225947,0.0,0.163438,...,0.098201,0.172102,0.091919,0.349997,0.068199,0.097783,0.07612,0.128023,0.04359,0.0
1,61,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.098366,0.105074,0.276984,0.350817,0.167667,0.0,0.0,0.0,0.0
2,77,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.248868,0.088613,0.389319,0.0,0.1414,0.0,0.0,0.0,0.0
3,35,5.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.265078,0.232922,0.295009,0.0,0.0,0.0,0.377118,0.0
4,160,3.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.129055,0.33926,0.161065,0.247671,0.358503,0.064253,0.0,0.0,0.0,0.0


### Explore RandomForestClassifier through Holdout Set

In [6]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

### Split the corpus in to train and test dataset

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

### Prepare the random forest model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# 50 decision trees with depth upto 20
# parallel processing enabled as each Decision tree is independent of other
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

### Check which feature column has got highest decisive factor

In [9]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.23434137475129863, 0),
 (0.11476737580099897, 1),
 (0.08531077708944426, 8),
 (0.06846446454507553, 5),
 (0.06386142124259613, 6),
 (0.058090421728381206, 9),
 (0.05238152203839928, 7),
 (0.030551781174734952, 4),
 (0.026128132711978697, 'body_len'),
 (0.024480838696842446, 2)]

### Predict the output by using test dataset

In [10]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

In [11]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                         round(recall, 3),
                                                         round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.959 / Recall: 0.881 / Accuracy: 0.978
