# Natural Language Processing
# Machine Learning - Spam Filter Classifier


## Reading in text

In [1]:
#imports and read in of data

import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()


## Helper functions to clean text and count punctuation per message

In [2]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

In [3]:
#function removes puntuation, stopwords; tokenizes and stems words (reduces them to common root word )

def clean_text(text):
    #removes punctuation
    text ="".join([word.lower() for word in text if word not in string.punctuation])
    #splits out each wod
    tokens = re.split('\W+', text)
    #removes stopwords
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    #return cleaned text
    return text


## Cleaning text, counting punctuation and displaying data

In [4]:
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['lable', 'body_text']

data['body_len'] = data['body_text'].apply(lambda x: len(x) -  x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))
data.head()

Unnamed: 0,lable,body_text,body_len,punct%
0,ham,Ok lar... Joking wif u oni...,24,25.0
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,128,4.7
2,ham,U dun say so early hor... U c already then say...,39,15.4
3,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.1
4,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,116,6.9


Split into train and test sets

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['lable'], test_size=0.2)

## Vectorzing text

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7169,7170,7171,7172,7173,7174,7175,7176,7177,7178
0,63,9.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,106,12.3,0.149379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,59,1.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,128,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,32,3.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Evaluation of Random Forest Classifier and Braidient Boosting Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [8]:
## Random Forest

rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

#start timing model fit
start = time.time()

#fitting model
rf_model = rf.fit(X_train_vect, y_train)

#end fit timing
end = time.time()
fit_time = (end - start)

#start prediction timing
start = time.time()

#making prediction
y_pred = rf_model.predict(X_test_vect)

#end time taken to make prediction
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average= 'binary')
print("fit time: {} / Predict time: {} / Presision: {} / Recall: {} / Accuracy: {}".format(
        round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

fit time: 3.538 / Predict time: 0.253 / Presision: 1.0 / Recall: 0.767 / Accuracy: 0.969


In [10]:
## Gradient Boosting Classifier

gb = GradientBoostingClassifier(n_estimators=150, max_depth=None)

#start timing model fit
start = time.time()

#fitting model
gb_model = gb.fit(X_train_vect, y_train)

#end fit timing
end = time.time()
fit_time = (end - start)

#start prediction timing
start = time.time()

#making prediction
y_pred = gb_model.predict(X_test_vect)

#end time taken to make prediction
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average= 'binary')
print("fit time: {} / Predict time: {} / Presision: {} / Recall: {} / Accuracy: {}".format(
        round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

fit time: 321.636 / Predict time: 0.165 / Presision: 0.917 / Recall: 0.813 / Accuracy: 0.965


## Either of the above models would serve adequately, however, Gradient Boosting takes considerably longer to train.