# Building Machine Learning Classifiers: Model selection

### Read in & clean text

In [1]:
#import needed libraries
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

#call in stopwords and stemmer
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

#read in the data
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

In [2]:
#create a function that counts the number of punctuations in a string
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

#apply function to dataframe and create a column that has the % punctuations in a given text
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

#create a function that cleans the text in a given string
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Split into train/test

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

### Vectorize text

In [4]:
#create a tfidf vectorizer and use the clean_text function to clean the data
#then fit the tfidf vect object to the training data
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

#transform both sets of x_train and x_test to the tfidf vectorizer
tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

#concatenate body_len column and punct% column to the vectorized data
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7129,7130,7131,7132,7133,7134,7135,7136,7137,7138
0,44,13.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,120,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,61,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Models and Models Evaluation

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

  from numpy.core.umath_tests import inner1d


In [9]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start_time = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end_time = time.time()
fit_time = (end_time - start_time)

start_time = time.time()
y_pred = rf_model.predict(X_test_vect)
end_time = time.time()
pred_time = (end_time - start_time)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 4.965 / Predict time: 0.162 ---- Precision: 1.0 / Recall: 0.821 / Accuracy: 0.978


In [10]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start_time = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end_time = time.time()
fit_time = (end_time - start_time)

start_time = time.time()
y_pred = gb_model.predict(X_test_vect)
end_time = time.time()
fit_time = (end_time - start_time)




precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.216 / Predict time: 0.162 ---- Precision: 0.944 / Recall: 0.85 / Accuracy: 0.975


Both models provided accuracy over 97% which is great. However, the time it takes for the model to fit to the data is drastically different. Predict time is exactly the same which then shows that there is a small trade off between the two models. Although it mighth ave taken longer to fit the model to the data, it may be slightly (0.3% more accurate) which in the end might make a big difference if this type of model were to go into production at a large scale. Another thing to notice is the precision and recall have a trade off from both models. The percision for the RFC is at 1.00 which means that everything it was targeted classified was correct and recall was at a 82.1% while the GBC was 94.4% on precision and 85% on recall. This may play a huge factor depending on the which model to use in the end if you would like you model to be more percise or better at generalizing. Overall, both models performed well and predict time is the exact same with minor trade-offs.