# Emails: Spam or not? NLP machine learning model

Importing all necessary libraries.

In [30]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

Importing and cleaning the dataset

In [3]:
data = pd.read_csv("Emails_dataset.tsv", sep='\t')
data.columns = ['label', 'body_text']

In [4]:
data

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


In [5]:
#For Stemming
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [12]:
#Creating a function to count the % of punctuation for creating an additional feature for the model
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

#Creating two ne features
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

In [13]:
#Function to clean the dataset
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

## Data model: To predict which email is spam and which one is not.

In [42]:
#Splitting the dataset into training and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.3)

In [43]:
#Vectorizing the data for training the model with more features
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,6620,6621,6622,6623,6624,6625,6626,6627,6628,6629
0,38,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,84,19.0,0.183507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34,8.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,67,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197524,0.0,0.0


### Building a Random Forest model 

In [44]:
#Running the random forest model on train dataset
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)
rf_model = rf.fit(X_train_vect, y_train)



In [48]:
#Predicting spam/ham using the rf model
y_pred = rf_model.predict(X_test_vect)
precision_rf, recall_rf, fscore_rf, support_rf = score(y_test, y_pred, pos_label='spam', average='binary')



In [49]:
#Displaying model metrics
print('Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision_rf, 3),
                                                        round(recall_rf, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.995 | Recall: 0.813 | Accuracy: 0.973


As it is observed from this model that the accuracy is about 97.3%. That's excellent accuracy but lets try another model to check if we can optimize it on precision. Since in this problem false positives are costlier we will try to get the optimal precision for the model.

### Building a Gradient boosting model

In [51]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)
gb_model = gb.fit(X_train_vect, y_train)



In [52]:
y_pred = gb_model.predict(X_test_vect)



In [53]:
#Displaying model metrics
precision_gb, recall_gb, fscore_gb, train_support_gb = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision_gb, 3),
                                                        round(recall_gb, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.908 | Recall: 0.796 | Accuracy: 0.96


By looking at the results of two models it shows that Random forest model is better in comaprision to gradient boost model. Although we can try a lot more combinations of the parameters and cross validation. 