Imported the neccessary libraries.

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re 
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate ,KFold, cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Set the stopwords to the stopwords in the english language.

In [2]:
stopwords = stopwords.words('english')

Imported the data from a tab separated file containing the different emails marked as spam or ham and instantiated an object from the PorterStemmer library.

In [3]:
data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)
data.columns = ['label', 'body_text']
pn = PorterStemmer()

Created a function which will take in the text in the email and remove all punctuation, splits each word and removes whitespace, stems each word and removes all stopwords from the text in each email.

In [11]:
def clean_text(text: list) -> list:
    text = ''.join([i for i in text if i not in string.punctuation])
    tokenize = re.split('\W+',string=text)
    text = ' '.join([pn.stem(i) for i in tokenize if i not in stopwords])
    return text

Creates new column which applies the function above to the body_text column in the dataset.


In [5]:

data['clean_data'] = data['body_text'].apply(lambda x : clean_text(x))

Shows the new dataset

In [6]:
data.head()

Unnamed: 0,label,body_text,clean_data
0,ham,I've been searching for the right words to tha...,ive search right word thank breather i promis ...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. ...,even brother like speak they treat like aid pa...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will


In this cell two new additional columns are added to the dataset, a column showing the amount of characters in each email is shown and another column showing what percent is punctuation in the body_text column is displayed.

In [12]:
data['body_len'] = data['body_text'].apply(lambda x : len(x) - x.count(' '))

def punc(text: list) -> list:
    count = sum([1 for char in text if char in string.punctuation])
    return round((count / (len(text) - text.count(' '))),3) * 100 

data['punc_%'] = data['body_text'].apply(lambda x: punc(x))

The new data set is shown below with the addition of two columns.

In [8]:
data.head()

Unnamed: 0,label,body_text,clean_data,body_len,punc_%
0,ham,I've been searching for the right words to tha...,ive search right word thank breather i promis ...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think goe usf live around though,49,4.1
3,ham,Even my brother is not like to speak with me. ...,even brother like speak they treat like aid pa...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will,28,7.1


A count vectorizer was applied to the clean_data and this section stored in the X_counts variable, was concatenated with the body length and punctuation percentage column.

In [9]:
tf_vect = CountVectorizer(analyzer=clean_text)
X_counts = tf_vect.fit_transform(data['clean_data'])
x = pd.DataFrame(X_counts.toarray())
X_features = pd.concat([data['body_len'],data['punc_%'], x],axis=1)
X_features

Unnamed: 0,body_len,punc_%,0,1,2,3,4,5,6,7,...,34,35,36,37,38,39,40,41,42,43
0,160,2.5,15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,22,5,5,5,0,1,3,0,...,2,1,0,0,0,0,0,0,0,0
2,49,4.1,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,131,6.1,21,5,3,5,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
5564,29,3.4,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5565,48,14.6,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5566,100,1.0,13,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,0


A Random Forest Classifier model was applied to the features in the X_features column and label(spam vs ham). The 
classification report is shown below which gives different measurements on how reliable the model is. A grid search was employed in order to optimize the model. Different estimators were tested as well as different values for the depth.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

def train_RF(n_est: int,dep: int):
    rf_n = RandomForestClassifier(n_estimators=n_est, max_depth=dep,n_jobs=-1)
    model = rf_n.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    n = classification_report(y_test,y_pred)
    print(n)

for n_est in [10,50,100]:
    for dep in [10,20,30]:
        train_RF(n_est,dep)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       964
        spam       0.94      0.91      0.93       150

    accuracy                           0.98      1114
   macro avg       0.97      0.95      0.96      1114
weighted avg       0.98      0.98      0.98      1114

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       964
        spam       0.98      0.89      0.93       150

    accuracy                           0.98      1114
   macro avg       0.98      0.94      0.96      1114
weighted avg       0.98      0.98      0.98      1114

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       964
        spam       0.98      0.88      0.93       150

    accuracy                           0.98      1114
   macro avg       0.98      0.94      0.96      1114
weighted avg       0.98      0.98      0.98      1114

              preci