<h1>Kaggle Ham or Spam</h1>

<h3>Import required libraries</h3>

In [31]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import sklearn.tree
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raghav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<h3> Load and clean up data </h3>

In [9]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df = df.replace(['ham', 'spam'], [0, 1])
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
df['Count'] = 0
for i in np.arange(0, len(df.v2)):
    df.loc[i, 'Count'] = len(df.loc[i, 'v2'])
df.head()

Unnamed: 0,v1,v2,Count
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [13]:
# Total ham and spam messages
# Ham = 0
# Spam = 1
df['v1'].value_counts()

0    4825
1     747
Name: v1, dtype: int64

<h3>Processing Messages</h3>

For each message...

* Replace email addresses with 'emailaddr'

* Replace URLs with 'httpaddr'

* Replace $ and £ with 'moneysymbl'

* Replace phone numbers with 'phonenumbr'

* Replace numbers with 'numbr'

* Remove punctuation

* Convert to lowercase

* Append to corpus

In [20]:
corpus = []
ps = nltk.stem.porter.PorterStemmer()

for i in range(0, 5572):
    msg = df['v2'][i]
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', df['v2'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', df['v2'][i])
    msg = re.sub('£|\$', 'moneysymb', df['v2'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', df['v2'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', df['v2'][i])
    msg = re.sub('[^\w\d\s]', ' ', df['v2'][i])

    msg = msg.lower()    
    msg = msg.split()
    
    msg = [ps.stem(word) for word in msg if not word in set(nltk.corpus.stopwords.words('english'))]

    msg = ' '.join(msg)
    
    corpus.append(msg)

In [22]:
cv = sklearn.feature_extraction.text.CountVectorizer()
x = cv.fit_transform(corpus).toarray()

<h3>Apply Decision Tree classification</h3>

In [25]:
y = df['v1']
le = sklearn.preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [29]:
xtrain, xtest, ytrain, ytest = sklearn.model_selection.train_test_split(x, y, test_size=0.20, random_state = 1)

In [32]:
dt = sklearn.tree.DecisionTreeClassifier(random_state=80)
dt.fit(xtrain, ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=80, splitter='best')

In [33]:
# Predicting using decision tree
pred_dt = dt.predict(xtest)

<h3> Result </h3>

In [35]:
print(sklearn.metrics.confusion_matrix(ytest, pred_dt))

[[963  13]
 [  9 130]]


In [47]:
accuracy = sklearn.metrics.accuracy_score(ytest, dt.predict(xtest))
accuracy *= 100
print("Accuracy = %0.2f" % accuracy + '%')

Accuracy = 98.03%


In [49]:
print(sklearn.metrics.classification_report(ytest, dt.predict(xtest)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       976
           1       0.91      0.94      0.92       139

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



<h2> In conclusion...</h2>

Final decision tree accuracy = <b>98.03%</b>