# Spam Ham Classification using SVM

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

### Data loading and understanding

In [2]:
# Load the dataset from a CSV file located in the working directory.
data = pd.read_csv('Spam.csv')
# Show (rows, columns) as the cell output.
data.shape

(4601, 58)

In [3]:
# Preview the first rows to get a feel for the columns and their values.
data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_hash,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
# Percentage of missing values in each column.
data.isna().mean().mul(100)

word_freq_make                0.0
word_freq_address             0.0
word_freq_all                 0.0
word_freq_3d                  0.0
word_freq_our                 0.0
word_freq_over                0.0
word_freq_remove              0.0
word_freq_internet            0.0
word_freq_order               0.0
word_freq_mail                0.0
word_freq_receive             0.0
word_freq_will                0.0
word_freq_people              0.0
word_freq_report              0.0
word_freq_addresses           0.0
word_freq_free                0.0
word_freq_business            0.0
word_freq_email               0.0
word_freq_you                 0.0
word_freq_credit              0.0
word_freq_your                0.0
word_freq_font                0.0
word_freq_000                 0.0
word_freq_money               0.0
word_freq_hp                  0.0
word_freq_hpl                 0.0
word_freq_george              0.0
word_freq_650                 0.0
word_freq_lab                 0.0
word_freq_labs

In [5]:
# Inspect each column's dtype and non-null count.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [6]:
# Percentage of rows labelled spam (spam == 1), used as a class-balance check.
# It comes out at roughly 40%, so the two classes are reasonably balanced.
len(data.loc[data['spam'] == 1]) / len(data) * 100

39.404477287546186

### Data preparation

In [7]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the split repeatable between runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

In [8]:
# Min-max rescale the training split; the scaler learns its ranges from
# the training rows only.
# NOTE(review): train_data still contains the 'spam' target at this point, so
# the label column is passed through the scaler together with the features.
# Confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(train_data[train_data.columns])
train_data[train_data.columns] = scaler.transform(train_data[train_data.columns])

In [9]:
# Reuse the already-fitted scaler on the test split (transform only, no re-fit),
# so the test rows are scaled with the ranges learned from the training rows.
test_data[test_data.columns] = scaler.transform(test_data[test_data.columns])

In [10]:
# Separate the 'spam' target from the predictor columns of the training split.
y_train = train_data['spam']
X_train = train_data.drop(columns=['spam'])

In [11]:
# Separate the 'spam' target from the predictor columns of the test split.
y_test = test_data['spam']
X_test = test_data.drop(columns=['spam'])

In [12]:
# Shuffled 5-fold cross-validation splitter; the seed keeps the folds repeatable.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
# NOTE(review): param_grid is not referenced by any later visible cell (the grid
# search builds its own `params` dict with a different C range) - confirm whether
# it is still needed.
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])

### Model building using the cross-validation technique

In [13]:
# Candidate values for the SVC regularisation parameter C.
params = {"C": [0.1, 1, 10, 100, 1000]}

# Metrics to report; the best C is reported separately for each of them.
scores = ['accuracy', 'precision', 'recall']

# Run the grid search ONCE with multi-metric scoring instead of re-fitting the
# whole grid for every metric: the folds and candidate models are identical for
# each metric, so repeating the search only repeats the same work.
# refit=scores[-1] leaves `clf` refit on the last metric, which is the state
# the original per-metric loop ended in.
clf = GridSearchCV(SVC(),
                   params,
                   cv=folds,
                   scoring=scores,
                   refit=scores[-1],
                   return_train_score=True)
clf.fit(X_train, y_train)

cv_results = clf.cv_results_

for score in scores:
    print("# Tuning hyper-parameters for {}".format(score))

    # Rank 1 marks the best mean CV score for this metric; argmin picks the
    # first such candidate, the same rule GridSearchCV uses for best_index_.
    best_index = cv_results["rank_test_{}".format(score)].argmin()
    best_score = cv_results["mean_test_{}".format(score)][best_index]
    best_params = cv_results["params"][best_index]

    print(" The highest {0} score is {1} at C = {2}".format(score, best_score, best_params))
    print("\n")

# Tuning hyper-parameters for accuracy
 The highest accuracy score is 0.9254658385093167 at C = {'C': 10}


# Tuning hyper-parameters for precision
 The highest precision score is 0.9253049406055499 at C = {'C': 1}


# Tuning hyper-parameters for recall
 The highest recall score is 0.8897487926569149 at C = {'C': 100}




In [14]:
# Build the final support vector classifier with C = 10, the value that gave
# the highest cross-validated accuracy in the grid search above. All other
# SVC settings are left at their defaults.

svc = SVC(C = 10)
# Fit on the full (scaled) training split.
svc.fit(X_train, y_train)

### Prediction on training set

In [15]:
# Predict on the rows the model was trained on, to measure the training fit.
y_train_pred = svc.predict(X_train)

In [16]:
# Training accuracy: share of training rows whose prediction matches the label.
accuracy_score(y_train, y_train_pred)

0.9670807453416149

In [17]:
# Confusion matrix for the training split (true labels first, predictions second).
mat = confusion_matrix(y_train, y_train_pred)

In [18]:
# Display the matrix: rows are the true classes, columns the predicted classes.
mat

array([[1948,   29],
       [  77, 1166]], dtype=int64)

In [19]:
# Flatten the 2x2 confusion matrix row by row into named cells.
# Names read <true class>_<predicted class>, e.g. spam_ham is spam predicted as ham.
ham_ham, ham_spam, spam_ham, spam_spam = mat.ravel()

In [20]:
# Sensitivity (recall for the spam class): correctly flagged spam / all true spam.
sensitivity = spam_spam / (spam_ham + spam_spam)
# Keep the original misspelled name as an alias in case other cells reference it.
senitivity = sensitivity
print(sensitivity)

0.9380530973451328


In [21]:
# Cross-check with sklearn: recall should equal the hand-computed sensitivity.
recall_score(y_train, y_train_pred)

0.9380530973451328

In [22]:
# Specificity: correctly identified ham / all true ham.
specificity = ham_ham / (ham_ham + ham_spam)
print(specificity)

0.9853313100657562


In [23]:
# Precision: correctly flagged spam / everything that was predicted as spam.
precision = spam_spam / (ham_spam + spam_spam)
print(precision)

0.9757322175732217


In [24]:
# Cross-check with sklearn: should equal the hand-computed precision.
precision_score(y_train, y_train_pred)

0.9757322175732217

### Prediction on testing set

In [25]:
# Predict on the held-out test split, which the model did not see during fitting.
y_test_pred = svc.predict(X_test)

In [26]:
# Test accuracy: share of held-out rows whose prediction matches the label.
accuracy_score(y_test, y_test_pred)

0.9312092686459088

In [27]:
# Confusion matrix for the test split; this overwrites the training-split `mat`.
mat = confusion_matrix(y_test, y_test_pred)

In [28]:
# Display the matrix: rows are the true classes, columns the predicted classes.
mat

array([[773,  38],
       [ 57, 513]], dtype=int64)

In [29]:
# Flatten the 2x2 test confusion matrix row by row into named cells.
# Names read <true class>_<predicted class>; these replace the training-split values.
ham_ham, ham_spam, spam_ham, spam_spam = mat.ravel()

In [30]:
# Sensitivity (recall for the spam class) on the test split:
# correctly flagged spam / all true spam.
sensitivity = spam_spam / (spam_ham + spam_spam)
# Keep the original misspelled name as an alias in case other cells reference it.
senitivity = sensitivity
print(sensitivity)

0.9


In [31]:
# Cross-check with sklearn: recall should equal the hand-computed sensitivity.
recall_score(y_test, y_test_pred)

0.9

In [32]:
# Specificity on the test split: correctly identified ham / all true ham.
specificity = ham_ham / (ham_ham + ham_spam)
print(specificity)

0.9531442663378545


In [33]:
# Precision on the test split: correctly flagged spam / everything predicted as spam.
precision = spam_spam / (ham_spam + spam_spam)
print(precision)

0.9310344827586207


In [34]:
# Cross-check with sklearn: should equal the hand-computed precision.
precision_score(y_test, y_test_pred)

0.9310344827586207