# importing the dataset

We downloaded the dataset from Kaggle:

https://www.kaggle.com/uciml/sms-spam-collection-dataset?select=spam.csv

We will use this dataset to classify whether an SMS is spam or ham.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever spelling this
# installation provides instead of hard-coding "seaborn-pastel".
# If neither name is available the style sheet is simply skipped.
for style_name in ("seaborn-v0_8-pastel", "seaborn-pastel"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Seaborn theme used by the plots in this notebook.
sns.set_theme(palette="dark", style="dark")

In [None]:
# The original note for this cell says the file must either be re-saved as
# UTF-8 by hand or be read with encoding='latin1'. Passing the encoding here
# removes the manual step.
# Column 0 becomes the index and column 1 is the only data column kept.
df = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    index_col=0,
    usecols=[0, 1],
    encoding="latin1",
)

In [None]:
# Display the freshly loaded DataFrame for a quick visual check.
df

In [None]:
# The index was taken from the first CSV column (index_col=0); label it "class".
df.index.name = "class"

In [None]:
# Only one data column was loaded (usecols=[0,1] with column 0 as the index);
# call it "Text".
df.columns = ["Text"]

In [None]:
# Show the column labels to confirm the rename.
df.columns

In [None]:
# Count how many rows carry each index label (the class of each message).
df.index.value_counts()

#we need to stratify the train/test split so that both sets keep this class ratio.

In [None]:
# Share of "ham" messages, computed from the labels instead of hand-copied
# counts. When this notebook was written the counts were 4825 ham and 747 spam,
# i.e. about 86.6% ham and 13.4% spam.
float(np.mean(df.index == "ham"))

In [None]:
# Take both the bar labels and the bar heights from the same value_counts()
# result. The previous version paired df.index.unique() (order of first
# appearance) with value_counts() (ordered by frequency), which line up only
# when those two orders happen to coincide.
class_counts = df.index.value_counts()
ax = sns.barplot(x=class_counts.index, y=class_counts.to_numpy())
ax.set(xlabel="class", ylabel="count")

# separating the feature and target columns

In [None]:
# Separate features from the target before any train/test splitting:
# the message text is the only column, and the class label is kept in the index.
x = df.iloc[:, 0]
y = df.index

In [None]:
# Wrap the selected text column in a DataFrame.
x = pd.DataFrame(x)

In [None]:
# Turn the class labels (taken from df.index above) into a DataFrame.
# NOTE(review): this presumably yields a fresh default integer row index - confirm.
y = pd.DataFrame(y)

In [None]:
# Peek at the first two rows of the target frame.
y.head(2)

In [None]:
# Give x the same row index as y so the two frames share row labels.
x.index = y.index

In [None]:
# Peek at the first two rows of the feature frame after re-indexing.
x.head(2)

# train test splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Count how often each label occurs in the training targets.
y_train.value_counts()

In [None]:
# Share of "ham" labels in the training targets, computed from the data
# rather than from hand-copied counts (3859 of 4457 when this was written).
# np.ravel accepts y_train whether it is still a DataFrame or already an array.
float(np.mean(np.ravel(y_train) == "ham"))

In [None]:
# Count how often each label occurs in the test targets.
y_test.value_counts()

In [None]:
# Share of "ham" labels in the test targets, computed from the data
# rather than from hand-copied counts (966 ham vs. 149 spam when this was written).
float(np.mean(np.ravel(y_test) == "ham"))

We stratified the split on the target column (the `y` DataFrame). We cross-checked the class ratio in the training and test sets, and it matches the ratio in the full dataset, which is exactly what we needed.

In [None]:
# Display the training features.
x_train

# vectorisation 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Collapse the one-column training frame to a Series, then to a plain list of
# message strings for the vectorizer.
x_train_series = x_train.squeeze()
lst = x_train_series.tolist()

# CountVectorizer's `input` parameter only selects how the documents are
# supplied ('filename', 'file' or 'content'); it is not where the documents go.
# Passing the list there is rejected by recent scikit-learn releases.
# The default ('content') is what we want - the texts themselves are handed to
# fit_transform()/transform().
cv = CountVectorizer(
    lowercase=True,
    stop_words="english",  # for removing stop words
)

In [None]:
# Fit the vectorizer on the training texts and transform them in one step.
x_train_prepared = cv.fit_transform(lst)

In [None]:
# Shape of the vectorized training data.
x_train_prepared.shape

In [None]:
# Vectorize the test messages with the vectorizer that was already fitted on
# the training texts (transform only - no refitting on test data).
x_test_series = x_test.squeeze()
lst_test = x_test_series.tolist()
x_test_prepared = cv.transform(lst_test)

In [None]:
# Display the vectorized test data.
x_test_prepared

In [None]:
# Flatten the training targets into a 1-D array.
y_train_array = np.array(y_train)
y_train = y_train_array.ravel()

# training the model

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with its default settings, fitted on the
# vectorized training messages and their labels.
svm_clf = SVC()
svm_clf.fit(X=x_train_prepared, y=y_train)

# performance measure

In [None]:
# Take a handful of training rows (positions 47 through 52) for a spot check.
sample_rows = slice(47, 53)
some_data = x_train_prepared[sample_rows]
some_labels = y_train[sample_rows]

In [None]:
# Compare the model's output on the sample rows with their true labels.
sample_predictions = svm_clf.predict(some_data)
print("predictions:", sample_predictions)
print("labels:", some_labels)

In [None]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training row using 3-fold cross-validation.
train_labels_flat = np.array(y_train).ravel()
y_train_predict = cross_val_predict(
    svm_clf,
    x_train_prepared,
    train_labels_flat,
    cv=3,
)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    classification_report,
    accuracy_score,
)

# Confusion matrix of the true training labels against the cross-validated
# predictions.
cm = confusion_matrix(y_true=y_train, y_pred=y_train_predict)
print(cm)

In [None]:
# F1 score on the training labels vs. y_train_predict; average='weighted'
# weights each class's score by its number of true instances.
f1_score(y_train, y_train_predict, average='weighted')

In [None]:
# Per-class metrics report for the training labels vs. y_train_predict.
print(classification_report(y_train, y_train_predict))

In [None]:
# Fraction of training rows whose prediction in y_train_predict matches the label.
accuracy_score(y_train, y_train_predict)

In [None]:

# Heatmap of the training confusion matrix; each cell is annotated with the
# name of its outcome group instead of a number.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

# Arrange the four names in the same 2x2 layout as the matrix.
labels = np.reshape(group_names, (2, 2))
sns.heatmap(data=cm, annot=labels, fmt='', cmap='PuRd')

# accuracy on test dataset

In [None]:
# Predict labels for the vectorized test messages with the fitted classifier.
y_test_predict = svm_clf.predict(x_test_prepared)

In [None]:
# Fraction of test rows whose prediction matches the true label.
accuracy_score(y_test, y_test_predict)

In [None]:
# Show the first five test predictions next to the first five true labels.
print("predictions:", y_test_predict[:5])
print("actual labels:", y_test[:5])

In [None]:
# Evaluate the classifier that was fitted on the training split.
# The previous version called cross_val_predict on the test data, which refits
# fresh copies of the model on folds of the *test* set. The metrics that
# followed therefore did not describe the trained model, and the real test
# predictions were overwritten.
y_test_predict = svm_clf.predict(x_test_prepared)
cm_test = confusion_matrix(y_test, y_test_predict)
print(cm_test)

In [None]:
# F1 score on the test labels vs. y_test_predict; average='weighted'
# weights each class's score by its number of true instances.
f1_score(y_test, y_test_predict, average='weighted')

In [None]:
# Per-class metrics report for the test labels vs. y_test_predict.
print(classification_report(y_test, y_test_predict))

In [None]:

# Heatmap of the test confusion matrix; each cell is annotated with the
# name of its outcome group instead of a number.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

# Arrange the four names in the same 2x2 layout as the matrix.
labels = np.reshape(group_names, (2, 2))
sns.heatmap(data=cm_test, annot=labels, fmt='', cmap='PuRd')