In [2]:
#Loading the dataset

import pandas as pd
import numpy as np

df = pd.read_csv('E:/datasets/sms_spam.csv')
df

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [3]:
# assigning labels to the dataset, 0 for ham and 1 for spam

df['label_num'] = df['type'].apply(lambda x: 1 if x == 'spam' else 0)

In [4]:
#splitting the dataset

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_num'], test_size=0.3, random_state=42)

In [5]:
# Vectorizing the data or making some changes to the data for the model

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

lst = X_train.tolist()
vectorizer = TfidfVectorizer(input = lst, lowercase = True, stop_words = "english")
train_transformed = vectorizer.fit_transform(X_train)
test_transformed = vectorizer.transform(X_test)

model = joblib.load('E:/DS_and_ML/Classification/SpamClassifier.pkl')
model.fit(train_transformed, y_train)

MultinomialNB()

In [6]:
prediction = model.predict(test_transformed[:40])
actual = y_test[:40]

print("Prediction:", list(prediction))
print("Actual:    ",list(actual))

Prediction: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Actual:     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [7]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(prediction, actual)
matrix

array([[37,  1],
       [ 0,  2]], dtype=int64)

In [8]:
precision = matrix[1][1]/(matrix[1][1]+matrix[0][1])
recall = matrix[1][1]/(matrix[1][1]+matrix[1][0])
f1score = matrix[1][1]/(matrix[1][1]+(matrix[1][0]+(matrix[0][1]/2)))

print(precision)
print(recall)
print(f1score)

0.6666666666666666
1.0
0.8


In [9]:
message = ["Congragulations! You have won a $10,000. Go to https://bit.ly/23343 to claim now.",
          "Hey dude! how is it going?",
          "Have you completed your notes?"
          ]

message_transformed = vectorizer.transform(message)

new_prediction = model.predict(message_transformed)

for i in range(len(new_prediction)):
    if new_prediction[i] == 0:
        print("Its a Ham message.")
    else:
        print("Alert! it's a Spam message!")

Alert! it's a Spam message!
Its a Ham message.
Its a Ham message.
