In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #used to word to vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('spam.tsv', sep='\t')
df.sample(3)

Unnamed: 0,label,message,length,punct
2301,ham,Nothin comes to my mind. Ü help me buy hanger ...,71,3
3777,ham,Ok lor. Msg me b4 u call.,25,2
5107,ham,I REALLY NEED 2 KISS U I MISS U MY BABY FROM U...,57,0


In [3]:
df['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [4]:
df.shape

(5572, 4)

In [5]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [7]:
df['label'].value_counts()
# highly imbalanced data\

label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# separating ham rows and spam rows
ham = df[df['label'] == "ham"]
spam = df[df['label'] == "spam"]


In [9]:
spam.shape

(747, 4)

In [10]:
ham.shape

(4825, 4)

In [11]:
spam = spam.sample(ham.shape[0], replace=True)

In [12]:
spam.shape

(4825, 4)

In [13]:
data = pd.concat([ham, spam], ignore_index=True)

In [14]:
data.shape

(9650, 4)

In [15]:
data.sample(5)

Unnamed: 0,label,message,length,punct
6532,spam,I don't know u and u don't know me. Send CHAT ...,158,13
4663,ham,Ok.ok ok..then..whats ur todays plan,36,5
1885,ham,Yup i shd haf ard 10 pages if i add figures......,72,4
5935,spam,Spook up your mob with a Halloween collection ...,150,3
7949,spam,U can WIN £100 of Music Gift Vouchers every we...,159,5


In [16]:
x = data['message']
y = data['label']
# train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2, random_state=101)

In [17]:
# Vectorization -- Bag of words
vectorizer = CountVectorizer()

In [18]:
text = ["hello world", "world hello", "hello hello world"]
x = vectorizer.fit_transform(text)
xdf = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
xdf

Unnamed: 0,hello,world
0,1,1
1,1,1
2,2,1


In [19]:
xtrain_vec = vectorizer.fit_transform(xtrain)
xtest_vec = vectorizer.transform(xtest)

In [20]:
# Model building
clf = MultinomialNB()
classification_model = Pipeline([('vectorizer', vectorizer), ('Model', clf)])

In [21]:
classification_model.fit(xtrain, ytrain)

In [22]:
ypred_train = classification_model.predict(xtrain)
ypred_test = classification_model.predict(xtest)

In [23]:
print(classification_report(ytest, ypred_test))

              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       959
        spam       0.99      0.97      0.98       971

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [24]:
print(confusion_matrix(ytest, ypred_test))

[[947  12]
 [ 26 945]]


In [25]:
# Single message prediction

In [26]:
test1 = ['Hi, hope you are doing well. Could you pls share the ML notes.']
test2 = ['Congratulations, You won a lottery ticket worth $1 Millon! To claim call @111111']
test3 = ['Hi for you promotion, give party']
test4 = ['I hope you all are enjoying with ML session']

In [27]:
classification_model.predict(test1)

array(['ham'], dtype='<U4')

In [28]:
classification_model.predict(test2)

array(['spam'], dtype='<U4')

In [29]:
classification_model.predict(test3)

array(['ham'], dtype='<U4')

In [31]:
classification_model.predict(test4)

array(['ham'], dtype='<U4')