In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [13]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
print("Total missing values in the dataset:\n", df.isnull().sum())

Total missing values in the dataset:
 Category    0
Message     0
dtype: int64


In [15]:
print("Total duplicates in the dataset : ", df.duplicated().sum())

Total duplicates in the dataset :  415


In [16]:
#remove Duplicates keeping only 1 occurence
df = df.drop_duplicates(keep = 'first')
print("Total duplicates in the dataset after removal of duplicates : ", df.duplicated().sum())

Total duplicates in the dataset after removal of duplicates :  0


In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(df.Message, df.Category, test_size = 0.2)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((4125,), (1032,), (4125,), (1032,))

In [19]:
v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4125x7631 sparse matrix of type '<class 'numpy.int64'>'
	with 54108 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
X_train_cv.shape

(4125, 7631)

In [22]:
v.get_feature_names_out()[1000:1050]

array(['approx', 'apps', 'appt', 'appy', 'april', 'aproach', 'apt',
       'aptitude', 'aquarius', 'ar', 'arab', 'arcade', 'archive', 'ard',
       'are', 'area', 'aren', 'arent', 'arestaurant', 'aretaking',
       'areyouunique', 'argentina', 'argh', 'argue', 'arguing',
       'argument', 'aries', 'arises', 'arithmetic', 'arm', 'armand',
       'arms', 'arng', 'arngd', 'arnt', 'around', 'arr', 'arrange',
       'arranging', 'arrested', 'arrive', 'arrow', 'arsenal', 'art',
       'artists', 'arts', 'arty', 'arul', 'as', 'asa'], dtype=object)

In [23]:
#Shows the vocabulary created from the dataset.
v.vocabulary_

{'hi': 3348,
 'did': 2246,
 'decide': 2147,
 'wot': 7493,
 'get': 3037,
 'his': 3362,
 'bday': 1229,
 'if': 3519,
 'not': 4750,
 'ill': 3530,
 'prob': 5351,
 'jus': 3794,
 'him': 3358,
 'voucher': 7216,
 'frm': 2930,
 'virgin': 7194,
 'or': 4892,
 'sumfing': 6502,
 'anytime': 969,
 'lor': 4126,
 'the': 6716,
 'most': 4508,
 'beautiful': 1237,
 'girl': 3057,
 'ive': 3693,
 'ever': 2602,
 'seen': 5907,
 'my': 4586,
 'baby': 1151,
 'come': 1856,
 'and': 929,
 'me': 4319,
 'in': 3558,
 'common': 1866,
 'room': 5737,
 'where': 7366,
 'is': 3668,
 'that': 6712,
 'one': 4863,
 'day': 2129,
 'training': 6908,
 'god': 3083,
 'asked': 1058,
 'what': 7357,
 'forgiveness': 2877,
 'little': 4070,
 'child': 1730,
 'gave': 3006,
 'lovely': 4148,
 'reply': 5638,
 'it': 3680,
 'wonderful': 7466,
 'fruit': 2945,
 'tree': 6931,
 'gives': 3064,
 'when': 7363,
 'being': 1269,
 'hurt': 3480,
 'by': 1546,
 'stone': 6404,
 'good': 3101,
 'night': 4697,
 'then': 6729,
 'dun': 2432,
 'wear': 7306,
 'jeans': 372

In [24]:
model = MultinomialNB()
model.fit(X_train_cv, Y_train)

In [28]:
X_test_cv = v.transform(X_test)

In [29]:
Y_pred = model.predict(X_test_cv)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       894
        spam       0.99      0.97      0.98       138

    accuracy                           0.99      1032
   macro avg       0.99      0.98      0.99      1032
weighted avg       0.99      0.99      0.99      1032

