In [1]:
import pandas as pd
import nltk
from matplotlib import pyplot as plt

# Location of the labelled messages, relative to the notebook's working directory.
url = 'spam_data.csv'

# Read the CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=2)

Unnamed: 0,text,label
0,Please call me when you're free Rahul.,ham
1,Win a free Laptop today! Offer code Vk2ql.,spam


In [2]:
# Per-column count of missing values (isna is the canonical name of isnull).
df.isna().sum()

text     0
label    0
dtype: int64

In [3]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()


np.int64(0)

In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    366 non-null    object
 1   label   366 non-null    object
dtypes: object(2)
memory usage: 5.8+ KB


In [5]:
# Show the first two rows again as a reference point before preprocessing.
df.head(2)

Unnamed: 0,text,label
0,Please call me when you're free Rahul.,ham
1,Win a free Laptop today! Offer code Vk2ql.,spam


<h3 style="text-align: center; color: blue;">Text Preprocessing</h3>


In [6]:
from nltk.tokenize import word_tokenize

# Lower-case every message first so the resulting tokens are case-insensitive.
lowercased_text = df['text'].str.lower()

# Split each lower-cased message into word and punctuation tokens.
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed
# locally; no nltk.download(...) call is visible in this notebook, so confirm
# the environment provides it before relying on Restart & Run All.
df['tokens'] = lowercased_text.map(word_tokenize)
df.head(n=2)

Unnamed: 0,text,label,tokens
0,Please call me when you're free Rahul.,ham,"[please, call, me, when, you, 're, free, rahul..."
1,Win a free Laptop today! Offer code Vk2ql.,spam,"[win, a, free, laptop, today, !, offer, code, ..."


In [7]:
from nltk.corpus import stopwords

# A set gives constant-time membership checks while filtering tokens.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally;
# no download step is visible in this notebook.
stop_words = set(stopwords.words('english'))


def _keep_content_words(tokens):
    """Return the purely alphabetic tokens that are not English stop words."""
    return [token for token in tokens if token.isalpha() and token not in stop_words]


# Drops punctuation, tokens containing digits (e.g. offer codes) and stop words.
df['filtered_words'] = df['tokens'].apply(_keep_content_words)

df.head(n=2)

Unnamed: 0,text,label,tokens,filtered_words
0,Please call me when you're free Rahul.,ham,"[please, call, me, when, you, 're, free, rahul...","[please, call, free, rahul]"
1,Win a free Laptop today! Offer code Vk2ql.,spam,"[win, a, free, laptop, today, !, offer, code, ...","[win, free, laptop, today, offer, code]"


In [8]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()


def _lemmatize_all(words):
    """Lemmatize every word of one message, preserving order."""
    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default part of speech is applied to every word.
    return [lm.lemmatize(word) for word in words]


df['lemmatized_words'] = df['filtered_words'].apply(_lemmatize_all)
df.head(n=2)

Unnamed: 0,text,label,tokens,filtered_words,lemmatized_words
0,Please call me when you're free Rahul.,ham,"[please, call, me, when, you, 're, free, rahul...","[please, call, free, rahul]","[please, call, free, rahul]"
1,Win a free Laptop today! Offer code Vk2ql.,spam,"[win, a, free, laptop, today, !, offer, code, ...","[win, free, laptop, today, offer, code]","[win, free, laptop, today, offer, code]"


In [9]:
# Encode the class labels as integers: spam -> 1, ham -> 0.
#
# Series.replace on an object column relies on pandas silently downcasting the
# result to int64; that behaviour is deprecated and emits a FutureWarning (once
# it is removed the column would stay `object`). An explicit element-wise
# mapping followed by astype makes the integer dtype deliberate instead.
#
# Values that are already encoded pass through unchanged, so re-running this
# cell is safe, and astype('int64') raises on any unexpected label rather than
# letting a mixed-type target reach the model.
LABEL_IDS = {'spam': 1, 'ham': 0}
df['label'] = (
    df['label']
    .map(lambda label: LABEL_IDS.get(label, label))
    .astype('int64')
)
df.head(2)

  df['label'] = df['label'].replace({'spam':1, 'ham':0})


Unnamed: 0,text,label,tokens,filtered_words,lemmatized_words
0,Please call me when you're free Rahul.,0,"[please, call, me, when, you, 're, free, rahul...","[please, call, free, rahul]","[please, call, free, rahul]"
1,Win a free Laptop today! Offer code Vk2ql.,1,"[win, a, free, laptop, today, !, offer, code, ...","[win, free, laptop, today, offer, code]","[win, free, laptop, today, offer, code]"


## Splitting the Dataset

In [10]:
# Features: everything except the raw text, the target and the intermediate
# preprocessing columns, which leaves the lemmatized tokens only.
X = df.drop(columns=['text', 'label', 'tokens', 'filtered_words'])

# Target: the label column.
y = df['label']

X.head(n=2)

Unnamed: 0,lemmatized_words
0,"[please, call, free, rahul]"
1,"[win, free, laptop, today, offer, code]"


In [11]:
# Turn each token list into one space-separated string, the input format the
# vectorizer expects.
#
# The text is derived from the untouched list column on `df` rather than from
# X itself: rewriting X['lemmatized_words'] from its own value is not
# idempotent, because a second run would join the *characters* of the
# already-joined string ("p l e a s e ...") and silently corrupt the features.
# X was built from df, so the two share an index and the assignment aligns
# row for row.
X['lemmatized_words'] = df['lemmatized_words'].str.join(" ")
X.head(2)


Unnamed: 0,lemmatized_words
0,please call free rahul
1,win free laptop today offer code


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified on y; consider whether
# stratify=y is wanted for this classification task.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train.shape

(274, 1)

<h3 style="text-align: center; color: blue;">Vectorization</h3>


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

train_documents = X_train['lemmatized_words']
test_documents = X_test['lemmatized_words']

# Learn the vocabulary and IDF weights from the training documents only, then
# reuse the fitted vectorizer on the test documents so that nothing from the
# test split influences the features.
X_train_tfidf = vectorizer.fit_transform(train_documents)
X_test_tfidf = vectorizer.transform(test_documents)


<h3 style="text-align: center; color: blue;">Model Training</h3>


In [14]:
# Sanity check: the feature matrix and the target must have the same row count.
(X_train_tfidf.shape, y_train.shape)


((274, 130), (274,))

In [15]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line displays the fitted model.
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)
model_nb

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [16]:
# Predict a label for every message in the held-out test matrix.
y_pred = model_nb.predict(X_test_tfidf)

<h3 style="text-align: center; color: blue;">Accuracy Analysis</h3>


In [17]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test split. The report is
# a pre-formatted string, so print() is used to keep its column alignment.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        46
           1       1.00      1.00      1.00        46

    accuracy                           1.00        92
   macro avg       1.00      1.00      1.00        92
weighted avg       1.00      1.00      1.00        92



<h3 style="text-align: center; color: blue;">Conclusion</h3>


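The evaluation metrics show perfect scores across precision, recall, F1-score, and accuracy (all 1.00) on the 92-message test split.
While this appears ideal, such results often indicate that the dataset is small and well-separated, meaning the spam and non-spam examples are easily distinguishable. In this case, the model can classify correctly without much difficulty.
Because the dataset contains only 366 messages and the fitted vocabulary has just 130 terms, these scores should be confirmed with cross-validation or a larger, more varied dataset before being taken as evidence of real-world performance.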