In [115]:
import pandas as pd

In [84]:
# Location of the SMS Spam Collection CSV inside the Kaggle input directory
SPAM_CSV_PATH = '/kaggle/input/sms-spam-collection-dataset/spam.csv'

# NOTE(review): latin-1 is presumably needed because the default UTF-8 decoding fails on this file — confirm
df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')


In [85]:
# Preview the first rows of the freshly loaded frame to see which columns the CSV provides
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [86]:
# Count messages per class in v1 to check how balanced the labels are
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

**Imbalanced data: 4,825 ham vs. 747 spam messages (about 13% spam).**

In [87]:
# Drop the three trailing unnamed columns, which are empty in the rows previewed above.
# Reassigning instead of inplace=True, together with errors='ignore', keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError on the missing columns.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [88]:
# Confirm that only v1 and v2 remain after dropping the unnamed columns
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [89]:
# Give the two remaining columns descriptive names (v1 holds the class, v2 the message body)
COLUMN_NAMES = {'v1': 'label', 'v2': 'text'}
df = df.rename(columns=COLUMN_NAMES)


In [90]:
# Confirm the columns now read 'label' and 'text'
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [91]:
# Encode the class as an integer target: ham -> 0, spam -> 1.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(LABEL_TO_ID)

# Series.map turns every value missing from the dict into NaN. That happens silently when
# this cell is run a second time (labels are already 0/1) or when the data contains an
# unexpected class, so validate before overwriting the column.
if encoded_labels.isna().any():
    unexpected = sorted(set(df.loc[encoded_labels.isna(), 'label'].astype(str)))
    raise ValueError(
        f"Cannot encode labels {unexpected}; expected only {sorted(LABEL_TO_ID)}. "
        "If the column is already encoded, reload the data before re-running this cell."
    )

df['label'] = encoded_labels


In [92]:
# Confirm the labels are now numeric (ham -> 0, spam -> 1)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [93]:
# Check row count, dtypes and non-null counts after the label encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [94]:
# Count missing values per column
df.isnull().sum()

label    0
text     0
dtype: int64

In [95]:
# Count rows whose label and text both repeat an earlier row
df.duplicated().sum()

403

In [96]:
# Keep only the first occurrence of each (label, text) pair; the original index is retained
df = df.drop_duplicates()

In [97]:
# Preview the frame after duplicate removal
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."




## Text preprocessing

In [98]:
import re
import nltk
from nltk.corpus import stopwords

# stopwords.words() reads a corpus that must already be present locally. Fetch it
# explicitly (a quiet no-op when it is already installed), mirroring the explicit
# wordnet download done further down, so the notebook also runs on a fresh machine.
nltk.download('stopwords', quiet=True)

# Stored as a set so the per-token membership test in stop_word() is a hash lookup
stop_words = set(stopwords.words('english'))

In [99]:
def clean(text):
    """Normalise a raw message to lowercase alphanumeric tokens separated by single spaces.

    The message is lowercased, then each pattern below is replaced by a space, in order:
    ``<...>`` tags, ``http``/``www`` URLs, e-mail-like tokens, and finally any character
    that is not a-z, 0-9 or whitespace. Whitespace runs are then collapsed and the ends trimmed.

    Args:
        text: the message as a ``str``.

    Returns:
        The cleaned message; an empty string if nothing survives.
    """
    # Order matters: tags, URLs and e-mails must be removed while their punctuation is intact
    substitutions = (
        (r'<.*?>', ' '),              # HTML-style tags
        (r'http\S+|www\S+', ' '),     # URLs
        (r'\S+@\S+', ' '),            # e-mail addresses
        (r'[^a-z0-9\s]', ' '),        # anything other than letters, digits, whitespace
        (r'\s+', ' '),                # collapse whitespace runs
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

In [100]:
# Replace every message with its normalised form (see clean)
df['text'] = df['text'].map(clean)

In [101]:
# NOTE(review): this cell repeats the nltk/stopwords imports and the stop_words definition
# from the first preprocessing cell; it rebuilds the same set and can be deleted.
import nltk
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

In [102]:
def stop_word(x):
    """Return ``x`` with every whitespace-separated token found in ``stop_words`` removed.

    Args:
        x: the message as a ``str``.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept_tokens = []
    for token in str.split(x):
        if token in stop_words:
            continue  # token is in the module-level stop-word set
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [103]:
# Strip stop words from every cleaned message
df['text'] = df['text'].map(stop_word)

In [104]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus before the lemmatizer is used
# NOTE(review): some NLTK versions may also need the 'omw-1.4' corpus — confirm for the target environment
nltk.download('wordnet')

# Single shared instance, reused by lemmatize_text for every message
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces.

    No part-of-speech tag is passed, so every token is lemmatized with the
    lemmatizer's default setting.

    Args:
        text: the message as a ``str``.

    Returns:
        The lemmatized tokens, in their original order, joined by single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [105]:
# Lemmatize every message after stop-word removal
df['text'] = df['text'].map(lemmatize_text)

In [106]:
# Features are the preprocessed messages; the target is the 0/1 spam label
X, y = df['text'], df['label']

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,confusion_matrix

In [108]:
# Hold out 20% of the messages for testing. stratify=y keeps the ham/spam ratio the same
# in both splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [109]:
# Bag-of-words encoder producing per-message token counts.
# NOTE(review): the text was already lowercased by clean() and had NLTK stop words removed
# by stop_word(); lowercase=True and stop_words='english' repeat those steps with
# scikit-learn's own stop-word list — confirm this second filtering pass is intended.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)

In [110]:
# Learn the vocabulary from the training split only, then encode both splits with it,
# so no test-set tokens influence the fitted vocabulary
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [111]:
# Multinomial naive Bayes on the token counts; alpha is the additive smoothing strength
# NOTE(review): 0.2 is presumably the value picked during the experimentation described in the closing notes — confirm
model = MultinomialNB(alpha=0.2)
model.fit(X_train_vec, y_train)

In [112]:
# Predict a 0/1 label for every held-out message
y_pred = model.predict(X_test_vec)

In [113]:
# Headline metrics on the held-out split. precision/recall/F1 use scikit-learn's default
# positive label (1), which is the spam class under the ham -> 0, spam -> 1 encoding.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("=== FINAL RESULTS ===")
print(f"Accuracy   : {accuracy:.4f}")
print(f"Precision  : {precision:.4f}")
print(f"Recall     : {recall:.4f}")
print(f"F1-Score   : {f1:.4f}")

# Per-class breakdown; target_names follow the label order 0 (ham), 1 (spam)
per_class_report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print("\ndetailed Report:")
print(per_class_report)

# Rows are true classes and columns are predicted classes, in label order 0, 1
matrix = confusion_matrix(y_test, y_pred)
print("\nconfusion Matrix:")
print(matrix)

=== FINAL RESULTS ===
Accuracy   : 0.9836
Precision  : 0.9385
Recall     : 0.9313
F1-Score   : 0.9349

detailed Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       903
        Spam       0.94      0.93      0.93       131

    accuracy                           0.98      1034
   macro avg       0.96      0.96      0.96      1034
weighted avg       0.98      0.98      0.98      1034


confusion Matrix:
[[895   8]
 [  9 122]]


**During experimentation, I trained multiple models using different machine-learning algorithms and feature-extraction techniques. After evaluating all combinations, I found that multinomial naive Bayes with a bag-of-words representation (`CountVectorizer`) provided the best performance.**

**My main priority was to improve recall, because on an imbalanced dataset—especially in spam detection—low recall can be dangerous. A model with high precision but low recall may miss many actual spam messages, which is unacceptable for real-world use.**

**While some algorithms achieved 100% precision, their recall dropped to around 80%, making them unreliable. Instead of focusing on overall accuracy (which can be misleading on imbalanced data), I optimized for recall so that the model captures as many true spam messages as possible.**

**Ultimately, the MultinomialNB + bag-of-words combination delivered the best balance on this dataset: spam recall of 0.93, spam precision of 0.94, and overall accuracy of 0.98 on the held-out test set.**