In [19]:
import numpy as np
import pandas as pd
# `plt` is the conventional alias for the pyplot interface. Aliasing the
# top-level `matplotlib` package instead would leave plt.plot, plt.subplots,
# etc. undefined, because those functions live in matplotlib.pyplot.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [20]:
import pandas as pd
import kagglehub
import os
# Download the latest version of the SMS spam collection via kagglehub;
# `path` is the local directory that holds the downloaded files.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

csv_path = os.path.join(path, "spam.csv")
# The file is read as latin-1 rather than pandas' default utf-8.
df = pd.read_csv(csv_path,encoding="latin-1")

# Quick sanity check: dimensions, a preview of the rows, and the class balance
# of the raw label column 'v1'.
print(df.shape)
display(df.head())  # rich table rendering instead of a wrapped plain-text dump
print(df['v1'].value_counts())


(5572, 5)
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
v1
ham     4825
spam     747
Name: count, dtype: int64


In [21]:
# Inspect the raw frame: preview rows, column dtypes, and missing-value counts.
print("First 5 rows of the dataframe:")
display(df.head())

print("\nColumn names and their data types:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nMissing values in each column:")
print(df.isnull().sum())

First 5 rows of the dataframe:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Column names and their data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null    

In [22]:
# Drop the mostly-empty 'Unnamed: *' columns and give the remaining two
# columns readable names.
#
# Written as a chained, non-inplace expression so the cell can be re-run
# safely: errors='ignore' makes drop() skip columns that are already gone, and
# rename() ignores keys that are not present by default.
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

print("DataFrame after dropping columns and renaming:")
display(df.head())

DataFrame after dropping columns and renaming:


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
import re
def clean_text(s):
    """Normalise a message into lowercase alphanumeric tokens separated by single spaces.

    The input is coerced with ``str()`` and lowercased, then three regex
    substitutions are applied in order:

    1. runs starting with ``http`` or ``www`` (URLs) are deleted;
    2. every character that is not ``a-z``, ``0-9`` or whitespace becomes a space;
    3. runs of whitespace collapse to one space.

    Leading and trailing whitespace is stripped from the result.

    Args:
        s: The value to clean; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # (pattern, replacement) pairs; the order matters because URLs must be
    # removed before punctuation is turned into spaces.
    substitutions = (
        (r'http\S+|www\S+', ''),
        (r'[^a-z0-9\s]', ' '),
        (r'\s+', ' '),
    )
    text = str(s).lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [24]:
# Run clean_text over every message and overwrite the 'message' column with
# the result, then preview the first rows.
cleaned_messages = df['message'].apply(clean_text)
df['message'] = cleaned_messages
df.head()

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...


In [25]:
# Features are the message text; targets are the labels encoded as integers
# (ham -> 0, spam -> 1).
label_to_int = {'ham':0, 'spam':1}
X, y = df['message'], df['label'].map(label_to_int)

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded with random_state=42.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 1: TF-IDF features over unigrams and bigrams, capped at 10,000 terms,
# using scikit-learn's built-in English stop-word list.
tfidf_step = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    stop_words='english',
)

# Step 2: logistic-regression classifier with max_iter raised to 2000.
logreg_step = LogisticRegression(max_iter=2000)

# Chain both steps so that fit/predict can be called on raw text.
model = Pipeline(steps=[
    ('features', tfidf_step),
    ('classifier', logreg_step),
])

In [27]:
# Fit the pipeline on the training split (fit returns the fitted pipeline),
# then predict labels for the held-out test split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

# Rows are true classes and columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.98      0.74      0.85       149

    accuracy                           0.96      1115
   macro avg       0.97      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Confusion Matrix:
 [[964   2]
 [ 38 111]]
