In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import string

In [8]:
# Download stopwords if you haven't already
import nltk
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    string.punctuation
except LookupError:
    nltk.download('punkt')



In [7]:
import pandas as pd

# Updated and working dataset URL
data_url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

# Read the data (note: it's tab-separated)
df = pd.read_csv(data_url, sep='\t', header=0, names=["label", "text"])

# Convert label to numerical values (ham = 0, spam = 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Show the first few rows
print(df.head())


   label                                               text
0      0                      Ok lar... Joking wif u oni...
1      1  Free entry in 2 a wkly comp to win FA Cup fina...
2      0  U dun say so early hor... U c already then say...
3      0  Nah I don't think he goes to usf, he lives aro...
4      1  FreeMsg Hey there darling it's been 3 week's n...


In [9]:
# 2. Data Exploration and Preprocessing
print("First few rows of the dataset:")
print(df.head())
print("\nClass distribution:")
print(df['label'].value_counts())

def process_text(text):
    '''Removes punctuation and stopwords'''
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.lower().split() if word not in stopwords.words('english')]

First few rows of the dataset:
   label                                               text
0      0                      Ok lar... Joking wif u oni...
1      1  Free entry in 2 a wkly comp to win FA Cup fina...
2      0  U dun say so early hor... U c already then say...
3      0  Nah I don't think he goes to usf, he lives aro...
4      1  FreeMsg Hey there darling it's been 3 week's n...

Class distribution:
label
0    4824
1     747
Name: count, dtype: int64


In [10]:
# 3. Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(analyzer=process_text)
X = tfidf_vectorizer.fit_transform(df['text'])
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# 4. Model Selection and Training

# a) Multinomial Naive Bayes
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)
mnb_predictions = mnb_model.predict(X_test)
print("\n--- Multinomial Naive Bayes ---")
print("Accuracy:", accuracy_score(y_test, mnb_predictions))
print(classification_report(y_test, mnb_predictions))



--- Multinomial Naive Bayes ---
Accuracy: 0.967713004484305
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       955
           1       1.00      0.78      0.87       160

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [12]:
# b) Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
print("\n--- Logistic Regression ---")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print(classification_report(y_test, lr_predictions))


--- Logistic Regression ---
Accuracy: 0.9551569506726457
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       955
           1       0.97      0.71      0.82       160

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.90      1115
weighted avg       0.96      0.96      0.95      1115



In [13]:
# c) Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print("\n--- Support Vector Machine ---")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))



--- Support Vector Machine ---
Accuracy: 0.9748878923766816
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       955
           1       0.99      0.84      0.91       160

    accuracy                           0.97      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.97      0.97      1115

