In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Loading and preprocessing the data

In [9]:
data = pd.read_csv('./data/spam.csv', usecols=[0, 1], names=['type', 'sms'], skiprows=1)
data['sms'] = data['sms'].str.lower()
data['type'] = data['type'].map({'ham': 0, 'spam': 1})
data

Unnamed: 0,type,sms
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,1,this is the 2nd time we have tried 2 contact u...
5568,0,will �_ b going to esplanade fr home?
5569,0,"pity, * was in mood for that. so...any other s..."
5570,0,the guy did some bitching but i acted like i'd...


Convert text to features using TF-IDF

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['sms'])
y = data['type']

Split the data into training and testing sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train and evaluate the model

In [12]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

Train and evaluate Naive Bayes

In [13]:
nb_model = MultinomialNB()
nb_accuracy, nb_report = train_and_evaluate(nb_model, X_train, y_train, X_test, y_test)
print("Naive Bayes - Accuracy:", nb_accuracy)
print("Naive Bayes - Classification Report:\n", nb_report)

Naive Bayes - Accuracy: 0.968609865470852
Naive Bayes - Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



Train and evaluate Logistic Regression

In [14]:
lr_model = LogisticRegression(max_iter=1000)
lr_accuracy, lr_report = train_and_evaluate(lr_model, X_train, y_train, X_test, y_test)
print("Logistic Regression - Accuracy:", lr_accuracy)
print("Logistic Regression - Classification Report:\n", lr_report)

Logistic Regression - Accuracy: 0.9443946188340807
Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.97      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115

