In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
data = pd.read_csv("Navy Bays_SMSSpamCollection.csv",  sep='\t', names=['label', 'message'])

In [4]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [6]:
X = data['message']
y = data['label']

In [22]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

In [7]:
vectorizer = CountVectorizer(stop_words='english')
X_vect = vectorizer.fit_transform(X)

In [8]:
X_vect

<5572x8444 sparse matrix of type '<class 'numpy.int64'>'
	with 43578 stored elements in Compressed Sparse Row format>

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

In [18]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

In [15]:
y_pred_nb

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [11]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [16]:
y_pred_lr

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:

print("Accuracy for Logistic regression:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall   :", recall_score(y_test, y_pred_lr))
print("F1 Score :", f1_score(y_test, y_pred_lr))

Accuracy for Logistic regression: 0.9856502242152466
Precision: 1.0
Recall   : 0.8926174496644296
F1 Score : 0.9432624113475178


In [21]:
cv_nb = cross_val_score(nb, X_vect, y, cv=5, scoring='accuracy').mean()
cv_lr = cross_val_score(lr, X_vect, y, cv=5, scoring='accuracy').mean()

print(f"\nAverage CV Accuracy (Naive Bayes): {cv_nb:.4f}")
print(f"Average CV Accuracy (Logistic Regression): {cv_lr:.4f}")


Average CV Accuracy (Naive Bayes): 0.9794
Average CV Accuracy (Logistic Regression): 0.9788
