In [1]:
#Import Required Libraries

import pandas as pd
import requests
import io
from zipfile import ZipFile
import pickle

In [2]:
# Download the SMS Spam Collection dataset from the UCI Machine Learning Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
r = requests.get(url)
z = ZipFile(io.BytesIO(r.content))

In [3]:
# Read the dataset file into a Pandas DataFrame
filename = 'SMSSpamCollection'
with z.open(filename) as file:
    df = pd.read_csv(file, sep='\t', header=None, names=['v1', 'v2'])

In [4]:
# Convert labels to binary values
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})

In [5]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=42)

In [6]:
# Build a text classification pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

In [7]:
# Train the model on the training set
text_clf.fit(X_train, y_train)

In [9]:
# Evaluate the model on the testing set
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = text_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))

Accuracy: 0.9919282511210762
Confusion matrix:
 [[966   0]
 [  9 140]]
Classification report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [10]:
# Save the trained model as a pickle file Most imp
with open('spam_classifier.pkl', 'wb') as file:
    pickle.dump(text_clf, file)