In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Load the labelled message dataset. The rest of the script relies on its
# "Category" (text label) and "Message" (raw text) columns.
data = pd.read_csv("spam.csv")

# Display the first few rows of the dataframe
print(data.head())

# Encode the text label as an integer target.
# NOTE: ham is encoded as 1 and spam as 0, so any metric whose "positive
# class" defaults to label 1 describes ham, not spam.
label_codes = {'ham': 1, 'spam': 0}
data['cat'] = data['Category'].map(label_codes)

# Series.map leaves NaN for every value that is missing from the mapping
# (including missing labels). Fail here with a clear message instead of
# letting NaN targets surface later during training.
unmapped = data['cat'].isna()
if unmapped.any():
    unexpected = data.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'ham' or 'spam'): {unexpected}"
    )

# Define features and target variable.
# X is kept as a one-column DataFrame (double brackets) so that the
# "Message" column can later be selected by name.
X = data[["Message"]]
y = data["cat"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model definition: bag-of-words features fed into a multinomial Naive Bayes
# classifier, wrapped in a single estimator.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Token counts are computed from the "Message" column only. The column is
# named with a plain string (not a list) so the vectorizer is handed a 1-D
# sequence of documents rather than a 2-D frame.
preprocessing = ColumnTransformer(
    transformers=[
        ("text_feature_extraction", CountVectorizer(), "Message"),
    ],
)

# Chaining preprocessing and classifier means the vocabulary is learned only
# from whatever data is passed to fit().
clf = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("classification", MultinomialNB()),
    ],
)

# Fit the model
clf.fit(X_train, y_train)

# Predict the held-out set once and derive every metric from these same
# predictions (clf.score would run a second, redundant prediction pass).
y_pred = clf.predict(X_test)

# Evaluate the model: fraction of test messages classified correctly.
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score}")

# Precision and recall are computed for label 1, which is 'ham' under the
# encoding used for the target ('ham': 1, 'spam': 0). They therefore describe
# how well ham is recognised, not how well spam is caught. pos_label is
# spelled out so this is not mistaken for a spam-detection score.
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

# Rows are true labels and columns are predicted labels, both in the pinned
# order [0 (spam), 1 (ham)], so the layout does not depend on which classes
# happen to appear in the test split.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Confusion Matrix:\n{conf_matrix}")


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Accuracy: 0.9919282511210762
Precision: 0.9907692307692307
Recall: 1.0
Confusion Matrix:
[[140   9]
 [  0 966]]
