<a href="https://colab.research.google.com/github/savanigit/OIBSIP/blob/task-4-submission/Task_4_Spam_Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn: train/test splitting, bag-of-words features, classifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Visible confirmation that the import cell ran without errors
print("Libraries imported!")

Libraries imported!


In [5]:
# Load the dataset. It is read as latin-1 rather than the default encoding
# to avoid decoding errors on this file.
try:
    df = pd.read_csv('spam.csv', encoding='latin-1')
except FileNotFoundError as exc:
    # Fail loudly with the upload hint. Merely printing here would leave `df`
    # undefined (or stale from an earlier run), so the next line would crash
    # with an unrelated NameError or silently show old data.
    raise FileNotFoundError(
        "File not found. Make sure you uploaded 'spam.csv'"
    ) from exc

print("Data loaded successfully!")

# Let's verify the columns
df.head()

Data loaded successfully!


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# 1. Drop the garbage columns and 2. rename v1/v2 to something meaningful.
# Done as a chain without inplace=True so the cell can be re-run safely:
# errors='ignore' skips columns that were already dropped, and renaming
# columns that were already renamed is a no-op.
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'Category', 'v2': 'Message'})
)

# 3. Convert 'Category' to numbers
# spam = 1 (Bad), ham = 0 (Good)
df['Spam_Indicator'] = df['Category'].map({'spam': 1, 'ham': 0})

# .map() yields NaN for any label other than 'spam'/'ham'; stop here rather
# than carry missing targets into the later cells.
assert df['Spam_Indicator'].notna().all(), "Unexpected label in 'Category'"

# Let's check the clean table
df.head()

Unnamed: 0,Category,Message,Spam_Indicator
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Features are the message text; the target is the 0/1 spam flag.
X, y = df['Message'], df['Spam_Indicator']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words encoder: the vocabulary is learned from the training messages
# only, then that same vocabulary is applied to the held-out messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print("Text converted to numbers successfully!")
print("Shape of X_train:", X_train_vectorized.shape)

Text converted to numbers successfully!
Shape of X_train: (4457, 7735)


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the Naive Bayes classifier and train it on the word counts in one
# step (fit() returns the fitted estimator).
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predictions for the held-out messages
y_pred = model.predict(X_test_vectorized)

# Compute the evaluation metrics first, then print them together
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nReport:\n", report)

Accuracy: 0.9838565022421525

Confusion Matrix:
 [[963   2]
 [ 16 134]]

Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [9]:
# Two hand-written sample emails: one spam-like, one ordinary
email_1 = ["Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123 to claim now."]
email_2 = ["Hey, are we still meeting for lunch tomorrow? Let me know."]

# Encode both with the already-fitted vectorizer (transform only, no refit)
email_1_vec, email_2_vec = (
    vectorizer.transform(text) for text in (email_1, email_2)
)

# Classify each email; label 1 means spam, 0 means ham
prediction_1 = model.predict(email_1_vec)[0]
prediction_2 = model.predict(email_2_vec)[0]

print("Email 1 Prediction:", prediction_1)  # Should be 1 (Spam)
print("Email 2 Prediction:", prediction_2)  # Should be 0 (Ham)

Email 1 Prediction: 1
Email 2 Prediction: 0


In [10]:
import pickle

# Persist the classifier and the fitted vectorizer, one pickle file each.
# Both are needed later: the vectorizer to encode new text, the model to classify it.
artifacts = {
    'spam_model.pkl': model,
    'vectorizer.pkl': vectorizer,
}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

print("Both Model and Vectorizer saved successfully!")

Both Model and Vectorizer saved successfully!
