<a href="https://colab.research.google.com/github/saanidhi-git/OIBSIP_datascience-4/blob/spam-colab-notebook/Email_spam_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from google.colab import files
import os

# 1. Define folder name and expected CSV path
FOLDER_NAME = 'EMAIL SPAM DETECTION'
ZIP_FILE = 'email spam detection.zip'
DATASET_PATH = f'{FOLDER_NAME}/spam.csv'

# 2. Upload the zip file
print(f"Please upload your '{ZIP_FILE}' file now:")
uploaded = files.upload()

# 3. Unzip the file
if ZIP_FILE in uploaded:
    print(f"\nUnzipping the project folder '{ZIP_FILE}'...")
    # Use -o option to overwrite existing files if you re-run
    !unzip -o "{ZIP_FILE}"
else:
    print(f"\nERROR: Did not find '{ZIP_FILE}' in uploaded files.")
    print("Please ensure the zip file is uploaded correctly before proceeding.")

# 4. Verify file path (Optional)
print(f"\nVerifying if spam.csv is at: {DATASET_PATH}")
if os.path.exists(DATASET_PATH):
    print("Success! File found.")
else:
    print("ERROR: File not found at the expected path. Check your zip contents.")

Please upload your 'email spam detection.zip' file now:


Saving email spam detection.zip to email spam detection.zip

Unzipping the project folder 'email spam detection.zip'...
Archive:  email spam detection.zip
   creating: email spam detection/
  inflating: email spam detection/app.py  
  inflating: email spam detection/spam.csv  
  inflating: email spam detection/spam_detector.py  

Verifying if spam.csv is at: EMAIL SPAM DETECTION/spam.csv
ERROR: File not found at the expected path. Check your zip contents.


In [2]:

#  2: IMPORTS, NLTK , and Global Definitions (FIXED)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

print("Checking NLTK dependencies...")

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
    try:
        nltk.data.find('corpora/omw-1.4')
    except LookupError:
        nltk.download('omw-1.4')

print("All NLTK dependencies checked/downloaded!")

# --- Define Global Variables (MUST BE DEFINED HERE) ---
# This path needs to match the structure created by your unzip step (Cell 1)
# FOLDER_NAME: EMAIL SPAM DETECTION
DATASET_PATH =  'email spam detection/spam.csv'
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


print("All imports and dependencies are ready!")

Checking NLTK dependencies...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


All NLTK dependencies checked/downloaded!
All imports and dependencies are ready!


In [3]:

# 3: DATA LOADING AND LABEL ENCODING


import pandas as pd

try:

    df = pd.read_csv(DATASET_PATH, sep=',', encoding='latin-1', header=None)


    df = df.iloc[:, :2]
    df.columns = ['label', 'message']


    df['label'] = df['label'].astype(str).str.strip().str.lower()

    # Label Encoding: Convert 'ham'/'spam' into 0/1 for the model
    mapping = {'ham': 0, 'spam': 1}
    df['label_encoded'] = df['label'].map(mapping)

    # Detect any unmapped/invalid labels and drop them
    unmapped = df[df['label_encoded'].isna()]
    if not unmapped.empty:
        print(f"\nWARNING: Dropping {unmapped.shape[0]} rows with unmappable labels.")
        df = df.drop(unmapped.index).reset_index(drop=True)

    print("\nData Loading Complete.")
    spam_count = (df['label'] == 'spam').sum()
    print(f"Total messages loaded: {df.shape[0]}. Spam messages: {spam_count}")
    print("\nDataFrame Head:")
    print(df.head())

except FileNotFoundError:
    print(f"\nCRITICAL ERROR: File not found at '{DATASET_PATH}'. Please check Cell 1.")
    exit()



Data Loading Complete.
Total messages loaded: 5572. Spam messages: 747

DataFrame Head:
  label                                            message  label_encoded
0   ham  Go until jurong point, crazy.. Available only ...            0.0
1   ham                      Ok lar... Joking wif u oni...            0.0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...            1.0
3   ham  U dun say so early hor... U c already then say...            0.0
4   ham  Nah I don't think he goes to usf, he lives aro...            0.0


In [4]:

 #4: TEXT PREPROCESSING AND CLEANING


def text_cleaning(text):
    """
    Cleans the input text by removing punctuation, converting to lowercase,
    removing stopwords, and applying lemmatization.
    """

    text = text.translate(str.maketrans('', '', string.punctuation))

    #  Tokenize and convert to lowercase
    tokens = text.lower().split()

    cleaned_tokens = []
    for word in tokens:
        #  Remove stopwords and check if purely alphabetic
        if word not in stop_words and word.isalpha():
            #  Apply Lemmatization
            cleaned_tokens.append(lemmatizer.lemmatize(word))

    #  Join the cleaned words back into a single string
    return " ".join(cleaned_tokens)

# Apply the cleaning function to the entire 'message' column
df['cleaned_message'] = df['message'].apply(text_cleaning)
print("Text Cleaning and Lemmatization Complete.")

# Display cleaned messages
print("\nExample of Cleaned Data:")
print(df[['message', 'cleaned_message']].head())

Text Cleaning and Lemmatization Complete.

Example of Cleaned Data:
                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts may...  
3                u dun say early hor u c already say  
4           nah dont think go usf life around though  


In [5]:

# 5: TF-IDF AND MODEL TRAINING


#  Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_message']).toarray()
y = df['label_encoded'] # Our target variable (0s and 1s)

print("\nTF-IDF Vectorization Complete.")
print(f"Feature matrix (X) shape: {X.shape} (5000 most important words used as features)")

#  Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"\nData Split Complete. Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

#  Train the Multinomial Naive Bayes model


model = MultinomialNB()
model.fit(X_train, y_train)

print("Model Training Complete (Multinomial Naive Bayes)")


TF-IDF Vectorization Complete.
Feature matrix (X) shape: (5572, 5000) (5000 most important words used as features)

Data Split Complete. Training samples: 4457, Testing samples: 1115
Model Training Complete (Multinomial Naive Bayes)


In [6]:

# 6: MODEL EVALUATION AND LIVE PREDICTION



# 1. Model Evaluation
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=['Ham (0)', 'Spam (1)'])

print("\n--- Model Evaluation Results ---")
print(f"Overall Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix (Mistake Analysis):")
# Reads as: (True Ham / Predicted Ham), (True Ham / Predicted Spam)
#           (True Spam / Predicted Ham), (True Spam / Predicted Spam)
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


# 2. Live Prediction Function
def predict_message(message):
    """Cleans, vectorizes, and predicts the class of a single message."""
    # Ensure the cleaning is identical to the training phase
    cleaned_msg = text_cleaning(message)

    # Vectorize using the *fitted* vectorizer
    X_new = vectorizer.transform([cleaned_msg])

    # Predict the label
    prediction = model.predict(X_new)[0]

    return "SPAM 🚨" if prediction == 1 else "HAM ✅"

# Test cases
test_messages = [
    "URGENT! You have won $1000 cash prize! Claim now by texting back!",
    "Hey, what time are we meeting for coffee tomorrow?",
    "Congrats! Get your FREE entry to the final draw. Call 0800-456-789.",
    "Did you remember to send the final report to the manager?",
    "Your account has been suspended. Click this link to verify.",
]

print("\n" + "="*50)
print("             LIVE PREDICTION TEST")
print("="*50)

for msg in test_messages:
    result = predict_message(msg)
    print(f"CLASSIFICATION: {result}")
    print(f"MESSAGE: '{msg}'\n")


--- Model Evaluation Results ---
Overall Accuracy: 0.9668

Confusion Matrix (Mistake Analysis):
[[964   1]
 [ 36 114]]

Classification Report:
              precision    recall  f1-score   support

     Ham (0)       0.96      1.00      0.98       965
    Spam (1)       0.99      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


             LIVE PREDICTION TEST
CLASSIFICATION: SPAM 🚨
MESSAGE: 'URGENT! You have won $1000 cash prize! Claim now by texting back!'

CLASSIFICATION: HAM ✅
MESSAGE: 'Hey, what time are we meeting for coffee tomorrow?'

CLASSIFICATION: SPAM 🚨
MESSAGE: 'Congrats! Get your FREE entry to the final draw. Call 0800-456-789.'

CLASSIFICATION: HAM ✅
MESSAGE: 'Did you remember to send the final report to the manager?'

CLASSIFICATION: SPAM 🚨
MESSAGE: 'Your account has been suspended. Click this link to verify.'

