<a href="https://colab.research.google.com/github/priyanshita/spam-mail-detection/blob/main/SPAM_MAIL_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import io
import re
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Load the dataset
from google.colab import files

# files.upload() hands back the uploaded content as bytes, keyed by filename.
# Colab renames the saved copy when the name already exists on disk (the run
# log shows "Saving spam.csv to spam (2).csv"), so reading a hard-coded
# 'spam.csv' from disk can silently parse an older upload. Parse the bytes
# that were just uploaded instead.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; upload spam.csv and re-run this cell.")
uploaded_name = next(iter(uploaded))  # first (normally the only) uploaded file
raw_df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), encoding='latin-1')

# Display basic info
print("Dataset Shape:", raw_df.shape)
print("\nFirst few rows:")
print(raw_df.head())
print("\nColumn names:", raw_df.columns.tolist())

# Clean column names if needed (spam.csv often has extra columns)
# Typically has columns like: v1 (label) and v2 (message)
# .copy() makes df independent of raw_df so later column assignments never
# write through to (or warn about) the raw frame.
df = raw_df.iloc[:, :2].copy()  # Keep only first 2 columns
df.columns = ['label', 'message']

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Remove duplicates (rebinding instead of inplace=True keeps this step
# free of in-place mutation; the surviving rows and index are the same)
df = df.drop_duplicates()
print(f"\nDataset shape after removing duplicates: {df.shape}")

# Check class distribution
print("\nClass distribution:")
print(df['label'].value_counts())

# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw message into lowercase, letters-only, single-spaced text.

    Processing steps, in order:
      1. lowercase the text;
      2. delete URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace);
      3. delete every character that is not an ASCII letter or whitespace
         (digits, punctuation, symbols);
      4. collapse runs of whitespace to single spaces and trim the ends.

    Args:
        text: The raw message. ``None`` and float NaN (what pandas uses for
            missing cells) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned message; may be the empty string.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs ('http\S+' already covers 'https...', so no separate branch)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    return ' '.join(text.split())

# Apply preprocessing
df['cleaned_message'] = df['message'].apply(preprocess_text)

# Encode labels (ham=0, spam=1)
label_map = {'ham': 0, 'spam': 1}
df['label_encoded'] = df['label'].map(label_map)

# Series.map() turns any label that is not a key of label_map into NaN.
# Fail here with a clear message instead of letting NaN targets reach the
# split / model fitting below.
unmapped_mask = df['label_encoded'].isna()
if unmapped_mask.any():
    unexpected = sorted(df.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values (expected 'ham' or 'spam'): {unexpected}")

# Split data (stratified so train and test keep the same ham/spam ratio)
X = df['cleaned_message']
y = df['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
assert len(X_train) + len(X_test) == len(X)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Vectorization using TF-IDF
# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=3000, min_df=2, max_df=0.8, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"\nFeature matrix shape: {X_train_vec.shape}")

# Train multiple models
def train_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and print its accuracy and classification report.

    Args:
        title: Heading line printed before the metrics.
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        X_fit, y_fit: Features and targets used for fitting.
        X_eval, y_eval: Features and targets used for evaluation.

    Returns:
        The predictions of the fitted model on ``X_eval``.
    """
    print(title)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(f"Accuracy: {accuracy_score(y_eval, predictions):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_eval, predictions, target_names=['Ham', 'Spam']))
    return predictions


print("\n" + "="*50)
print("MODEL TRAINING AND EVALUATION")
print("="*50)

# 1. Naive Bayes
nb_model = MultinomialNB()
nb_pred = train_and_report(
    "\n1. Multinomial Naive Bayes:", nb_model, X_train_vec, y_train, X_test_vec, y_test
)

# 2. Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_pred = train_and_report(
    "\n2. Logistic Regression:", lr_model, X_train_vec, y_train, X_test_vec, y_test
)

# 3. SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_pred = train_and_report(
    "\n3. Support Vector Machine:", svm_model, X_train_vec, y_train, X_test_vec, y_test
)

# Confusion Matrix for best model (usually NB or LR)
print("\n" + "="*50)
print("CONFUSION MATRIX (Naive Bayes)")
print("="*50)
# labels=[0, 1] pins the row/column order (ham first, spam second) so the
# TN/FP/FN/TP legend printed below always matches the matrix layout.
cm = confusion_matrix(y_test, nb_pred, labels=[0, 1])
print(cm)
print("\n[[TN  FP]")
print(" [FN  TP]]")

# Function to predict new messages
def predict_message(message, model=nb_model, vectorizer=vectorizer):
    """Classify a single raw message as spam or ham.

    The message is cleaned with ``preprocess_text``, vectorized with
    ``vectorizer`` and passed to ``model``.

    Note: the default ``model`` and ``vectorizer`` are bound once, when this
    function is defined. After retraining, re-run this definition or pass the
    new objects explicitly.

    Args:
        message: Raw message text.
        model: Fitted classifier exposing ``predict`` (and, for a confidence
            value, ``predict_proba`` and ``classes_``).
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is ``"SPAM"`` when the
        predicted class is 1 and ``"HAM"`` otherwise, and ``confidence`` is
        the predicted class's probability as a percentage. ``confidence`` is
        NaN when the model cannot produce probabilities.
    """
    cleaned = preprocess_text(message)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]

    label = "SPAM" if prediction == 1 else "HAM"

    if hasattr(model, "predict_proba"):
        probability = model.predict_proba(vectorized)[0]
        # Probability columns are ordered like model.classes_, so look up the
        # column of the predicted class instead of assuming that the class
        # label doubles as the column index.
        class_index = list(model.classes_).index(prediction)
        confidence = probability[class_index] * 100
    else:
        # e.g. an SVC built without probability estimates: still return the
        # label rather than failing with AttributeError.
        confidence = float("nan")

    return label, confidence

# Test with sample messages
print("\n" + "="*50)
print("TESTING WITH SAMPLE MESSAGES")
print("="*50)

test_messages = [
    "Congratulations! You've won a $1000 gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT: Your account has been compromised. Verify your details immediately!",
    "Can you send me the project report by EOD?"
]

PREVIEW_CHARS = 60  # longest message prefix shown in the printed preview

for msg in test_messages:
    label, conf = predict_message(msg)
    # Only add the ellipsis when the message was actually cut; short
    # messages are shown in full without a misleading "...".
    preview = msg if len(msg) <= PREVIEW_CHARS else msg[:PREVIEW_CHARS] + "..."
    print(f"\nMessage: {preview}")
    print(f"Prediction: {label} (Confidence: {conf:.2f}%)")

print("\n" + "="*50)
print("Model training complete!")
print("="*50)

Saving spam.csv to spam (2).csv
Dataset Shape: (5572, 5)

First few rows:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Column names: ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

Missing values:
label      0
message    0
dtype: int64

Dataset shape after removing duplicates: (5169, 2)

Class distribution:
label
ham     4516
spam     653
Name: count, dtype: int64

Training set size: 4135
Test set size: 1034

Feature matrix shape: (4135,