In [8]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import gradio as gr
#from google.colab import drive

# Download required NLTK data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Mount Google Drive
# drive.mount('/content/drive') -- upload the provided csv file in drive to use this line  or option 2
pd.read_csv("spam_ham_dataset.csv")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [7]:
# ============= DATA LOADING AND PREPROCESSING =============
print("Loading dataset...")
#file_path = '/content/drive/MyDrive/SpamEmail_Internship/spam_ham_dataset.csv'
file_path = 'spam_ham_dataset.csv'
df = pd.read_csv(file_path)
df = df[['label', 'text', 'label_num']]

print("\nInitial data shape:", df.shape)

# Remove duplicates and null values
df.drop_duplicates(inplace=True)
df = df.dropna()

print("After cleaning:", df.shape)
print("\nSpam and Ham distribution:\n", df['label'].value_counts())

# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_email(text):
    """Clean and preprocess email text"""
    if not isinstance(text, str):
        return ""
    text = text.replace('\n', ' ')
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9:\/\.]', ' ', text)
    words = text.split()
    words = [word for word in words if word not in stop_words and len(word) > 1]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

# Apply text cleaning
print("\nCleaning text data...")
df['clean_text'] = df['text'].apply(clean_email)


Loading dataset...

Initial data shape: (5171, 3)
After cleaning: (4994, 3)

Spam and Ham distribution:
 label
ham     3531
spam    1463
Name: count, dtype: int64

Cleaning text data...


In [9]:
# ============= MODEL TRAINING =============
print("\nApplying TF-IDF vectorization...")
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.9)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label_num']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining Naive Bayes model...")
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"\nTraining Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save model and vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("\n‚úì Model and vectorizer saved successfully")



Applying TF-IDF vectorization...

Training Naive Bayes model...

Training Accuracy: 0.9742
Test Accuracy: 0.8769

Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       712
           1       1.00      0.57      0.73       287

    accuracy                           0.88       999
   macro avg       0.93      0.79      0.82       999
weighted avg       0.90      0.88      0.86       999


‚úì Model and vectorizer saved successfully


In [10]:
# ============= WEB APP INTERFACE =============
def predict_spam(email_text):
    """Predict if email is spam or ham with confidence score"""
    if not email_text or email_text.strip() == "":
        return "Please enter email text", "", "", None

    # Clean the text
    cleaned = clean_email(email_text)

    # Transform and predict
    X_input = vectorizer.transform([cleaned])
    prediction = model.predict(X_input)
    probability = model.predict_proba(X_input)

    # Get result
    result = "üö® SPAM" if prediction[0] == 1 else "‚úÖ HAM (Not Spam)"
    confidence = max(probability[0]) * 100

    # Create detailed output
    spam_prob = probability[0][1] * 100
    ham_prob = probability[0][0] * 100

    prediction_text = f"**{result}**"
    confidence_text = f"Confidence: **{confidence:.2f}%**\n\n"
    confidence_text += f"- Spam Probability: {spam_prob:.2f}%\n"
    confidence_text += f"- Ham Probability: {ham_prob:.2f}%"

    # Create bar chart
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(8, 5))
    classes = ['Ham', 'Spam']
    probabilities = [ham_prob, spam_prob]
    colors = ['#2ecc71', '#e74c3c']  # Green for Ham, Red for Spam

    bars = ax.bar(classes, probabilities, color=colors, edgecolor='black', linewidth=1.5)

    # Add percentage labels on bars
    for bar, prob in zip(bars, probabilities):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{prob:.2f}%',
                ha='center', va='bottom', fontsize=14, fontweight='bold')

    ax.set_ylabel('Confidence (%)', fontsize=12, fontweight='bold')
    ax.set_xlabel('Class', fontsize=12, fontweight='bold')
    ax.set_title('Prediction Confidence Level', fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(probabilities) + 10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    plt.tight_layout()

    return prediction_text, confidence_text, cleaned, fig

# Create Gradio Interface
with gr.Blocks(theme=gr.themes.Soft(), title="Spam Email Detector") as demo:
    gr.Markdown("""
    # üìß Spam Email Detector
    ### Powered by Machine Learning (Naive Bayes Classifier)

    Enter an email text below to check if it's spam or legitimate (ham).
    """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Email Text",
                placeholder="Enter email content here...",
                lines=10,
                max_lines=15
            )

            with gr.Row():
                submit_btn = gr.Button("üîç Analyze Email", variant="primary")
                clear_btn = gr.ClearButton([input_text], value="Clear")

            # Example emails
            gr.Examples(
                examples=[
                    ["Congratulations!!! You have won a FREE iPhone. Click here to claim: http://scam-link.com"],
                    ["Hi John, can we schedule a meeting tomorrow at 3 PM to discuss the project?"],
                    ["URGENT: Your account will be suspended. Verify your identity now by clicking this link."],
                    ["Dear team, please find attached the quarterly report for your review."],
                    ["Get rich quick! Invest $100 and earn $10000 in just one week! Limited offer!"]
                ],
                inputs=input_text,
                label="Try these examples:"
            )

        with gr.Column():
            prediction_output = gr.Markdown(label="Prediction")
            confidence_output = gr.Markdown(label="Confidence Score")

            # Add plot for confidence comparison
            confidence_plot = gr.Plot(label="Confidence Comparison")

            cleaned_output = gr.Textbox(
                label="Cleaned/Processed Text",
                lines=5,
                max_lines=10,
                interactive=False
            )

    # Model Stats
    with gr.Accordion("üìä Model Performance Stats", open=False):
        gr.Markdown(f"""
        - **Training Accuracy:** {train_score:.4f}
        - **Test Accuracy:** {test_score:.4f}
        - **Total Samples:** {len(df)}
        - **Algorithm:** Multinomial Naive Bayes
        - **Features:** TF-IDF (1-gram and 2-gram)
        """)

    # Connect the button to the function
    submit_btn.click(
        fn=predict_spam,
        inputs=input_text,
        outputs=[prediction_output, confidence_output, cleaned_output, confidence_plot]
    )

# Launch the app
print("\n" + "="*60)
print("üöÄ Launching Spam Email Detector Web App...")
print("="*60)

demo.launch(
    share=True,  # Creates a public URL you can share
    debug=True
)

  with gr.Blocks(theme=gr.themes.Soft(), title="Spam Email Detector") as demo:



üöÄ Launching Spam Email Detector Web App...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a90101afb2580d7bb4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a90101afb2580d7bb4.gradio.live


