In [1]:
import pandas as pd
import numpy as np
import re
import string
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lime.lime_text import LimeTextExplainer

# 1️⃣ Load the datasets (Modify the filenames accordingly)
file_path_1 = "CEAS_08.csv"  # Update with actual filename
file_path_2 = "Phishing_Email_Analysis.csv"  # Update with actual filename

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# Combine both datasets
df = pd.concat([df1, df2], ignore_index=True)

# 2️⃣ Data Cleaning & Preprocessing
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text.strip()

df["subject_clean"] = df["subject"].apply(clean_text)
df["body_clean"] = df["body"].apply(clean_text)

# Combine subject and body for training
df["combined_text"] = df["subject_clean"] + " " + df["body_clean"]

# 3️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# 4️⃣ TF-IDF Vectorization + Logistic Regression Model
model_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),  # Convert text to numerical data
    ("classifier", LogisticRegression())  # Train logistic regression model
])

# Train model
model_pipeline.fit(X_train, y_train)

# Save the model
joblib.dump(model_pipeline, "/mnt/data/phishing_email_model.pkl")

# 5️⃣ Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 6️⃣ Function for Email Prediction with Explanation
def analyze_email(email_subject, email_body):
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]
    
    # Interpret the prediction
    phishing_prob = confidence[1] * 100  # Percentage probability of phishing
    legit_prob = confidence[0] * 100  # Percentage probability of legitimate
    prediction = "Phishing" if phishing_prob > legit_prob else "Legitimate"
    
    # LIME Explainer for explanations
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    
    explanation = exp.as_list()
    
    return {
        "classification": prediction,
        "confidence": phishing_prob if prediction == "Phishing" else legit_prob,
        "explanation": explanation
    }

# 7️⃣ Test on a Sample Email
sample_email = {
    "subject": "Urgent: Verify your bank details now!",
    "body": "Your account has been flagged. Please verify your details immediately at http://fakebank.com/login."
}

result = analyze_email(sample_email["subject"], sample_email["body"])
print(result)


ModuleNotFoundError: No module named 'lime'

In [3]:
pip install lime


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lime
  Building wheel for lime (setup.py): started
  Building wheel for lime (setup.py): finished with status 'done'
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283886 sha256=4a6ab40ec4b3a596862dee6f6a5e6cc78baa86bce738c5a2e3c0f3e825fa3c2b
  Stored in directory: c:\users\nishant\appdata\local\pip\cache\wheels\85\fa\a3\9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import re
import string
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lime.lime_text import LimeTextExplainer

# 1️⃣ Load the datasets (Modify the filenames accordingly)
file_path_1 = "CEAS_08.csv"  # Update with actual filename
file_path_2 = "Phishing_Email_Analysis.csv"  # Update with actual filename

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# Combine both datasets
df = pd.concat([df1, df2], ignore_index=True)

# 2️⃣ Data Cleaning & Preprocessing
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text.strip()

df["subject_clean"] = df["subject"].apply(clean_text)
df["body_clean"] = df["body"].apply(clean_text)

# Combine subject and body for training
df["combined_text"] = df["subject_clean"] + " " + df["body_clean"]

# 3️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# 4️⃣ TF-IDF Vectorization + Logistic Regression Model
model_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),  # Convert text to numerical data
    ("classifier", LogisticRegression())  # Train logistic regression model
])

# Train model
model_pipeline.fit(X_train, y_train)

# Save the model
joblib.dump(model_pipeline, "/mnt/data/phishing_email_model.pkl")

# 5️⃣ Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 6️⃣ Function for Email Prediction with Explanation
def analyze_email(email_subject, email_body):
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]
    
    # Interpret the prediction
    phishing_prob = confidence[1] * 100  # Percentage probability of phishing
    legit_prob = confidence[0] * 100  # Percentage probability of legitimate
    prediction = "Phishing" if phishing_prob > legit_prob else "Legitimate"
    
    # LIME Explainer for explanations
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    
    explanation = exp.as_list()
    
    return {
        "classification": prediction,
        "confidence": phishing_prob if prediction == "Phishing" else legit_prob,
        "explanation": explanation
    }

# 7️⃣ Test on a Sample Email
sample_email = {
    "subject": "Urgent: Verify your bank details now!",
    "body": "Your account has been flagged. Please verify your details immediately at http://fakebank.com/login."
}

result = analyze_email(sample_email["subject"], sample_email["body"])
print(result)


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/phishing_email_model.pkl'

In [6]:
import joblib

# Save the model in the current working directory
joblib.dump(model_pipeline, "phishing_email_model.pkl")


['phishing_email_model.pkl']

In [10]:
import pandas as pd
import numpy as np
import re
import string
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lime.lime_text import LimeTextExplainer
import urllib.parse

# 1️⃣ Load the datasets (Modify filenames)
file_path_1 = "CEAS_08.csv"  
file_path_2 = "Phishing_Email_Analysis.csv"  

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# Combine both datasets
df = pd.concat([df1, df2], ignore_index=True)

# 2️⃣ Enhanced Data Cleaning & Preprocessing
def clean_text(text):
    """Clean email subject & body (remove URLs, punctuation, numbers, lowercase)"""
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text.strip()

df["subject_clean"] = df["subject"].apply(clean_text)
df["body_clean"] = df["body"].apply(clean_text)

# Combine subject and body for training
df["combined_text"] = df["subject_clean"] + " " + df["body_clean"]

# 3️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# 4️⃣ TF-IDF Vectorization + Logistic Regression Model
model_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),  
    ("classifier", LogisticRegression(solver="liblinear"))  
])

# Train model
model_pipeline.fit(X_train, y_train)

# Save the model
joblib.dump(model_pipeline, "phishing_email_model.pkl")

# 5️⃣ Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# 6️⃣ Enhanced Email Analysis Function
def analyze_email(email_subject, email_body):
    """
    Analyzes an email's subject & body, detects phishing, and explains why.
    """
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]

    phishing_prob = confidence[1] * 100  
    legit_prob = confidence[0] * 100  
    prediction = "Phishing" if phishing_prob > legit_prob else "Legitimate"

    # 🔍 **LIME Explainer**
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    explanation = exp.as_list()

    # 🔗 **URL Analysis**
    urls = re.findall(r'(https?://\S+)', email_body)
    suspicious_urls = [url for url in urls if "bank" in url or "secure" in url or "verify" in url]

    # 🚨 **Phishing Keywords**
    phishing_keywords = ["verify", "account", "urgent", "security", "confirm", "click here", "suspended"]
    flagged_words = [word for word in input_text.split() if word in phishing_keywords]

    # 📊 **Final Report**
    result = {
        "Prediction": prediction,
        "Confidence": f"{phishing_prob:.2f}%" if prediction == "Phishing" else f"{legit_prob:.2f}%",
        "Explanation": explanation,
        "URLs Found": urls,
        "Suspicious URLs": suspicious_urls,
        "Flagged Words": flagged_words
    }
    return result


# 7️⃣ **Test with Example Email**
test_email = {
    "subject": "Urgent: Your account has been compromised!",
    "body": "We detected suspicious activity. Please verify your identity now at http://securebank-verify.com"
}

result = analyze_email(test_email["subject"], test_email["body"])

print("\n📩 **Email Analysis Result**")
print("📌 Prediction:", result["Prediction"])
print("🎯 Confidence:", result["Confidence"])
print("🔍 Explanation:\n", result["Explanation"])
print("🔗 URLs Found:", result["URLs Found"])
print("⚠️ Suspicious URLs:", result["Suspicious URLs"])
print("🚨 Flagged Words:", result["Flagged Words"])


Accuracy: 0.9957859788021964
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6925
           1       1.00      1.00      1.00      8737

    accuracy                           1.00     15662
   macro avg       1.00      1.00      1.00     15662
weighted avg       1.00      1.00      1.00     15662


📩 **Email Analysis Result**
📌 Prediction: Phishing
🎯 Confidence: 79.58%
🔍 Explanation:
 [('urgent', 0.07365591314642927), ('detected', -0.07138662745775409), ('account', -0.05397428018235597), ('activity', 0.031069414646774936), ('identity', -0.029319570267489138)]
🔗 URLs Found: ['http://securebank-verify.com']
⚠️ Suspicious URLs: ['http://securebank-verify.com']
🚨 Flagged Words: ['urgent', 'account', 'verify']


In [12]:
import pandas as pd
import numpy as np
import re
import string
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lime.lime_text import LimeTextExplainer

# 1️⃣ Load the datasets (Modify filenames)
file_path_1 = "CEAS_08.csv"  
file_path_2 = "Phishing_Email_Analysis.csv"  

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# Combine both datasets
df = pd.concat([df1, df2], ignore_index=True)

# 2️⃣ Enhanced Data Cleaning & Preprocessing
def clean_text(text):
    """Clean email subject & body (remove URLs, punctuation, numbers, lowercase)"""
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text.strip()

df["subject_clean"] = df["subject"].apply(clean_text)
df["body_clean"] = df["body"].apply(clean_text)

# Combine subject and body for training
df["combined_text"] = df["subject_clean"] + " " + df["body_clean"]

# 3️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# 4️⃣ TF-IDF Vectorization + Logistic Regression Model
model_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),  
    ("classifier", LogisticRegression(solver="liblinear"))  
])

# Train model
model_pipeline.fit(X_train, y_train)

# Save the model
joblib.dump(model_pipeline, "phishing_email_model.pkl")

# 5️⃣ Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 6️⃣ Enhanced Email Analysis Function
def analyze_email(email_subject, email_body):
    """
    Analyzes an email's subject & body, detects phishing, and explains why.
    """
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]

    phishing_prob = confidence[1] * 100  
    legit_prob = confidence[0] * 100  
    prediction = "Phishing" if phishing_prob > legit_prob else "Legitimate"

    # 🔍 **LIME Explainer**
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    explanation = exp.as_list()

    # 🔗 **URL Analysis**
    urls = re.findall(r'(https?://\S+)', email_body)
    suspicious_urls = [url for url in urls if "bank" in url or "secure" in url or "verify" in url]

    # 🚨 **Phishing Keywords**
    phishing_keywords = ["verify", "account", "urgent", "security", "confirm", "click here", "suspended"]
    flagged_words = [word for word in input_text.split() if word in phishing_keywords]

    # 📊 **Final Report**
    result = {
        "Prediction": prediction,
        "Confidence": f"{phishing_prob:.2f}%" if prediction == "Phishing" else f"{legit_prob:.2f}%",
        "Explanation": explanation,
        "URLs Found": urls,
        "Suspicious URLs": suspicious_urls,
        "Flagged Words": flagged_words
    }
    return result


# 7️⃣ **Test with Example Email**
test_email = {
    "subject": "Urgent: Your account has been compromised!",
    "body": "We detected suspicious activity. Please verify your identity now at http://securebank-verify.com"
}

result = analyze_email(test_email["subject"], test_email["body"])

print("\n📩 **Email Analysis Result**")
print("📌 Prediction:", result["Prediction"])
print("🎯 Confidence:", result["Confidence"])
print("🔍 Explanation:\n", result["Explanation"])
print("🔗 URLs Found:", result["URLs Found"])
print("⚠️ Suspicious URLs:", result["Suspicious URLs"])
print("🚨 Flagged Words:", result["Flagged Words"])


Accuracy: 0.9957859788021964
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6925
           1       1.00      1.00      1.00      8737

    accuracy                           1.00     15662
   macro avg       1.00      1.00      1.00     15662
weighted avg       1.00      1.00      1.00     15662


📩 **Email Analysis Result**
📌 Prediction: Phishing
🎯 Confidence: 79.58%
🔍 Explanation:
 [('urgent', 0.07429990012736311), ('detected', -0.07205503370305869), ('account', -0.05422252742051346), ('activity', 0.03219197837649734), ('identity', -0.028929899706333882)]
🔗 URLs Found: ['http://securebank-verify.com']
⚠️ Suspicious URLs: ['http://securebank-verify.com']
🚨 Flagged Words: ['urgent', 'account', 'verify']


In [22]:
# Debug: Print raw output of one sample
debug_result = analyze_email("Test Subject", "Test body with suspicious link http://fake-url.com")
print(debug_result)


{'Prediction': 'Legitimate', 'Confidence': '87.56%', 'Explanation': [('test', -0.38462414944727297), ('subject', -0.15401355131437522), ('link', -0.03592320896644274), ('body', -0.01681644916830811), ('suspicious', -0.007094499498575421)], 'URLs Found': ['http://fake-url.com'], 'Suspicious URLs': [], 'Flagged Words': []}


In [28]:
TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))


In [32]:
def analyze_email(email_subject, email_body):
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]

    phishing_prob = confidence[1] * 100  # Probability of phishing
    legit_prob = confidence[0] * 100     # Probability of legitimate

    # 👉 Adjusted threshold (default is usually 50%)
    prediction = "Phishing" if phishing_prob >= 60 else "Legitimate"

    # Explanation via LIME
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    explanation = exp.as_list()

    return {
        "Prediction": prediction,
        "Confidence": phishing_prob if prediction == "Phishing" else legit_prob,
        "Explanation": explanation,
        "URLs Found": re.findall(r"http[s]?://\S+", email_body),
        "Suspicious URLs": [url for url in re.findall(r"http[s]?://\S+", email_body) if "secure" in url or "login" in url],
        "Flagged Words": [word for word in ["verify", "login", "click", "action", "free", "reward", "win"] if word in input_text]
    }


In [34]:
# ✅ Legitimate Emails
test_emails = [
    {
        "subject": "Your Google account security settings have been updated",
        "body": "We've noticed a new sign-in from a known device. No action is needed if this was you. Visit https://myaccount.google.com/security for details."
    },
    {
        "subject": "Receipt from Apple",
        "body": "Your payment for Apple Music has been processed successfully. View or manage your subscription here: https://appleid.apple.com"
    },
    {
        "subject": "Thanks for shopping with Amazon!",
        "body": "Your order has been shipped and will arrive soon. You can track your package using this link: https://www.amazon.com/track-order"
    },
    
    # 🚨 Phishing Emails
    {
        "subject": "Immediate Action Required: Your Netflix Payment Failed",
        "body": "We're unable to process your payment. Update your billing info at http://netflix-billing.com/update to avoid account suspension."
    },
    {
        "subject": "Security Alert: Suspicious Login Attempt Detected",
        "body": "We have noticed suspicious activity in your bank account. Please confirm your identity now at http://securebank-verify.com"
    },
    {
        "subject": "Congrats! You've Won a Free iPhone 14",
        "body": "Click now to claim your reward: http://winfreeiphone.com. This offer is only valid for 24 hours!"
    }
]


In [36]:
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    result = analyze_email(email["subject"], email["body"])
    
    print(f"📌 Prediction: {result['Prediction']}")
    print(f"🎯 Confidence: {result['Confidence']:.2f}%")
    print(f"🔍 Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 70.04%
🔍 Explanation:
   - 'google' → -0.1903 (↓ legitimate)
   - 'security' → -0.1163 (↓ legitimate)
   - 'action' → -0.0812 (↓ legitimate)
   - 'device' → -0.0712 (↓ legitimate)
   - 'settings' → 0.0574 (↑ phishing)

📩 Test Email #2
📌 Prediction: Phishing
🎯 Confidence: 71.45%
🔍 Explanation:
   - 'payment' → 0.2916 (↑ phishing)
   - 'music' → -0.1438 (↓ legitimate)
   - 'subscription' → -0.1084 (↓ legitimate)
   - 'view' → -0.0894 (↓ legitimate)
   - 'apple' → 0.0356 (↑ phishing)

📩 Test Email #3
📌 Prediction: Legitimate
🎯 Confidence: 74.13%
🔍 Explanation:
   - 'thanks' → -0.3796 (↓ legitimate)
   - 'using' → -0.1505 (↓ legitimate)
   - 'shopping' → 0.0552 (↑ phishing)
   - 'amazon' → 0.0509 (↑ phishing)
   - 'track' → -0.0458 (↓ legitimate)

📩 Test Email #4
📌 Prediction: Phishing
🎯 Confidence: 88.57%
🔍 Explanation:
   - 'payment' → 0.4082 (↑ phishing)
   - 'failed' → -0.0913 (↓ legitimate)
   - 'update' → -0.0617 (↓ legitimate)


In [38]:
def trusted_domain_present(body):
    trusted_domains = ["apple.com", "google.com", "amazon.com", "myaccount.google.com", "appleid.apple.com"]
    urls = re.findall(r"http[s]?://\S+", body)
    return any(any(domain in url for domain in trusted_domains) for url in urls)


In [40]:
# ✅ Legitimate Emails
test_emails = [
    {
        "subject": "Your Google account security settings have been updated",
        "body": "We've noticed a new sign-in from a known device. No action is needed if this was you. Visit https://myaccount.google.com/security for details."
    },
    {
        "subject": "Receipt from Apple",
        "body": "Your payment for Apple Music has been processed successfully. View or manage your subscription here: https://appleid.apple.com"
    },
    {
        "subject": "Thanks for shopping with Amazon!",
        "body": "Your order has been shipped and will arrive soon. You can track your package using this link: https://www.amazon.com/track-order"
    },
    
    # 🚨 Phishing Emails
    {
        "subject": "Immediate Action Required: Your Netflix Payment Failed",
        "body": "We're unable to process your payment. Update your billing info at http://netflix-billing.com/update to avoid account suspension."
    },
    {
        "subject": "Security Alert: Suspicious Login Attempt Detected",
        "body": "We have noticed suspicious activity in your bank account. Please confirm your identity now at http://securebank-verify.com"
    },
    {
        "subject": "Congrats! You've Won a Free iPhone 14",
        "body": "Click now to claim your reward: http://winfreeiphone.com. This offer is only valid for 24 hours!"
    }
]


In [42]:
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    result = analyze_email(email["subject"], email["body"])
    
    print(f"📌 Prediction: {result['Prediction']}")
    print(f"🎯 Confidence: {result['Confidence']:.2f}%")
    print(f"🔍 Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 70.04%
🔍 Explanation:
   - 'google' → -0.1914 (↓ legitimate)
   - 'security' → -0.1165 (↓ legitimate)
   - 'action' → -0.0790 (↓ legitimate)
   - 'device' → -0.0721 (↓ legitimate)
   - 'settings' → 0.0574 (↑ phishing)

📩 Test Email #2
📌 Prediction: Phishing
🎯 Confidence: 71.45%
🔍 Explanation:
   - 'payment' → 0.2909 (↑ phishing)
   - 'music' → -0.1427 (↓ legitimate)
   - 'subscription' → -0.1086 (↓ legitimate)
   - 'view' → -0.0889 (↓ legitimate)
   - 'apple' → 0.0345 (↑ phishing)

📩 Test Email #3
📌 Prediction: Legitimate
🎯 Confidence: 74.13%
🔍 Explanation:
   - 'thanks' → -0.3795 (↓ legitimate)
   - 'using' → -0.1492 (↓ legitimate)
   - 'shopping' → 0.0590 (↑ phishing)
   - 'amazon' → 0.0516 (↑ phishing)
   - 'track' → -0.0422 (↓ legitimate)

📩 Test Email #4
📌 Prediction: Phishing
🎯 Confidence: 88.57%
🔍 Explanation:
   - 'payment' → 0.4096 (↑ phishing)
   - 'failed' → -0.0910 (↓ legitimate)
   - 'update' → -0.0606 (↓ legitimate)


In [44]:
def trusted_domain_present(text):
    trusted_domains = [
        "apple.com", "google.com", "amazon.com", "spotify.com", 
        "myaccount.google.com", "appleid.apple.com", "bbc.co.uk", 
        "linkedin.com", "twitter.com", "instagram.com", "tesco.com"
    ]
    urls = re.findall(r"http[s]?://[^\s]+", text)
    return any(any(domain in url for domain in trusted_domains) for url in urls)


def analyze_email(email_subject, email_body):
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]
    
    phishing_prob = confidence[1] * 100
    legit_prob = confidence[0] * 100
    prediction = "Phishing" if phishing_prob > legit_prob else "Legitimate"

    # Adjust prediction if trusted domain is detected in legitimate context
    if prediction == "Phishing" and trusted_domain_present(email_body):
        if phishing_prob < 85:  # adjustable threshold
            prediction = "Legitimate"

    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)

    explanation = exp.as_list()
    urls_found = re.findall(r"http[s]?://[^\s]+", email_body)
    flagged_urls = [url for url in urls_found if not trusted_domain_present(url)]
    phishing_keywords = ["urgent", "verify", "account", "login", "suspend", "click", "won", "free", "payment"]

    flagged_words = [word for word, _ in explanation if word in phishing_keywords]

    return {
        "Prediction": prediction,
        "Confidence": f"{phishing_prob if prediction == 'Phishing' else legit_prob:.2f}%",
        "Explanation": explanation,
        "URLs Found": urls_found,
        "Suspicious URLs": flagged_urls,
        "Flagged Words": flagged_words,
    }


In [48]:
# ✅ Legitimate Emails
test_emails = [
    {
        "subject": "Your Google account security settings have been updated",
        "body": "We've noticed a new sign-in from a known device. No action is needed if this was you. Visit https://myaccount.google.com/security for details."
    },
    {
        "subject": "Receipt from Apple",
        "body": "Your payment for Apple Music has been processed successfully. View or manage your subscription here: https://appleid.apple.com"
    },
    {
        "subject": "Thanks for shopping with Amazon!",
        "body": "Your order has been shipped and will arrive soon. You can track your package using this link: https://www.amazon.com/track-order"
    },
    
    # 🚨 Phishing Emails
    {
        "subject": "Immediate Action Required: Your Netflix Payment Failed",
        "body": "We're unable to process your payment. Update your billing info at http://netflix-billing.com/update to avoid account suspension."
    },
    {
        "subject": "Security Alert: Suspicious Login Attempt Detected",
        "body": "We have noticed suspicious activity in your bank account. Please confirm your identity now at http://securebank-verify.com"
    },
    {
        "subject": "Congrats! You've Won a Free iPhone 14",
        "body": "Click now to claim your reward: http://winfreeiphone.com. This offer is only valid for 24 hours!"
    }
]


In [56]:
result = analyze_email(email["subject"], email["body"])
theory_explanation = generate_theoretical_explanation(result["Explanation"])

print("📌 Prediction:", result["Prediction"])
print("🎯 Confidence:", result["Confidence"])
print("🔍 Keywords Explanation:")
for word, weight in result["Explanation"]:
    direction = "↑ phishing" if weight > 0 else "↓ legitimate"
    print(f"   - '{word}' → {weight:.4f} ({direction})")

print("\n🧠 Theoretical Explanation:")
print(theory_explanation)


NameError: name 'generate_theoretical_explanation' is not defined

In [58]:
def generate_theoretical_explanation(explanation_list):
    phishing_terms = []
    legit_terms = []

    for word, weight in explanation_list:
        if weight > 0:
            phishing_terms.append(word)
        else:
            legit_terms.append(word)

    explanation = "This email was classified based on important words found in its content. "

    if phishing_terms:
        explanation += (
            f"The presence of terms like {', '.join(phishing_terms)} contributed towards it being identified as *phishing*, "
            "as these words are often associated with fraudulent activity or urgency. "
        )

    if legit_terms:
        explanation += (
            f"On the other hand, terms like {', '.join(legit_terms)} are more typical in *legitimate* communication, "
            "which helped reduce suspicion. "
        )

    if not phishing_terms and not legit_terms:
        explanation += "However, no strong indicative words were found, so the prediction is based on overall language patterns."

    return explanation


In [60]:
# ✅ Legitimate Emails
test_emails = [
    {
        "subject": "Your Google account security settings have been updated",
        "body": "We've noticed a new sign-in from a known device. No action is needed if this was you. Visit https://myaccount.google.com/security for details."
    },
    {
        "subject": "Receipt from Apple",
        "body": "Your payment for Apple Music has been processed successfully. View or manage your subscription here: https://appleid.apple.com"
    },
    {
        "subject": "Thanks for shopping with Amazon!",
        "body": "Your order has been shipped and will arrive soon. You can track your package using this link: https://www.amazon.com/track-order"
    },
    
    # 🚨 Phishing Emails
    {
        "subject": "Immediate Action Required: Your Netflix Payment Failed",
        "body": "We're unable to process your payment. Update your billing info at http://netflix-billing.com/update to avoid account suspension."
    },
    {
        "subject": "Security Alert: Suspicious Login Attempt Detected",
        "body": "We have noticed suspicious activity in your bank account. Please confirm your identity now at http://securebank-verify.com"
    },
    {
        "subject": "Congrats! You've Won a Free iPhone 14",
        "body": "Click now to claim your reward: http://winfreeiphone.com. This offer is only valid for 24 hours!"
    }
]


In [72]:
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])

    print(f"📌 Prediction: {result['Prediction']}")
    print(f"🎯 Confidence: {float(result['Confidence'].replace('%', '')):.2f}%")
    print(f"🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")

    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 70.04%
🔍 Keywords Explanation:
   - 'google' → -0.1922 (↓ legitimate)
   - 'security' → -0.1144 (↓ legitimate)
   - 'action' → -0.0820 (↓ legitimate)
   - 'device' → -0.0736 (↓ legitimate)
   - 'settings' → 0.0580 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like settings contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like google, security, action, device are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 28.55%
🔍 Keywords Explanation:
   - 'payment' → 0.2930 (↑ phishing)
   - 'music' → -0.1463 (↓ legitimate)
   - 'subscription' → -0.1108 (↓ legitimate)
   - 'view' → -0.0888 (↓ legitimate)
   - 'apple' → 0.0372 (↑ phishing)

🧠 Theoretical Explanation:
Th

In [74]:
test_emails = [
    # Legitimate
    {
        "subject": "Your Receipt from Microsoft – Office 365 Annual Subscription",
        "body": "Thank you for your purchase. Your Office 365 subscription has been renewed successfully. If you have questions, visit https://account.microsoft.com."
    },
    {
        "subject": "University of Birmingham – Class Schedule Update",
        "body": "Dear student, your Data Science class schedule has been updated. Please log in to your student portal at https://my.bham.ac.uk to view the changes."
    },

    # Phishing
    {
        "subject": "⚠️ URGENT: Your Bank Account Will Be Locked!",
        "body": "We noticed suspicious activity in your account. Confirm your details now at http://secure-login-verification-banking-alert.com to avoid account termination. Act now!"
    },
    {
        "subject": "🎁 Claim Your Free iPhone 15 Pro Today!",
        "body": "You’ve been selected to win a brand-new iPhone 15 Pro. Click here http://freeiphone-prize-now.com to claim your reward. Limited time only!"
    }
]


In [76]:
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])

    print(f"📌 Prediction: {result['Prediction']}")
    print(f"🎯 Confidence: {float(result['Confidence'].replace('%', '')):.2f}%")
    print(f"🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")

    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 57.74%
🔍 Keywords Explanation:
   - 'subscription' → -0.2133 (↓ legitimate)
   - 'thank' → -0.1677 (↓ legitimate)
   - 'questions' → -0.0956 (↓ legitimate)
   - 'microsoft' → 0.0900 (↑ phishing)
   - 'office' → 0.0556 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like microsoft, office contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like subscription, thank, questions are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 82.87%
🔍 Keywords Explanation:
   - 'schedule' → 0.2228 (↑ phishing)
   - 'university' → -0.1765 (↓ legitimate)
   - 'log' → -0.1323 (↓ legitimate)
   - 'science' → -0.1302 (↓ legitimate)
   - 'data' → -0.1282 (↓ legitimate)

🧠 Theoretical Ex

In [82]:
def analyze_email(email_subject, email_body):
    input_text = clean_text(email_subject) + " " + clean_text(email_body)
    confidence = model_pipeline.predict_proba([input_text])[0]

    phishing_prob = confidence[1] * 100
    legit_prob = confidence[0] * 100

    # 👇 Adjust threshold here
    prediction = "Phishing" if phishing_prob >= 70 else "Legitimate"

    # LIME explanation
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    explanation = exp.as_list()

    return {
        "Prediction": prediction,
        "Confidence": phishing_prob if prediction == "Phishing" else legit_prob,
        "Explanation": explanation
    }


In [90]:
# ✅ Test Emails: Legitimate + Phishing
test_emails = [
    # ✅ Legitimate Emails
    {
        "subject": "Your Amazon Order Has Shipped",
        "body": "Your recent order has been shipped and is on its way. Track your package via https://www.amazon.com/track. Thank you for shopping with us."
    },
    {
        "subject": "Welcome to LinkedIn!",
        "body": "Hi, thanks for joining LinkedIn. Start building your professional network by connecting with others: https://www.linkedin.com/"
    },
    {
        "subject": "Google Account Recovery Request",
        "body": "We received a request to reset your password. If this was you, follow the instructions at https://accounts.google.com to proceed."
    },
    {
        "subject": "PayPal Transaction Receipt",
        "body": "You've successfully sent $45 to John Smith. View details at https://www.paypal.com/myaccount."
    },
    {
        "subject": "Spotify Premium Payment Confirmation",
        "body": "Thank you for subscribing to Spotify Premium. Your payment has been processed. Manage your account at https://www.spotify.com/account"
    },

    # 🚨 Phishing Emails
    {
        "subject": "🚨 Suspicious Login Attempt – Action Needed!",
        "body": "We've detected unusual login activity. Log in now at http://verify-login-now.com to secure your account."
    },
    {
        "subject": "Your Tax Refund Is Ready!",
        "body": "You’re eligible for a £1,500 refund. Click http://gov-claim-refund-now.com to claim it before it expires."
    },
    {
        "subject": "Your Account Will Be Closed!",
        "body": "Failure to verify your identity at http://bank-secure-check.com will result in account suspension. Click now."
    },
    {
        "subject": "Verify Your Netflix Billing Info Immediately",
        "body": "We couldn’t charge your card. Update your payment information here: http://netflix-update-payment.com"
    },
    {
        "subject": "You’ve Won a MacBook Pro!",
        "body": "Claim your MacBook now by visiting http://free-macbook-promo.net. Don’t miss this opportunity!"
    }
]

# ✅ Run the model and display results
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])

    print(f"📌 Prediction: {result['Prediction']}")
    try:
        confidence = float(result["Confidence"])
    except:
        confidence = float(result["Confidence"].replace('%', ''))
    print(f"🎯 Confidence: {confidence:.2f}%")

    print(f"🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")

    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 53.16%
🔍 Keywords Explanation:
   - 'thank' → -0.2099 (↓ legitimate)
   - 'way' → -0.1273 (↓ legitimate)
   - 'order' → 0.0771 (↑ phishing)
   - 'track' → -0.0697 (↓ legitimate)
   - 'shopping' → 0.0352 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like order, shopping contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like thank, way, track are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 52.08%
🔍 Keywords Explanation:
   - 'thanks' → -0.3917 (↓ legitimate)
   - 'hi' → -0.1841 (↓ legitimate)
   - 'professional' → 0.1767 (↑ phishing)
   - 'start' → 0.0834 (↑ phishing)
   - 'network' → 0.0524 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified

In [94]:
def analyze_email(subject, body):
    input_text = clean_text(subject) + " " + clean_text(body)
    
    # Get prediction probabilities from the model
    probs = model_pipeline.predict_proba([input_text])[0]
    phishing_prob = probs[1] * 100
    legit_prob = probs[0] * 100
    
    # Threshold to classify as phishing (e.g., 70%)
    prediction = "Phishing" if phishing_prob >= 70 else "Legitimate"
    
    # LIME for explanation
    explainer = LimeTextExplainer(class_names=["Legitimate", "Phishing"])
    exp = explainer.explain_instance(input_text, model_pipeline.predict_proba, num_features=5)
    explanation = exp.as_list()
    
    return {
        "Prediction": prediction,
        "Confidence": phishing_prob if prediction == "Phishing" else legit_prob,
        "Explanation": explanation
    }


In [96]:
# ✅ Test Emails: Legitimate + Phishing
test_emails = [
    # ✅ Legitimate Emails
    {
        "subject": "Your Amazon Order Has Shipped",
        "body": "Your recent order has been shipped and is on its way. Track your package via https://www.amazon.com/track. Thank you for shopping with us."
    },
    {
        "subject": "Welcome to LinkedIn!",
        "body": "Hi, thanks for joining LinkedIn. Start building your professional network by connecting with others: https://www.linkedin.com/"
    },
    {
        "subject": "Google Account Recovery Request",
        "body": "We received a request to reset your password. If this was you, follow the instructions at https://accounts.google.com to proceed."
    },
    {
        "subject": "PayPal Transaction Receipt",
        "body": "You've successfully sent $45 to John Smith. View details at https://www.paypal.com/myaccount."
    },
    {
        "subject": "Spotify Premium Payment Confirmation",
        "body": "Thank you for subscribing to Spotify Premium. Your payment has been processed. Manage your account at https://www.spotify.com/account"
    },

    # 🚨 Phishing Emails
    {
        "subject": "🚨 Suspicious Login Attempt – Action Needed!",
        "body": "We've detected unusual login activity. Log in now at http://verify-login-now.com to secure your account."
    },
    {
        "subject": "Your Tax Refund Is Ready!",
        "body": "You’re eligible for a £1,500 refund. Click http://gov-claim-refund-now.com to claim it before it expires."
    },
    {
        "subject": "Your Account Will Be Closed!",
        "body": "Failure to verify your identity at http://bank-secure-check.com will result in account suspension. Click now."
    },
    {
        "subject": "Verify Your Netflix Billing Info Immediately",
        "body": "We couldn’t charge your card. Update your payment information here: http://netflix-update-payment.com"
    },
    {
        "subject": "You’ve Won a MacBook Pro!",
        "body": "Claim your MacBook now by visiting http://free-macbook-promo.net. Don’t miss this opportunity!"
    }
]

# ✅ Run the model and display results
for idx, email in enumerate(test_emails):
    print(f"\n📩 Test Email #{idx + 1}")
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])

    print(f"📌 Prediction: {result['Prediction']}")
    print(f"🎯 Confidence: {float(result['Confidence']):.2f}%")
    print(f"🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")

    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 53.16%
🔍 Keywords Explanation:
   - 'thank' → -0.2123 (↓ legitimate)
   - 'way' → -0.1287 (↓ legitimate)
   - 'order' → 0.0778 (↑ phishing)
   - 'track' → -0.0659 (↓ legitimate)
   - 'shopping' → 0.0365 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like order, shopping contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like thank, way, track are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 52.08%
🔍 Keywords Explanation:
   - 'thanks' → -0.3915 (↓ legitimate)
   - 'hi' → -0.1827 (↓ legitimate)
   - 'professional' → 0.1756 (↑ phishing)
   - 'start' → 0.0883 (↑ phishing)
   - 'network' → 0.0538 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified

In [100]:
prediction = "Phishing" if float(result["Confidence"]) >= 60 else "Legitimate"


In [102]:
# ✅ Test Emails: Legitimate + Phishing
test_emails = [
    # ✅ Legitimate Emails
    {
        "subject": "Your Amazon Order Has Shipped",
        "body": "Your recent order has been shipped and is on its way. Track your package via https://www.amazon.com/track. Thank you for shopping with us."
    },
    {
        "subject": "Welcome to LinkedIn!",
        "body": "Hi, thanks for joining LinkedIn. Start building your professional network by connecting with others: https://www.linkedin.com/"
    },
    {
        "subject": "Google Account Recovery Request",
        "body": "We received a request to reset your password. If this was you, follow the instructions at https://accounts.google.com to proceed."
    },
    {
        "subject": "PayPal Transaction Receipt",
        "body": "You've successfully sent $45 to John Smith. View details at https://www.paypal.com/myaccount."
    },
    {
        "subject": "Spotify Premium Payment Confirmation",
        "body": "Thank you for subscribing to Spotify Premium. Your payment has been processed. Manage your account at https://www.spotify.com/account"
    },

    # 🚨 Phishing Emails
    {
        "subject": "🚨 Suspicious Login Attempt – Action Needed!",
        "body": "We've detected unusual login activity. Log in now at http://verify-login-now.com to secure your account."
    },
    {
        "subject": "Your Tax Refund Is Ready!",
        "body": "You’re eligible for a £1,500 refund. Click http://gov-claim-refund-now.com to claim it before it expires."
    },
    {
        "subject": "Your Account Will Be Closed!",
        "body": "Failure to verify your identity at http://bank-secure-check.com will result in account suspension. Click now."
    },
    {
        "subject": "Verify Your Netflix Billing Info Immediately",
        "body": "We couldn’t charge your card. Update your payment information here: http://netflix-update-payment.com"
    },
    {
        "subject": "You’ve Won a MacBook Pro!",
        "body": "Claim your MacBook now by visiting http://free-macbook-promo.net. Don’t miss this opportunity!"
    }
]

# ✅ Run the model and display results
for idx, email in enumerate(test_emails):
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])
    
    prediction = "Phishing" if float(result["Confidence"]) >= 60 else "Legitimate"

    print(f"\n📩 Test Email #{idx + 1}")
    print(f"📌 Prediction: {prediction}")
    print(f"🎯 Confidence: {float(result['Confidence']):.2f}%")
    print("🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")
    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 53.16%
🔍 Keywords Explanation:
   - 'thank' → -0.2113 (↓ legitimate)
   - 'way' → -0.1292 (↓ legitimate)
   - 'order' → 0.0782 (↑ phishing)
   - 'track' → -0.0678 (↓ legitimate)
   - 'shopping' → 0.0381 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like order, shopping contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like thank, way, track are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 52.08%
🔍 Keywords Explanation:
   - 'thanks' → -0.3910 (↓ legitimate)
   - 'hi' → -0.1839 (↓ legitimate)
   - 'professional' → 0.1759 (↑ phishing)
   - 'start' → 0.0890 (↑ phishing)
   - 'network' → 0.0552 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified

In [104]:
confidence = float(result["Confidence"])
if confidence >= 70:
    prediction = "Phishing"
elif confidence <= 30:
    prediction = "Legitimate"
else:
    prediction = "Uncertain"


In [106]:
# ✅ Test Emails: Legitimate + Phishing
test_emails = [
    # ✅ Legitimate Emails
    {
        "subject": "Your Amazon Order Has Shipped",
        "body": "Your recent order has been shipped and is on its way. Track your package via https://www.amazon.com/track. Thank you for shopping with us."
    },
    {
        "subject": "Welcome to LinkedIn!",
        "body": "Hi, thanks for joining LinkedIn. Start building your professional network by connecting with others: https://www.linkedin.com/"
    },
    {
        "subject": "Google Account Recovery Request",
        "body": "We received a request to reset your password. If this was you, follow the instructions at https://accounts.google.com to proceed."
    },
    {
        "subject": "PayPal Transaction Receipt",
        "body": "You've successfully sent $45 to John Smith. View details at https://www.paypal.com/myaccount."
    },
    {
        "subject": "Spotify Premium Payment Confirmation",
        "body": "Thank you for subscribing to Spotify Premium. Your payment has been processed. Manage your account at https://www.spotify.com/account"
    },

    # 🚨 Phishing Emails
    {
        "subject": "🚨 Suspicious Login Attempt – Action Needed!",
        "body": "We've detected unusual login activity. Log in now at http://verify-login-now.com to secure your account."
    },
    {
        "subject": "Your Tax Refund Is Ready!",
        "body": "You’re eligible for a £1,500 refund. Click http://gov-claim-refund-now.com to claim it before it expires."
    },
    {
        "subject": "Your Account Will Be Closed!",
        "body": "Failure to verify your identity at http://bank-secure-check.com will result in account suspension. Click now."
    },
    {
        "subject": "Verify Your Netflix Billing Info Immediately",
        "body": "We couldn’t charge your card. Update your payment information here: http://netflix-update-payment.com"
    },
    {
        "subject": "You’ve Won a MacBook Pro!",
        "body": "Claim your MacBook now by visiting http://free-macbook-promo.net. Don’t miss this opportunity!"
    }
]

# ✅ Run the model and display results
for idx, email in enumerate(test_emails):
    result = analyze_email(email["subject"], email["body"])
    theory_explanation = generate_theoretical_explanation(result["Explanation"])
    
    prediction = "Phishing" if float(result["Confidence"]) >= 60 else "Legitimate"

    print(f"\n📩 Test Email #{idx + 1}")
    print(f"📌 Prediction: {prediction}")
    print(f"🎯 Confidence: {float(result['Confidence']):.2f}%")
    print("🔍 Keywords Explanation:")
    for word, weight in result["Explanation"]:
        direction = "↑ phishing" if weight > 0 else "↓ legitimate"
        print(f"   - '{word}' → {weight:.4f} ({direction})")
    print("\n🧠 Theoretical Explanation:")
    print(theory_explanation)
    print("=" * 100)



📩 Test Email #1
📌 Prediction: Legitimate
🎯 Confidence: 53.16%
🔍 Keywords Explanation:
   - 'thank' → -0.2118 (↓ legitimate)
   - 'way' → -0.1289 (↓ legitimate)
   - 'order' → 0.0787 (↑ phishing)
   - 'track' → -0.0679 (↓ legitimate)
   - 'shopping' → 0.0372 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified based on important words found in its content. The presence of terms like order, shopping contributed towards it being identified as *phishing*, as these words are often associated with fraudulent activity or urgency. On the other hand, terms like thank, way, track are more typical in *legitimate* communication, which helped reduce suspicion. 

📩 Test Email #2
📌 Prediction: Legitimate
🎯 Confidence: 52.08%
🔍 Keywords Explanation:
   - 'thanks' → -0.3904 (↓ legitimate)
   - 'hi' → -0.1865 (↓ legitimate)
   - 'professional' → 0.1767 (↑ phishing)
   - 'start' → 0.0855 (↑ phishing)
   - 'network' → 0.0519 (↑ phishing)

🧠 Theoretical Explanation:
This email was classified

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body'

# Add labels
legitimate_emails['label'] = 0
phishing_emails['label'] = 1

# Merge datasets
emails_df = pd.concat([legitimate_emails, phishing_emails]).reset_index(drop=True)

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'
    return label, confidence_score

# Example usage
subject_example = "Your Account Requires Immediate Attention"
body_example = "Please click the link below to verify your account information urgently."
label, confidence = classify_email(subject_example, body_example)
print(f"The email is {label} with {confidence*100:.2f}% confidence.")


NameError: name 'legitimate_emails' is not defined

In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
emails_df = pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body', 'label'

# Check for necessary columns
assert {'subject', 'body', 'label'}.issubset(emails_df.columns), "Dataset must contain 'subject', 'body', and 'label' columns."

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42
)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'
    return label, confidence_score

# Example usage
subject_example = "Your Account Requires Immediate Attention"
body_example = "Please click the link below to verify your account information urgently."
label, confidence = classify_email(subject_example, body_example)
print(f"The email is {label} with {confidence*100:.2f}% confidence.")


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
emails_df = pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body', 'label'

# Ensure required columns exist
assert {'subject', 'body', 'label'}.issubset(emails_df.columns), \
    "Dataset must contain 'subject', 'body', and 'label' columns."

# Handle missing values by replacing NaN with empty strings
emails_df['subject'] = emails_df['subject'].fillna('')
emails_df['body'] = emails_df['body'].fillna('')

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42
)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'
    return label, confidence_score

# Example usage
subject_example = "Meeting Agenda for Tomorrow’s Project Update"
body_example = "Hi Team,I’ve attached the agenda for tomorrow’s project status meeting. We’ll discuss the current progress, upcoming deadlines, and address any questions or concerns."
label, confidence = classify_email(subject_example, body_example)
print(f"The email is {label} with {confidence*100:.2f}% confidence.")


Classification Report:
               precision    recall  f1-score   support

  Legitimate       0.99      0.99      0.99      4344
    Phishing       0.99      0.99      0.99      5445

    accuracy                           0.99      9789
   macro avg       0.99      0.99      0.99      9789
weighted avg       0.99      0.99      0.99      9789

Accuracy: 0.9928491163550924
The email is Legitimate with 89.65% confidence.


In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
emails_df = pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body', 'label'

# Handle missing values
emails_df['subject'] = emails_df['subject'].fillna('')
emails_df['body'] = emails_df['body'].fillna('')

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'
    return label, confidence_score

# Example usage
examples = [
    {"subject": "Meeting Agenda for Tomorrow’s Project Update",
     "body": "Hi Team,\n\nAttached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.\n\nRegards, Alex."},

    {"subject": "Your account will be suspended immediately!",
     "body": "We detected suspicious activities. Click here urgently to verify and secure your account."},

    {"subject": "Invoice for Your Recent Purchase",
     "body": "Thank you for your purchase. Please find the attached invoice for your records."},

    {"subject": "Action Required: Update Your Payment Information",
     "body": "Your billing information needs updating. Follow the link provided to avoid disruption of services."}
]

for email in examples:
    label, confidence = classify_email(email["subject"], email["body"])
    print(f"Subject: {email['subject']}")
    print(f"Body: {email['body']}")
    print(f"Classification: {label} ({confidence*100:.2f}% confidence)\n")


Classification Report:
               precision    recall  f1-score   support

  Legitimate       0.99      0.99      0.99      4344
    Phishing       0.99      0.99      0.99      5445

    accuracy                           0.99      9789
   macro avg       0.99      0.99      0.99      9789
weighted avg       0.99      0.99      0.99      9789

Accuracy: 0.9928491163550924
Subject: Meeting Agenda for Tomorrow’s Project Update
Body: Hi Team,

Attached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.

Regards, Alex.
Classification: Legitimate (78.48% confidence)

Subject: Your account will be suspended immediately!
Body: We detected suspicious activities. Click here urgently to verify and secure your account.
Classification: Phishing (86.75% confidence)

Subject: Invoice for Your Recent Purchase
Body: Thank you for your purchase. Please find the attached invoice for your records.
Classification: Phishing (65.22% confidence)

Subject: Action R

In [15]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
emails_df = pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body', 'label'

# Handle missing values
emails_df['subject'] = emails_df['subject'].fillna('')
emails_df['body'] = emails_df['body'].fillna('')

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'
    explanation = ""
    if label == 'Phishing':
        explanation = "The email is likely phishing because it uses urgent language, requests immediate action, and may contain suspicious links."
    else:
        explanation = "The email appears legitimate as it contains routine and professional communication with no urgent requests or suspicious links."
    return label, confidence_score, explanation

# Example usage
examples = [
    {"subject": "Meeting Agenda for Tomorrow’s Project Update",
     "body": "Hi Team,\n\nAttached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.\n\nRegards, Alex."},

    {"subject": "Your account will be suspended immediately!",
     "body": "We detected suspicious activities. Click here urgently to verify and secure your account."},

    {"subject": "Invoice for Your Recent Purchase",
     "body": "Thank you for your purchase. Please find the attached invoice for your records."},

    {"subject": "Action Required: Update Your Payment Information",
     "body": "Your billing information needs updating. Follow the link provided to avoid disruption of services."}
]

for email in examples:
    label, confidence, explanation = classify_email(email["subject"], email["body"])
    print(f"Subject: {email['subject']}")
    print(f"Body: {email['body']}")
    print(f"Classification: {label} ({confidence*100:.2f}% confidence)")
    print(f"Explanation: {explanation}\n")


Classification Report:
               precision    recall  f1-score   support

  Legitimate       0.99      0.99      0.99      4344
    Phishing       0.99      0.99      0.99      5445

    accuracy                           0.99      9789
   macro avg       0.99      0.99      0.99      9789
weighted avg       0.99      0.99      0.99      9789

Accuracy: 0.9928491163550924
Subject: Meeting Agenda for Tomorrow’s Project Update
Body: Hi Team,

Attached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.

Regards, Alex.
Classification: Legitimate (78.48% confidence)
Explanation: The email appears legitimate as it contains routine and professional communication with no urgent requests or suspicious links.

Subject: Your account will be suspended immediately!
Body: We detected suspicious activities. Click here urgently to verify and secure your account.
Classification: Phishing (86.75% confidence)
Explanation: The email is likely phishing because i

In [17]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
emails_df = pd.read_csv('CEAS_08.csv')  # columns: 'subject', 'body', 'label'

# Handle missing values
emails_df['subject'] = emails_df['subject'].fillna('')
emails_df['body'] = emails_df['body'].fillna('')

# Combine subject and body
emails_df['combined_text'] = emails_df['subject'] + ' ' + emails_df['body']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    emails_df['combined_text'], emails_df['label'], test_size=0.25, random_state=42)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to classify new emails
def classify_email(subject, body):
    combined_text = subject + ' ' + body
    vectorized_text = vectorizer.transform([combined_text])
    prediction = model.predict(vectorized_text)[0]
    confidence_score = model.predict_proba(vectorized_text).max()
    label = 'Phishing' if prediction == 1 else 'Legitimate'

    explanation = []
    if "urgent" in combined_text.lower() or "immediately" in combined_text.lower():
        explanation.append("⚠️ The email contains urgent language.")
    if "verify" in combined_text.lower() or "click here" in combined_text.lower():
        explanation.append("⚠️ The email requests immediate action through links.")
    if "invoice" in combined_text.lower() or "attached" in combined_text.lower():
        explanation.append("✅ The email references an attachment (common in legitimate business communications).")
    if "suspended" in combined_text.lower() or "secure your account" in combined_text.lower():
        explanation.append("⚠️ The email contains threatening language about account security.")
    if not explanation:
        explanation.append("✅ No immediate phishing indicators were found.")

    full_explanation = " ".join(explanation)

    return label, confidence_score, full_explanation

# Example usage
examples = [
    {"subject": "Meeting Agenda for Tomorrow’s Project Update",
     "body": "Hi Team,\n\nAttached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.\n\nRegards, Alex."},

    {"subject": "Your account will be suspended immediately!",
     "body": "We detected suspicious activities. Click here urgently to verify and secure your account."},

    {"subject": "Invoice for Your Recent Purchase",
     "body": "Thank you for your purchase. Please find the attached invoice for your records."},

    {"subject": "Action Required: Update Your Payment Information",
     "body": "Your billing information needs updating. Follow the link provided to avoid disruption of services."}
]

for email in examples:
    label, confidence, explanation = classify_email(email["subject"], email["body"])
    print(f"Subject: {email['subject']}")
    print(f"Body: {email['body']}")
    print(f"Classification: {label} ({confidence*100:.2f}% confidence)")
    print(f"Explanation: {explanation}\n")

Classification Report:
               precision    recall  f1-score   support

  Legitimate       0.99      0.99      0.99      4344
    Phishing       0.99      0.99      0.99      5445

    accuracy                           0.99      9789
   macro avg       0.99      0.99      0.99      9789
weighted avg       0.99      0.99      0.99      9789

Accuracy: 0.9928491163550924
Subject: Meeting Agenda for Tomorrow’s Project Update
Body: Hi Team,

Attached is the agenda for tomorrow’s project meeting. We'll cover progress and upcoming deadlines.

Regards, Alex.
Classification: Legitimate (78.48% confidence)
Explanation: ✅ The email references an attachment (common in legitimate business communications).

Subject: Your account will be suspended immediately!
Body: We detected suspicious activities. Click here urgently to verify and secure your account.
Classification: Phishing (86.75% confidence)
Explanation: ⚠️ The email contains urgent language. ⚠️ The email requests immediate action thr