In [1]:
import pandas as pd

# Read the e-mail dataset from the working directory; point the path at your
# own copy of the CSV if it lives somewhere else.
df = pd.read_csv("CEAS_08.csv")

# Peek at the raw 'sender' strings (head() default size) before parsing them.
sender_preview = df['sender'].head()
print(sender_preview)


0                     Young Esposito <Young@iworld.de>
1                         Mok <ipline's1983@icable.ph>
2    Daily Top 10 <Karmandeep-opengevl@universalnet...
3                   Michael Parker <ivqrnai@pobox.com>
4    Gretchen Suggs <externalsep1@loanofficertool.com>
Name: sender, dtype: object


In [3]:
# Pull the address found between the angle brackets of the raw sender string.
# expand=False makes str.extract return a Series (the default hands back a
# one-column DataFrame); senders without a <...> part become NaN.
df['sender_email'] = df['sender'].str.extract(r'<(.*?)>', expand=False)


In [5]:
# The domain is whatever follows the LAST '@'. Splitting once from the right
# keeps an address containing more than one '@' from yielding a middle piece.
# Addresses with no '@' (and NaN addresses) still come out as NaN.
df['sender_domain'] = df['sender_email'].str.rsplit('@', n=1).str[1]


In [9]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Step 1: Extract sender email and domain
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
df['sender_email'] = df['sender'].str.extract(r'<(.*?)>', expand=False)
# Split once from the right so the domain is the text after the LAST '@';
# addresses without any '@' yield NaN and are dropped on the next line.
df['sender_domain'] = df['sender_email'].str.rsplit('@', n=1).str[1]
# Rows without a parsable domain cannot be vectorized, so they are removed.
df = df.dropna(subset=['sender_domain'])

# Step 2: Vectorize the sender domains
# Each domain string is turned into a TF-IDF row over at most 1000 tokens.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['sender_domain'])
y = df['label']

# Step 3: Train a simple model
# NOTE: the forest is fitted on every remaining row of df (no hold-out split),
# so confidences reported for domains sampled from df are training-set scores.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Step 4: Explanation function
def explain_domain(domain):
    """Print the model's verdict for one sender domain plus its known tokens.

    Uses the module-level ``vectorizer`` and ``model``. The report contains the
    predicted class, the probability the model assigns to that class, and up to
    five of the domain's tokens that exist in the TF-IDF vocabulary, ranked by
    the forest's global ``feature_importances_``. Those scores describe the
    model as a whole; they are not per-prediction contributions.

    Args:
        domain: Domain string such as ``"example.com"``.

    Returns:
        None. The report is written to stdout.
    """
    domain_vec = vectorizer.transform([domain])

    # Single forest pass. The winning class is picked by position, so the
    # confidence lookup does not assume the labels are exactly 0..k-1.
    class_probs = model.predict_proba(domain_vec)[0]
    best = int(np.argmax(class_probs))
    prediction = model.classes_[best]
    proba = class_probs[best]

    importance_scores = model.feature_importances_
    vocabulary = vectorizer.vocabulary_  # token -> column index

    # Tokenize with the vectorizer's own analyzer so the tokens listed here are
    # the ones the model actually saw (a plain split on '.' would miss
    # lower-casing and hyphen-separated parts).
    tokens = vectorizer.build_analyzer()(domain)

    keywords = []
    for token in dict.fromkeys(tokens):  # de-duplicate, keep first-seen order
        idx = vocabulary.get(token)
        if idx is not None:
            keywords.append((token, round(importance_scores[idx], 4)))

    keywords = sorted(keywords, key=lambda x: -x[1])[:5]

    print("\nüì® Domain:", domain)
    print("üîé Prediction:", "Phishing üö®" if prediction == 1 else "Legitimate ‚úÖ")
    print("üìä Confidence Score:", round(proba, 4))
    print("üìå Influential keywords:", ", ".join([f"{w[0]} (score={w[1]})" for w in keywords]) if keywords else "No clear indicators")

# Smoke-test the explainer on three domains drawn reproducibly from df.
for domain in df['sender_domain'].sample(n=3, random_state=1).tolist():
    explain_domain(domain)



üì® Domain: caswellmassey.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.9191
üìå Influential keywords: com (score=0.2414)

üì® Domain: dsi.unimi.it
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 1.0
üìå Influential keywords: it (score=0.0042), dsi (score=0.0004), unimi (score=0.0003)

üì® Domain: factset.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.9191
üìå Influential keywords: com (score=0.2414)


In [11]:
# Explain one hand-picked domain.
explain_domain(domain="dsi.unimi.it")



üì® Domain: dsi.unimi.it
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 1.0
üìå Influential keywords: it (score=0.0042), dsi (score=0.0004), unimi (score=0.0003)


In [3]:
# Substrings whose presence anywhere in a domain is flagged as an indicator.
phishing_keywords = ['login', 'secure', 'verify', 'update', 'account', 'payment', 'signin']

def has_phishing_keyword(domain):
    """Return 1 if ``domain`` contains any entry of ``phishing_keywords``, else 0.

    The match is a case-insensitive substring test over the whole string.
    Non-string input (for example NaN or None from a missing sender domain)
    returns 0 instead of raising ``AttributeError``.
    """
    if not isinstance(domain, str):
        return 0
    lowered = domain.lower()  # lower-case once instead of once per keyword
    return int(any(kw in lowered for kw in phishing_keywords))

# Flag every sender domain: 1 when it contains a phishing keyword, otherwise 0.
df['phishing_keyword_present'] = df['sender_domain'].map(has_phishing_keyword)


In [5]:
# Re-fit the TF-IDF vocabulary on the current sender domains.
X_tfidf = vectorizer.fit_transform(df['sender_domain'])

# Append the keyword flag as one extra trailing column after the TF-IDF columns.
keyword_flag = df['phishing_keyword_present'].to_numpy().reshape(-1, 1)
X_combined = np.hstack((X_tfidf.toarray(), keyword_flag))

# Read the labels from the same frame the features were built from, so they
# line up row-for-row with X_combined instead of relying on a `y` that was
# left over from an earlier cell.
y = df['label']
model.fit(X_combined, y)


In [25]:
def explain_domain(domain):
    """Print the combined-feature model's verdict for one sender domain.

    Builds the feature row the same way the training matrix was built
    (TF-IDF columns followed by the keyword flag), then prints the predicted
    class, the probability assigned to that class, and whether a phishing
    keyword was found. Uses the module-level ``vectorizer``, ``model`` and
    ``has_phishing_keyword``.

    Args:
        domain: Domain string such as ``"example.com"``.

    Returns:
        None. Output goes to stdout; on a feature-width mismatch an error line
        is printed and nothing is predicted.
    """
    domain_vec = vectorizer.transform([domain])

    kw_present = has_phishing_keyword(domain)
    domain_features = np.hstack((domain_vec.toarray(), [[kw_present]]))

    # Validate against the width the fitted model was trained on. This catches
    # both a re-fitted vectorizer and a model that was last fitted without the
    # keyword column. Estimators lacking n_features_in_ skip the check.
    expected = getattr(model, "n_features_in_", domain_features.shape[1])
    if domain_features.shape[1] != expected:
        print(f"\nERROR: Domain feature vector has {domain_features.shape[1]} features but Model expects {expected}")
        return

    # Single forest pass. The winning class is picked by position, so the
    # confidence lookup does not assume the labels are exactly 0..k-1.
    class_probs = model.predict_proba(domain_features)[0]
    best = int(np.argmax(class_probs))
    prediction = model.classes_[best]
    proba = class_probs[best]

    print("\nüì® Domain:", domain)
    print("üîé Prediction:", "Phishing üö®" if prediction == 1 else "Legitimate ‚úÖ")
    print("üìä Confidence Score:", round(proba, 4))
    print("‚ö†Ô∏è Phishing Keyword Detected:", 'Yes' if kw_present else 'No')


In [34]:
# Substrings whose presence anywhere in a domain is flagged as an indicator.
phishing_keywords = [
    'login',
    'secure',
    'verify',
    'update',
    'account',
    'payment',
    'signin',
]


In [36]:
def has_phishing_keyword(domain):
    """Return 1 if ``domain`` contains any entry of ``phishing_keywords``, else 0.

    The match is a case-insensitive substring test over the whole string.
    Non-string input (for example NaN or None) returns 0 instead of raising.
    """
    if not isinstance(domain, str):
        return 0
    lowered = domain.lower()  # lower-case once instead of once per keyword
    return int(any(kw in lowered for kw in phishing_keywords))


In [38]:
# One 0/1 flag per row, in row order: does the sender domain contain a keyword?
df['phishing_keyword_present'] = [has_phishing_keyword(d) for d in df['sender_domain']]


In [40]:
# 1. Define keywords
# Substrings whose presence anywhere in a domain is flagged as an indicator.
phishing_keywords = ['login', 'secure', 'verify', 'update', 'account', 'payment', 'signin']

# 2. Define function
def has_phishing_keyword(domain):
    """Return 1 if ``domain`` contains any entry of ``phishing_keywords``, else 0.

    The match is a case-insensitive substring test over the whole string.
    Non-string input (for example NaN or None from a missing sender domain)
    returns 0 instead of raising ``AttributeError``.
    """
    if not isinstance(domain, str):
        return 0
    lowered = domain.lower()  # lower-case once instead of once per keyword
    return int(any(kw in lowered for kw in phishing_keywords))

# 3. Apply to dataframe
df['phishing_keyword_present'] = df['sender_domain'].apply(has_phishing_keyword)

# 4. Vectorize
X_tfidf = vectorizer.fit_transform(df['sender_domain'])

# 5. Combine Features
# The keyword flag becomes one extra trailing column after the TF-IDF columns.
X_combined = np.hstack((X_tfidf.toarray(), df['phishing_keyword_present'].values.reshape(-1, 1)))

# 6. Train Model
# Labels are re-read from df so they line up row-for-row with X_combined,
# rather than depending on a `y` left over from an earlier cell.
y = df['label']
model.fit(X_combined, y)


In [42]:
# Probe domains to run through the classifier, one per line.
test_domains = """
google.com
paypal.com
amaz0n-secure.com
pay-pal-support.net
secure-hsbc-login.com
apple-login.support
linkedin.com
googl3-login.com
account-update-facebook.net
microsoft-verify-login.net
""".split()

# Check the domains one by one with explain_domain (defined in an earlier
# cell) instead of repeating its feature-building and printing code here.
# The inline copy also padded/trimmed the TF-IDF row on a width mismatch,
# which would silently misalign features; explain_domain reports it instead.
for domain in test_domains:
    explain_domain(domain)



üì® Domain: google.com
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 0.52
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: paypal.com
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 0.52
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: amaz0n-secure.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.57
‚ö†Ô∏è Phishing Keyword Detected: Yes

üì® Domain: pay-pal-support.net
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.87
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: secure-hsbc-login.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.57
‚ö†Ô∏è Phishing Keyword Detected: Yes

üì® Domain: apple-login.support
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.75
‚ö†Ô∏è Phishing Keyword Detected: Yes

üì® Domain: linkedin.com
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 0.52
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: googl3-login.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.54
‚ö†Ô∏è Phishing Key

In [44]:
def _phishing_reasons(domain):
    """Return the heuristic, human-readable reasons that apply to ``domain``."""
    lowered = domain.lower()

    # Reason 1: Suspicious Keyword (one entry per matching keyword)
    reasons = [
        f"Contains suspicious keyword: '{kw}'"
        for kw in phishing_keywords
        if kw in lowered
    ]

    # Reason 2: Similar-Looking Characters
    if any(char.isdigit() for char in domain):
        reasons.append("Contains numbers which may replace letters (common phishing trick)")

    # Reason 3: Long or Unusual Structure
    if len(domain) > 20:
        reasons.append("Domain length is unusually long")

    # Reason 4: Hyphen Usage
    if '-' in domain:
        reasons.append("Contains hyphen '-' which attackers use to mimic original domains")

    # Reason 5: Multiple Subdomains
    if domain.count('.') >= 3:
        reasons.append("Has multiple subdomains making it look complex/confusing")

    return reasons


def explain_domain(domain):
    """Print the model's verdict for ``domain`` and, for phishing, the reasons.

    Builds the feature row as TF-IDF columns followed by the keyword flag,
    prints the predicted class, the probability assigned to that class and the
    keyword flag, and for a phishing verdict lists rule-based reasons. Those
    reasons are heuristics computed from the domain string; they are not read
    out of the model. Uses the module-level ``vectorizer``, ``model``,
    ``has_phishing_keyword`` and ``phishing_keywords``.

    Args:
        domain: Domain string such as ``"example.com"``.

    Returns:
        None. Output goes to stdout; on a feature-width mismatch an error line
        is printed and nothing is predicted.
    """
    domain_vec = vectorizer.transform([domain]).toarray()

    kw_present = has_phishing_keyword(domain)
    domain_features = np.hstack((domain_vec, [[kw_present]]))

    # A width mismatch means the vectorizer and the model were fitted on
    # different feature sets. Padding or trimming would only shift columns
    # under the wrong features, so report the problem and stop.
    expected = getattr(model, "n_features_in_", domain_features.shape[1])
    if domain_features.shape[1] != expected:
        print(f"\nERROR: Domain feature vector has {domain_features.shape[1]} features but Model expects {expected}")
        return

    # Single forest pass. The winning class is picked by position, so the
    # confidence lookup does not assume the labels are exactly 0..k-1.
    class_probs = model.predict_proba(domain_features)[0]
    best = int(np.argmax(class_probs))
    prediction = model.classes_[best]
    proba = class_probs[best]

    print("\nüì® Domain:", domain)
    print("üîé Prediction:", "Phishing üö®" if prediction == 1 else "Legitimate ‚úÖ")
    print("üìä Confidence Score:", round(proba, 4))
    print("‚ö†Ô∏è Phishing Keyword Detected:", 'Yes' if kw_present else 'No')

    # Extra Explanation for Phishing
    if prediction != 1:
        return

    reasons = _phishing_reasons(domain)

    print("\nüìù Why this domain is classified as Phishing:")
    for r in reasons:
        print("-", r)

    if not reasons:
        print("- Patterns in domain matched previous phishing domains seen during training")



In [49]:
# Probe domains to run through the classifier (order is the reporting order).
test_domains = [
    'google.com', 'paypal.com',
    'amaz0n-secure.com', 'pay-pal-support.net',
    'secure-hsbc-login.com', 'apple-login.support',
    'linkedin.com', 'googl3-login.com',
    'account-update-facebook.net', 'microsoft-verify-login.net',
]

# Loop to test all domains
# explain_domain (defined in the previous cell) already builds the features,
# prints the verdict and lists the phishing reasons, so it is called here
# instead of repeating its whole body inline.
for domain in test_domains:
    explain_domain(domain)



üì® Domain: google.com
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 0.52
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: paypal.com
üîé Prediction: Legitimate ‚úÖ
üìä Confidence Score: 0.52
‚ö†Ô∏è Phishing Keyword Detected: No

üì® Domain: amaz0n-secure.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.57
‚ö†Ô∏è Phishing Keyword Detected: Yes

üìù Why this domain is classified as Phishing:
- Contains suspicious keyword: 'secure'
- Contains numbers which may replace letters (common phishing trick)
- Contains hyphen '-' which attackers use to mimic original domains

üì® Domain: pay-pal-support.net
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.87
‚ö†Ô∏è Phishing Keyword Detected: No

üìù Why this domain is classified as Phishing:
- Contains hyphen '-' which attackers use to mimic original domains

üì® Domain: secure-hsbc-login.com
üîé Prediction: Phishing üö®
üìä Confidence Score: 0.57
‚ö†Ô∏è Phishing Keyword Detected: Yes

üìù Why this domain