In [13]:
%pip install pandas numpy scikit-learn joblib matplotlib seaborn nltk wordcloud google-api-python-client google-auth-httplib2 google-auth-oauthlib

Note: you may need to restart the kernel to use updated packages.


In [53]:
from base64 import urlsafe_b64encode, urlsafe_b64decode

In [55]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from urllib.parse import urlparse
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
# 2.1 Load and clean URL dataset
df_urls = pd.read_csv('phishing_site_urls.csv', names=['URL','Label'], header=0)
df_urls['Label'] = df_urls['Label'].str.lower()
df_urls = df_urls[df_urls['Label'].isin(['good','bad'])]

# quick balance
y_min = df_urls['Label'].value_counts().min()
from sklearn.utils import resample
balanced = pd.concat([
    resample(df_urls[df_urls.Label=='good'], n_samples=y_min, random_state=42),
    resample(df_urls[df_urls.Label=='bad'],  n_samples=y_min, random_state=42)
])

# 2.2 Clean URLs for vectorization
def clean_url(url):
    p = urlparse(url)
    domain = p.netloc.lower().replace('www.', '')
    path = p.path.replace('/', ' ').lower()
    return domain + ' ' + path

balanced['clean_url'] = balanced['URL'].apply(clean_url)

# 2.3 Vectorize and train
tX = balanced['clean_url']
ty = balanced['Label'].map({'good':0,'bad':1})

vectorizer_url = CountVectorizer()
X_vec = vectorizer_url.fit_transform(tX)
X_tr, X_te, y_tr, y_te = train_test_split(X_vec, ty, stratify=ty, random_state=42)
url_model = LogisticRegression(max_iter=2000)
url_model.fit(X_tr, y_tr)

# save URL model and vectorizer
joblib.dump(url_model, 'phishing_model_lr.pkl')
joblib.dump(vectorizer_url, 'vectorizer.pkl')

['vectorizer.pkl']

In [58]:
print("URL Model Accuracy:", url_model.score(X_te, y_te))
print(classification_report(y_te, url_model.predict(X_te), target_names=['legit','phish']))

URL Model Accuracy: 0.9651455677590108
              precision    recall  f1-score   support

       legit       0.96      0.97      0.97     39106
       phish       0.97      0.96      0.96     39105

    accuracy                           0.97     78211
   macro avg       0.97      0.97      0.97     78211
weighted avg       0.97      0.97      0.97     78211



In [59]:
# 3.1 Load email dataset
df_email = pd.read_csv('phishing_email.csv')  # expects columns 'text_combined','label'
df_email = df_email.dropna(subset=['text_combined','label']).copy()
df_email['label'] = df_email['label'].astype(int)

# 3.2 Train email pipeline
Xe = df_email['text_combined']
ye = df_email['label']
Xe_tr, Xe_te, ye_tr, ye_te = train_test_split(Xe, ye, stratify=ye, random_state=42)
email_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    LogisticRegression(max_iter=1000)
)
email_pipe.fit(Xe_tr, ye_tr)
joblib.dump(email_pipe, 'email_model.pkl')

['email_model.pkl']

In [60]:
# 3.3 Evaluate email model
print("Email Model Accuracy:", email_pipe.score(Xe_te, ye_te))
print(classification_report(ye_te, email_pipe.predict(Xe_te), target_names=['legit','phish']))

Email Model Accuracy: 0.9867617107942973
              precision    recall  f1-score   support

       legit       0.99      0.98      0.99      9899
       phish       0.98      0.99      0.99     10723

    accuracy                           0.99     20622
   macro avg       0.99      0.99      0.99     20622
weighted avg       0.99      0.99      0.99     20622



In [61]:
# 4.1 Load trained models
email_model = joblib.load('email_model.pkl')
url_model   = joblib.load('phishing_model_lr.pkl')
url_vec     = joblib.load('vectorizer.pkl')

# 4.2 Define detector
url_pattern = re.compile(r'http[s]?://\S+')

def combined_detector(text: str,
                      email_thresh: float = 0.7,
                      url_thresh: float   = 0.7) -> str:
    """
    Returns 'phishing' if:
      - email_model‚Äôs phishing probability ‚â• email_thresh, OR
      - any URL‚Äôs phishing probability ‚â• url_thresh
    Otherwise returns 'legitimate'
    """
    # 1) Email model confidence
    em_prob = email_model.predict_proba([text])[0][1]

    # 2) Check URL model confidence for all URLs
    max_url_prob = 0.0
    for u in url_pattern.findall(text):
        cu = clean_url(u)
        url_prob = url_model.predict_proba(url_vec.transform([cu]))[0][1]
        max_url_prob = max(max_url_prob, url_prob)

    if em_prob >= email_thresh or max_url_prob >= url_thresh:
        return 'phishing'
    return 'legitimate'

In [62]:
test_email = "Subject: Test\nHello, visit http://secure-login.biz/verify"
print("Test prediction:", combined_detector(test_email))

Test prediction: phishing


In [63]:
y_true = df_email['label']
y_pred = df_email['text_combined'].apply(lambda t: 1 if combined_detector(t)=='phishing' else 0)
print("Combined Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=['legit','phish']))

Combined Accuracy: 0.9965933612976747
              precision    recall  f1-score   support

       legit       1.00      1.00      1.00     39595
       phish       1.00      1.00      1.00     42891

    accuracy                           1.00     82486
   macro avg       1.00      1.00      1.00     82486
weighted avg       1.00      1.00      1.00     82486



In [29]:
%pip install --upgrade google-auth-oauthlib google-api-python-client

Note: you may need to restart the kernel to use updated packages.


In [30]:
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import base64
from email import message_from_bytes

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def authenticate_gmail():
    creds = None
    if os.path.exists('token.json'):
        from google.oauth2.credentials import Credentials
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=8080)
        with open('token.json','w') as f:
            f.write(creds.to_json())
    return build('gmail','v1',credentials=creds)

In [31]:
from urllib.parse import urlparse

# Your trained model and vectorizer must be loaded:
# url_model = joblib.load("phishing_model_lr.pkl")
# url_vec = joblib.load("vectorizer.pkl")

# üîπ Step 1: Define the URL and cleaning function
url = "https://www.shiksha.com/college/symbiosis-medical-college-for-women-symbiosis-international-pune-lavale-146023/course-bachelor-of-medicine-and-bachelor-of-surgery-mbbs-528225"

def clean_url(url):
    parsed = urlparse(url)
    domain = parsed.netloc.lower().replace("www.", "")
    path = parsed.path.replace("/", " ").lower()
    return f"{domain} {path}"

# üîπ Step 2: Clean, vectorize, predict
cleaned = clean_url(url)
vectorized = url_vec.transform([cleaned])
prediction = url_model.predict(vectorized)[0]
confidence = url_model.predict_proba(vectorized)[0][1]

# üîπ Step 3: Show result
print(f"üîó URL: {url}")
print(f"üßπ Cleaned: {cleaned}")
print(f"üîç Confidence (phishing): {confidence:.2f}")
print("üö´ RESULT: PHISHING" if prediction == 'bad' or confidence > 0.95 else "‚úÖ RESULT: LEGITIMATE")


üîó URL: https://www.shiksha.com/college/symbiosis-medical-college-for-women-symbiosis-international-pune-lavale-146023/course-bachelor-of-medicine-and-bachelor-of-surgery-mbbs-528225
üßπ Cleaned: shiksha.com  college symbiosis-medical-college-for-women-symbiosis-international-pune-lavale-146023 course-bachelor-of-medicine-and-bachelor-of-surgery-mbbs-528225
üîç Confidence (phishing): 0.00
‚úÖ RESULT: LEGITIMATE


In [32]:
service = authenticate_gmail()
print("Gmail service:", service)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=836217650067-20p6cg52pnbaccm9q9bgmphtkc4l08mj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=Uw3t19yAjzB3SZsYcXOZ0REBCS1YXj&access_type=offline
Gmail service: <googleapiclient.discovery.Resource object at 0x000001E88EE62DB0>


In [85]:
def is_auto_generated_email(text):
    text = text.lower()
    keywords = [
        "do not reply", "automated message", "this is an auto-generated email",
        "dear customer", "thank you for using", "your transaction id is",
        "upi debit alert", "download now", "payment received",
        "anaconda distribution", "login from new device"
    ]
    matches = [kw for kw in keywords if kw in text]
    return len(matches) >= 2  # You can adjust this threshold

In [95]:
messages = service.users().messages().list(userId='me', maxResults=1).execute().get('messages', [])

In [97]:
for m in messages:
    try:
        msg_raw = service.users().messages().get(userId='me', id=m['id'], format='raw').execute()['raw']
        data = urlsafe_b64decode(msg_raw.encode())
        email_message = message_from_bytes(data)

        subject = email_message['Subject'] or ''
        body = ''

        if email_message.is_multipart():
            for part in email_message.walk():
                if part.get_content_type() == 'text/plain':
                    body = part.get_payload(decode=True).decode(errors='ignore')
                    break
        else:
            body = email_message.get_payload(decode=True).decode(errors='ignore')

        full_text = subject + '\n' + body

        # Email model prediction
        email_prob = email_model.predict_proba([full_text])[0][1]
        email_percent = round(email_prob * 100, 2)

        # Heuristic logic
        heuristic_safe = is_auto_generated_email(full_text)
        email_flagged = (email_prob > 0.90) and not heuristic_safe

        # Extract URLs from text
        urls = re.findall(r'(https?://[^\s]+)', full_text)
        url_probs = []
        for url in urls:
            cleaned = clean_url(url)
            vec = url_vec.transform([cleaned])
            prob = url_model.predict_proba(vec)[0][1]
            url_probs.append((url, round(prob * 100, 2)))

        max_url_prob = max([p for u, p in url_probs], default=0)
        url_flagged = max_url_prob > 98.0

        is_phishing = email_flagged or url_flagged

        # Output section
        print("üìß EMAIL PREVIEW:\n", full_text[:300])
        print(f"üß† Email Phishing Possibility: {email_percent}% (Threshold: 90%) {'üö©' if email_flagged else '‚úÖ'}")
        if heuristic_safe:
            print("üîç Heuristic override: Detected safe auto-generated email ‚úÖ")
        if urls:
            for u, p in url_probs:
                print(f"üåê URL: {u} ‚Üí Phishing Possibility: {p}% (Threshold: 95%) {'üö©' if p > 95 else '‚úÖ'}")
        else:
            print("üåê No URLs found.")

        print("üîç FINAL RESULT:", "PHISHING üö´" if is_phishing else "LEGITIMATE ‚úÖ")
        print("‚Äî" * 50)

    except Exception as e:
        print("‚ö†Ô∏è Skipped message due to error:", e)

üìß EMAIL PREVIEW:
 UPI Debit Alert
Dear Customer,

An amount of INR 164.00 has been debited to your A/C. No. XX0061 on 23-APR-25- on account of UPI:511328882759/To:q245978712@ybl

Available Balance on 23-APR-2025 18:06:15 is INR 4,403.67 and Total available balance (including linked deposits and Limit) is INR 4,40
üß† Email Phishing Possibility: 99.94% (Threshold: 90%) ‚úÖ
üîç Heuristic override: Detected safe auto-generated email ‚úÖ
üåê URL: https://delivery.yesbank.in/lt.pl?id=15556=dkgGUwdQDFYHTwUIAlNSXwdWB1QEVQIFUA5WVlcAAwUNUlUCBwENVgYHVFsFAFRTUFYeAl0QQFRYUF4QSwVVWgQBAiNRCFgKXR0HVlwaAVEBVlFQClYBBA8HBFILVgBPXUBCE0VfFkxVVghQRwNCHxsaBxZQBFxeGF5ZS14QHENZC18HShMK&fl=XUBCEwxKFhRGREpAVBVSB1sITAxc ‚Üí Phishing Possibility: 99.72% (Threshold: 95%) üö©
üîç FINAL RESULT: PHISHING üö´
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
