In [4]:
import pandas as pd
import numpy as np
import re
import tldextract
from urllib.parse import urlparse
import whois
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Read the labelled feature table from disk.
df = pd.read_csv("../data/phising.csv")

# Columns used as model inputs. The order of this list is the column order of
# X, which is what gets saved alongside the model later in the notebook.
useful_feature = [
    'SSLfinal_State',
    'URL_of_Anchor',
    'Prefix_Suffix',
    'web_traffic',
    'having_Sub_Domain',
    'Request_URL',
    'Links_in_tags',
    'SFH',
    'Google_Index',
    'age_of_domain',
    'Page_Rank',
    'having_IP_Address',
    'Statistical_report',
    'DNSRecord',
    'URL_Length',
    'having_At_Symbol',
    'on_mouseover',
    'port',
    'Links_pointing_to_page',
    'Redirect',
    'double_slash_redirecting',
    'HTTPS_token',
    'Abnormal_URL',
    'Shortining_Service',
    'Domain_registeration_length',
]

# Feature matrix and target vector ("Result" is the label column).
X = df[useful_feature]
y = df["Result"]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Build a 200-tree random forest with a fixed seed and fit it on the training
# split (fit returns the estimator itself, so the chained form is equivalent).
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows.
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.9696969696969697

Classification Report:
               precision    recall  f1-score   support

          -1       0.98      0.95      0.96       956
           1       0.96      0.98      0.97      1255

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



In [8]:
# -----------------------------
# Helper functions for discrete encoding
# -----------------------------

def having_ip_address(url):
    """Encode whether the URL's host is a raw IP address.

    Only the host part is inspected, so dotted numbers elsewhere in the URL
    (for example a version string such as ``/v1.2.3.4/`` in the path) are not
    mistaken for an IP. Dotted-decimal and dotted-hexadecimal IPv4 hosts and
    IPv6 literals are recognised.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        -1 if the host is an IP address, 1 otherwise. 1 is the benign value,
        as in ``url_length`` and ``port``; the prediction cell treats -1 as
        the phishing label.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input such as "1.2.3.4/login": reparse it as a
            # network location so the host can be isolated.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): fall back to
        # scanning the raw string.
        return -1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 1

    octet = r'(?:0x[0-9a-f]+|\d+)'
    is_ipv4 = re.fullmatch(rf'{octet}(?:\.{octet}){{3}}', host) is not None
    # urlparse strips the port from hostname, so a remaining ":" can only
    # come from an IPv6 literal.
    is_ipv6 = ":" in host
    return -1 if (is_ipv4 or is_ipv6) else 1

def url_length(url):
    """Bucket a URL by its character count.

    Args:
        url: URL string.

    Returns:
        1 for fewer than 54 characters, 0 for 54 to 75 characters
        (inclusive), and -1 for anything longer.
    """
    n_chars = len(url)
    if n_chars > 75:
        return -1
    if n_chars >= 54:
        return 0
    return 1

def shortining_service(url):
    """Encode whether the URL is served by a known link-shortening host.

    The host is compared against the shortener list as a whole name (or as a
    subdomain of one). A plain substring test is not enough: "t.co" occurs
    inside unrelated hosts such as "microsoft.com" or "reddit.com".

    Args:
        url: URL string, with or without a scheme.

    Returns:
        -1 if the host is a listed shortener, 1 otherwise. 1 is the benign
        value, as in ``url_length`` and ``port``.
    """
    shorteners = ("bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co")
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input such as "bit.ly/abc".
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        # Unparseable authority: no host to compare against.
        host = ""

    is_shortener = any(
        host == service or host.endswith("." + service)
        for service in shorteners
    )
    return -1 if is_shortener else 1

def having_at_symbol(url):
    """Encode whether the URL contains an "@" character.

    Args:
        url: URL string.

    Returns:
        -1 if "@" appears anywhere in the URL, 1 otherwise. 1 is the benign
        value, as in ``url_length`` and ``port``; the prediction cell treats
        -1 as the phishing label.
    """
    return -1 if "@" in url else 1

def double_slash_redirecting(url):
    """Encode whether "//" appears after the scheme separator.

    In "http://" the separator starts at index 5 and in "https://" at index
    6, so a last "//" beyond index 6 means a second one is embedded in the
    rest of the URL. Checking the position rather than counting occurrences
    also catches scheme-less URLs that embed a "//".

    Args:
        url: URL string.

    Returns:
        -1 if a "//" occurs past the scheme separator, 1 otherwise. 1 is the
        benign value, as in ``url_length`` and ``port``.
    """
    return -1 if url.rfind("//") > 6 else 1

def prefix_suffix(domain):
    """Encode whether the domain label contains a hyphen.

    Args:
        domain: Domain label (e.g. "pay-pal"); may be empty or None.

    Returns:
        -1 if the domain contains "-", 1 otherwise (including for an empty
        or missing domain). 1 is the benign value, as in ``url_length`` and
        ``port``.
    """
    return -1 if domain and "-" in domain else 1

def having_sub_domain(subdomain):
    """Encode how many subdomain labels precede the registered domain.

    A leading "www" label is ignored so that "www.example.com" is treated the
    same as "example.com".

    Args:
        subdomain: Dot-separated subdomain part (e.g. "mail" or "a.b"); may
            be empty or None.

    Returns:
        1 for no subdomain, 0 for exactly one label, -1 for two or more.
        1 is the benign value, as in ``url_length`` and ``port``.
    """
    labels = [label for label in (subdomain or "").split(".") if label]
    if labels and labels[0].lower() == "www":
        labels = labels[1:]

    count = len(labels)
    if count == 0:
        return 1
    if count == 1:
        return 0
    return -1

def ssl_final_state(url):
    """Simplified SSL check: does the URL use the https scheme?

    This is a stand-in that only looks at the scheme; no certificate is
    inspected. The full "https://" prefix is required so that a scheme-less
    host that merely starts with "https" (e.g. "https-login.example.net") is
    not counted as secure.

    Args:
        url: URL string.

    Returns:
        1 if the URL starts with "https://" (case-insensitive), -1 otherwise.
    """
    return 1 if url.lower().startswith("https://") else -1

def domain_registration_length(domain):
    """Encode how much registration time a domain has left, via WHOIS.

    The value is based on the record's expiration date (time remaining until
    the registration lapses), not on how long ago the domain was created.
    NOTE(review): this follows the "expires in <= 1 year" rule of the UCI
    phishing feature description; confirm it matches the training data.

    Args:
        domain: Domain name to look up (should include the public suffix,
            e.g. "example.com").

    Returns:
        1 if the registration expires more than 365 days from now, -1
        otherwise. -1 is also returned when the lookup fails or the record
        has no usable expiration date (best-effort behaviour).
    """
    try:
        record = whois.whois(domain)
        expiration = record.expiration_date
        if isinstance(expiration, list):
            # Some records carry several dates; use the first one.
            expiration = expiration[0] if expiration else None
        if not expiration:
            return -1

        # Normalise to UTC so tz-aware and naive WHOIS dates can both be
        # compared against "now" without raising.
        expires_at = pd.Timestamp(expiration)
        if expires_at.tzinfo is None:
            expires_at = expires_at.tz_localize("UTC")
        else:
            expires_at = expires_at.tz_convert("UTC")

        remaining_days = (expires_at - pd.Timestamp.now(tz="UTC")).days
        return 1 if remaining_days > 365 else -1
    except Exception:
        # Network errors, unknown TLDs and unparsable dates all degrade to
        # the pessimistic value; KeyboardInterrupt/SystemExit still propagate.
        return -1

def port(url):
    """Encode whether the URL uses a standard web port.

    Args:
        url: URL string.

    Returns:
        1 if the URL has no explicit port or uses 80 or 443; -1 for any
        other port, and also when the port (or the URL's authority) is
        malformed — ``urlparse``/``.port`` raise ValueError in that case.
    """
    try:
        port_number = urlparse(url).port
    except ValueError:
        # Non-numeric or out-of-range port, or an unparseable authority.
        return -1
    return 1 if port_number in (80, 443, None) else -1

def https_token(url):
    """Encode whether the literal token "https" is embedded in the host name.

    This is distinct from the scheme check in ``ssl_final_state``: it flags
    hosts such as "https-www-paypal.com.evil.net" that put "https" in the
    domain to look trustworthy.
    NOTE(review): this follows the "HTTPS token in the domain part" rule of
    the UCI phishing feature description; confirm against the training data.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        -1 if the host contains "https", 1 otherwise (including when no host
        can be parsed).
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input: reparse as a network location.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        host = ""
    return -1 if "https" in host else 1

def abnormal_url(domain, url):
    """Encode whether the given domain is missing from the URL.

    The comparison is case-insensitive because host names are not case
    sensitive. An empty or missing domain is treated as abnormal, since an
    empty string would otherwise trivially match any URL.

    Args:
        domain: Domain label expected to appear in the URL.
        url: URL string.

    Returns:
        1 if ``domain`` occurs in ``url``, -1 if it does not (or if
        ``domain`` is empty). 1 is the benign value, as in ``url_length``
        and ``port``.
    """
    if not domain:
        return -1
    return 1 if domain.lower() in url.lower() else -1

# Placeholder features for external/network checks
def web_traffic_placeholder(url):
    """Placeholder for the web-traffic feature; no lookup is performed.

    Value legend: 1 = high, 0 = medium, -1 = low.

    Args:
        url: URL string (unused).

    Returns:
        Always 0.
    """
    medium = 0
    return medium

def google_index_placeholder(url):
    """Placeholder for the Google-index feature; no lookup is performed.

    Value legend: 1 = indexed, -1 = not indexed.

    Args:
        url: URL string (unused).

    Returns:
        Always 1.
    """
    indexed = 1
    return indexed

def page_rank_placeholder(domain):
    """Placeholder for the page-rank feature; no lookup is performed.

    Args:
        domain: Domain name (unused).

    Returns:
        Always 1.
    """
    fixed_value = 1
    return fixed_value

def dns_record_placeholder(domain):
    """Placeholder for the DNS-record feature; no DNS query is made.

    Args:
        domain: Domain name (unused).

    Returns:
        Always 1.
    """
    fixed_value = 1
    return fixed_value

def links_pointing_to_page_placeholder(url):
    """Placeholder for the inbound-links feature; no lookup is performed.

    Args:
        url: URL string (unused).

    Returns:
        Always 0.
    """
    fixed_value = 0
    return fixed_value

def statistical_report_placeholder(url):
    """Placeholder for the statistical-report feature; no lookup is performed.

    Args:
        url: URL string (unused).

    Returns:
        Always -1.
    """
    fixed_value = -1
    return fixed_value

# -----------------------------
# Main feature extractor
# -----------------------------
def extract_phishing_features(url):
    """Build the one-row feature frame for a single URL.

    URL-derived features are computed by the helper encoders in this
    notebook; features that would need page content or external services are
    filled with fixed placeholder values. Keys are inserted in the same order
    as the ``useful_feature`` list used to train the model.

    Args:
        url: URL string to encode.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and one column per
        feature.
    """
    extracted = tldextract.extract(url)
    domain = extracted.domain
    subdomain = extracted.subdomain
    # WHOIS needs the registrable name ("example.com"); the bare label
    # ("example") on its own cannot be looked up.
    registrable_domain = ".".join(
        part for part in (domain, extracted.suffix) if part
    )

    features = {
        'SSLfinal_State': ssl_final_state(url),
        'URL_of_Anchor': 0,                 # placeholder
        'Prefix_Suffix': prefix_suffix(domain),
        'web_traffic': web_traffic_placeholder(url),
        'having_Sub_Domain': having_sub_domain(subdomain),
        'Request_URL': -1,                  # placeholder
        'Links_in_tags': 0,                 # placeholder
        'SFH': 0,                           # placeholder
        'Google_Index': google_index_placeholder(url),
        'age_of_domain': -1,                # placeholder
        'Page_Rank': page_rank_placeholder(domain),
        'having_IP_Address': having_ip_address(url),
        'Statistical_report': statistical_report_placeholder(url),
        'DNSRecord': dns_record_placeholder(domain),
        'URL_Length': url_length(url),
        'having_At_Symbol': having_at_symbol(url),
        'on_mouseover': 1,                  # placeholder
        'port': port(url),
        'Links_pointing_to_page': links_pointing_to_page_placeholder(url),
        'Redirect': 0,                      # placeholder
        'double_slash_redirecting': double_slash_redirecting(url),
        'HTTPS_token': https_token(url),
        'Abnormal_URL': abnormal_url(domain, url),
        'Shortining_Service': shortining_service(url),
        'Domain_registeration_length': domain_registration_length(registrable_domain),
    }

    return pd.DataFrame([features])

In [9]:
# Example URL to classify; the extractor returns a one-row DataFrame of
# encoded features for it.
url = "https://chatgpt.com/"
extracted_url_features = extract_phishing_features(url)

In [10]:
# Classify the single extracted row.
prediction = model.predict(extracted_url_features)

# A predicted label of -1 is reported as phishing; any other label is
# reported as legitimate.
print(" Phishing Website Detected!" if prediction[0] == -1 else " Legitimate Website")

 Legitimate Website


In [11]:
import os

import joblib

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
os.makedirs("../models", exist_ok=True)

# Persist the fitted model together with the training column order, so that
# inference code can rebuild inputs with the same layout.
joblib.dump(model, "../models/phishing_model.pkl")
joblib.dump(X.columns.tolist(), "../models/model_columns.pkl")

print("✅ New model saved successfully")


✅ New model saved successfully
