In [None]:
# ============================================================
# HYBRID ENSEMBLE MODEL FOR TOP 20 DATASET
# (DT + ANN + KNN with SOFT VOTING)
# ============================================================

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# ============================================================
# LOAD DATASET
# ============================================================
# Load the pre-selected feature table; the path is hard-coded under /content.
df = pd.read_csv("/content/Dataset_useful_top20.csv")

print("Dataset shape:", df.shape)

# Target column
target_col = "Type"

# Replace the label column, in place, with its integer category codes.
# pandas numbers the categories in category order and uses -1 for missing
# values. NOTE(review): which original label becomes code 1 is not visible
# here; the code below reads column 1 of predict_proba as the positive
# class - confirm the mapping against the raw "Type" values.
df[target_col] = df[target_col].astype("category").cat.codes

# Features are every column except the target; labels are the encoded target.
X = df.drop(columns=[target_col])
y = df[target_col]

print("Feature shape:", X.shape)

# ============================================================
# TRAIN-TEST SPLIT
# ============================================================
# Hold out 20% of the rows, keeping the class ratio of y in both parts;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ============================================================
# DEFINE BASE MODELS
# ============================================================
# Each base model is a Pipeline that starts with mean imputation, so the
# preprocessing is fitted together with the estimator on the training rows.

# Decision Tree (no scaling step in this pipeline)
dt_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("dt", DecisionTreeClassifier(max_depth=10, random_state=42))
])

# ANN (MLP): imputation, then standardisation, then a 128-64 hidden-layer network
mlp_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(
        hidden_layer_sizes=(128, 64),
        max_iter=300,
        random_state=42,
        early_stopping=True
    ))
])

# KNN: imputation, then standardisation, then a 5-nearest-neighbour vote
knn_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5))
])

# ============================================================
# HYBRID ENSEMBLE MODEL (SOFT VOTING)
# ============================================================
# Combine the three pipelines; no per-model weights are passed.
hybrid_model = VotingClassifier(
    estimators=[
        ("dt", dt_model),
        ("mlp", mlp_model),
        ("knn", knn_model)
    ],
    voting="soft"   # vote on predicted class probabilities rather than hard labels
)

# ============================================================
# TRAIN MODEL
# ============================================================
print("\nTraining Hybrid Model...")
hybrid_model.fit(X_train, y_train)

# ============================================================
# EVALUATION
# ============================================================
# Hard predictions feed the metrics below. y_proba holds column 1 of
# predict_proba for the held-out rows and is not used again in this cell.
y_pred = hybrid_model.predict(X_test)
y_proba = hybrid_model.predict_proba(X_test)[:, 1]

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ============================================================
# CONFIDENCE PREDICTION FUNCTION
# ============================================================
def predict_sample(sample):
    """Classify a single feature vector with the fitted ``hybrid_model``.

    Parameters
    ----------
    sample : array-like
        One row of feature values, in the column order the model was
        fitted on. It is reshaped to a single-row 2-D input.

    Returns
    -------
    tuple
        ``(label, prob)`` where ``prob`` is column 1 of ``predict_proba``
        for the row and ``label`` is the phishing string when
        ``prob >= 0.5``, otherwise the legitimate string.
    """
    row = np.asarray(sample).reshape(1, -1)

    # The ensemble is fitted on a DataFrame, so scikit-learn remembers the
    # column names and warns when it is later given a bare array. Re-attach
    # the fitted names when they are available and the width matches;
    # otherwise pass the array through and let the model validate it.
    fitted_names = getattr(hybrid_model, "feature_names_in_", None)
    if fitted_names is not None and len(fitted_names) == row.shape[1]:
        row = pd.DataFrame(row, columns=list(fitted_names))

    prob = hybrid_model.predict_proba(row)[0][1]

    # NOTE(review): the label prefixes look like mis-decoded emoji - confirm
    # the file encoding before changing these runtime strings.
    if prob >= 0.5:
        return "‚ö†Ô∏è Phishing", prob
    else:
        return "‚úÖ Legitimate", prob


# ============================================================
# TEST USING DATASET SAMPLE
# ============================================================
import joblib

# Smoke test: run the first dataset row through predict_sample and show it
# next to its encoded label.
# NOTE(review): row 0 is taken from the full dataset X, not from X_test, so it
# may have been part of the training split - this shows the function runs,
# not how well the model generalises.
print("\nTesting on sample row:")
sample = X.iloc[0].values
print("Actual Label:", y.iloc[0])
print("Prediction:", predict_sample(sample))

# Persist the fitted ensemble (preprocessing pipelines included) for the
# inference cell further down.
joblib.dump(value=hybrid_model, filename="/content/hybrid_model.pkl")
print("Model saved!")



Dataset shape: (247950, 21)
Feature shape: (247950, 20)

Training Hybrid Model...

Accuracy: 0.9192780802581165

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92     25708
           1       0.94      0.89      0.91     23882

    accuracy                           0.92     49590
   macro avg       0.92      0.92      0.92     49590
weighted avg       0.92      0.92      0.92     49590


Testing on sample row:
Actual Label: 0
Prediction: ('‚úÖ Legitimate', np.float64(0.024253054005665967))
Model saved!




In [None]:
# %pip installs into the environment of the running kernel; a bare `!pip`
# uses whichever pip is first on the shell PATH, which can be a different one.
%pip install -q tldextract python-whois dnspython joblib



[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/105.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m105.9/105.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m117.0/117.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/331.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [

In [None]:
# Only the checked-out .txt lists are read later, so a shallow clone avoids
# downloading the full history. Clone into the absolute directory that
# load_blacklist_domains is pointed at, and skip the clone when that directory
# already exists so the cell can be re-run.
![ -d /content/Phishing.Database ] || git clone --depth 1 https://github.com/Phishing-Database/Phishing.Database.git /content/Phishing.Database


Cloning into 'Phishing.Database'...
remote: Enumerating objects: 15636, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 15636 (delta 3), reused 3 (delta 2), pack-reused 15628 (from 3)[K
Receiving objects: 100% (15636/15636), 1.30 GiB | 22.24 MiB/s, done.
Resolving deltas: 100% (12409/12409), done.
Updating files: 100% (69/69), done.


In [None]:
# ------------------------------------------------------------
# BLACKLIST_CHECKER.PY
# ------------------------------------------------------------

import os
from urllib.parse import urlparse

def load_blacklist_domains(path):
    """Collect blacklisted host names from every ``.txt`` file under ``path``.

    Each line may be a bare host (``evil.com``), a host with a port or path
    (``evil.com:8080/login``) or a full URL. Every entry is reduced to its
    lower-cased host name with any port, userinfo, path, trailing dot and
    leading ``www.`` removed, so that it matches the form ``is_blacklisted``
    looks up. Blank lines, ``#`` comment lines, lines containing whitespace
    and lines that cannot be parsed are skipped.

    Parameters
    ----------
    path : str
        Directory that is walked recursively.

    Returns
    -------
    set of str
        The distinct normalised host names. The count is also printed.
    """

    def _host_of(raw_line):
        """Return the normalised host named by one line, or "" if there is none."""
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or any(ch.isspace() for ch in entry):
            return ""

        # urlparse only recognises a network location after "//". Without it,
        # "evil.com/login" is treated as a path and "evil.com:8080" as a scheme.
        scheme, sep, _rest = entry.partition("://")
        has_scheme = bool(sep) and scheme.isalnum()
        if not has_scheme and not entry.startswith("//"):
            entry = "//" + entry

        try:
            # .hostname is already lower-cased and excludes port and userinfo.
            host = urlparse(entry).hostname or ""
        except ValueError:
            # e.g. an unbalanced "[" (malformed IPv6 literal): skip the line
            # instead of aborting the whole load.
            return ""

        host = host.rstrip(".")
        if host.startswith("www."):
            host = host[4:]
        return host

    domains = set()

    for root, _dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(".txt"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    host = _host_of(line)
                    if host:
                        domains.add(host)

    print("Loaded domains:", len(domains))
    return domains


# Build the in-memory blacklist once from the cloned repository directory;
# is_blacklisted looks hosts up in this module-level set.
blacklist_domains = load_blacklist_domains(path="/content/Phishing.Database")

def is_blacklisted(url):
    """Report whether the host of ``url`` is in the module-level ``blacklist_domains``.

    The host is taken without port or userinfo and lower-cased. It is looked
    up both exactly as given and, when it starts with ``www.``, with that
    prefix removed, so blacklist entries match whether or not they were
    stored with the prefix. URLs written without a scheme
    (``evil.com/login``) are handled as well.

    Parameters
    ----------
    url : str
        URL or bare host to check.

    Returns
    -------
    bool
        ``True`` when the host is blacklisted; ``False`` otherwise, including
        for empty or unparseable input.
    """
    candidate = str(url).strip()
    if not candidate:
        return False

    # urlparse only fills in the host when the text has "//" before it.
    scheme, sep, _rest = candidate.partition("://")
    has_scheme = bool(sep) and scheme.isalnum()
    if not has_scheme and not candidate.startswith("//"):
        candidate = "//" + candidate

    try:
        # .hostname drops ":port" and "user@" and is already lower-cased.
        host = urlparse(candidate).hostname or ""
    except ValueError:
        return False

    host = host.rstrip(".")
    if not host:
        return False

    if host in blacklist_domains:
        return True
    return host.startswith("www.") and host[4:] in blacklist_domains



Loaded domains: 945057


In [None]:
# ------------------------------------------------------------
# FEATURE_EXTRACTOR.PY
# ------------------------------------------------------------

import re
import math
from urllib.parse import urlparse
import tldextract
from collections import Counter


# ------------------------------------------------------------
# HELPER: ENTROPY CALCULATION
# ------------------------------------------------------------
def calculate_entropy(string):
    """Return the Shannon entropy of ``string`` in bits per character.

    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    each distinct character.

    Parameters
    ----------
    string : str
        Text to measure.

    Returns
    -------
    int or float
        ``0`` for an empty string, otherwise the entropy as a float.
    """
    if not string:
        return 0

    total = len(string)
    # Counter tallies every character in one pass (no repeated str.count
    # scans) and iterates in first-occurrence order, so the floating-point sum
    # is accumulated in the same order on every run rather than in
    # hash-dependent set order.
    frequencies = Counter(string).values()
    return -sum((count / total) * math.log2(count / total) for count in frequencies)


# ------------------------------------------------------------
# MAIN FEATURE EXTRACTION FUNCTION
# ------------------------------------------------------------
def extract_features(url):
    """Compute 20 lexical features of a URL.

    The URL is split with ``urlparse`` (for the path) and
    ``tldextract.extract`` (for subdomain, domain and suffix). "Domain" below
    means the extracted domain joined to its suffix with a dot; when either
    part is empty (for example an IP-address host, which has no suffix) the
    non-empty part is used on its own, so no stray leading or trailing dot is
    counted.

    Parameters
    ----------
    url : object
        The URL; it is converted with ``str()`` first.

    Returns
    -------
    dict
        Feature name -> numeric value, with these keys: url_length,
        domain_length, path_length, number_of_subdomains,
        average_subdomain_length, number_of_digits_in_subdomain,
        entropy_of_url, entropy_of_domain, number_of_special_char_in_url,
        number_of_digits_in_url, number_of_digits_in_domain,
        number_of_dots_in_domain, number_of_dots_in_url,
        number_of_slash_in_url, number_of_equal_in_url,
        number_of_questionmark_in_url, number_of_hyphens_in_domain,
        number_of_hyphens_in_url, having_digits_in_domain,
        having_repeated_digits_in_domain.
    """

    url = str(url)
    parsed = urlparse(url)
    ext = tldextract.extract(url)

    # Join only the non-empty parts. A plain `domain + "." + suffix` yields
    # "192.168.1.1." for hosts without a suffix, which inflates the domain
    # length, dot count and entropy.
    domain = ".".join(part for part in (ext.domain, ext.suffix) if part)
    subdomain = ext.subdomain
    path = parsed.path

    features = {}

    # --------------------------------------------------------
    # BASIC LENGTH FEATURES
    # --------------------------------------------------------
    features["url_length"] = len(url)
    features["domain_length"] = len(domain)
    features["path_length"] = len(path)

    # --------------------------------------------------------
    # SUBDOMAIN FEATURES
    # --------------------------------------------------------
    # An empty subdomain must give an empty list; "".split('.') would give [""].
    subdomains = subdomain.split('.') if subdomain else []

    features["number_of_subdomains"] = len(subdomains)

    if subdomains:
        avg_len = sum(len(s) for s in subdomains) / len(subdomains)
    else:
        avg_len = 0

    features["average_subdomain_length"] = avg_len

    # Digits in subdomain
    features["number_of_digits_in_subdomain"] = sum(
        sum(c.isdigit() for c in s) for s in subdomains
    )

    # --------------------------------------------------------
    # ENTROPY FEATURES
    # --------------------------------------------------------
    features["entropy_of_url"] = calculate_entropy(url)
    features["entropy_of_domain"] = calculate_entropy(domain)

    # --------------------------------------------------------
    # CHARACTER COUNTS
    # --------------------------------------------------------
    # "Special" means any character outside ASCII letters and digits.
    features["number_of_special_char_in_url"] = len(re.findall(r'[^a-zA-Z0-9]', url))
    features["number_of_digits_in_url"] = sum(c.isdigit() for c in url)
    features["number_of_digits_in_domain"] = sum(c.isdigit() for c in domain)

    # --------------------------------------------------------
    # DOT / SLASH / SYMBOL COUNTS
    # --------------------------------------------------------
    features["number_of_dots_in_domain"] = domain.count('.')
    features["number_of_dots_in_url"] = url.count('.')
    features["number_of_slash_in_url"] = url.count('/')
    features["number_of_equal_in_url"] = url.count('=')
    features["number_of_questionmark_in_url"] = url.count('?')

    # --------------------------------------------------------
    # HYPHENS
    # --------------------------------------------------------
    features["number_of_hyphens_in_domain"] = domain.count('-')
    features["number_of_hyphens_in_url"] = url.count('-')

    # --------------------------------------------------------
    # DIGIT FLAGS
    # --------------------------------------------------------
    features["having_digits_in_domain"] = int(any(c.isdigit() for c in domain))

    # 1 when any digit occurs more than once anywhere in the domain; the
    # repeats do not have to be adjacent.
    digit_counts = Counter(c for c in domain if c.isdigit())
    features["having_repeated_digits_in_domain"] = int(
        any(count > 1 for count in digit_counts.values())
    )

    return features


In [None]:
import joblib

# Load the fitted ensemble written by the training cell. joblib.load unpickles
# the file, which can execute arbitrary code - only load artifacts you
# produced yourself.
model = joblib.load("/content/hybrid_model.pkl")


import pandas as pd

# Column names and ORDER of the frame handed to the model: predict_url
# reindexes the extracted features to exactly this list before calling
# predict_proba. NOTE(review): presumably this mirrors the column order of the
# training CSV - confirm against the dataset header before editing.
FEATURE_COLUMNS = [
    "url_length",
    "average_subdomain_length",
    "entropy_of_url",
    "entropy_of_domain",
    "domain_length",
    "number_of_subdomains",
    "number_of_special_char_in_url",
    "number_of_digits_in_url",
    "number_of_digits_in_domain",
    "number_of_dots_in_domain",
    "number_of_slash_in_url",
    "number_of_dots_in_url",
    "path_length",
    "number_of_hyphens_in_domain",
    "number_of_hyphens_in_url",
    "having_digits_in_domain",
    "number_of_equal_in_url",
    "number_of_digits_in_subdomain",
    "having_repeated_digits_in_domain",
    "number_of_questionmark_in_url"
]

def predict_url(url):
    """Classify a URL: blacklist lookup first, then the ML model.

    Parameters
    ----------
    url : str
        URL to classify. Leading and trailing whitespace is removed first so
        a pasted URL with a stray space or newline does not change the
        length and character-count features.

    Returns
    -------
    tuple
        ``(label, prob)``. A blacklist hit returns the blacklist label with
        ``1.0``. Otherwise ``prob`` is column 1 of ``model.predict_proba`` and
        the label is the ML phishing string when ``prob >= 0.5``, else the
        legitimate string.

    Warns
    -----
    RuntimeWarning
        When ``extract_features`` does not return every name in
        ``FEATURE_COLUMNS``. The missing columns are still filled with 0, but
        the mismatch is reported instead of being hidden.
    """
    import warnings

    url = str(url).strip()

    # 1. Blacklist
    if is_blacklisted(url):
        return "üö® Phishing (Blacklist)", 1.0

    # 2. ML
    features = extract_features(url)

    missing = [name for name in FEATURE_COLUMNS if name not in features]
    if missing:
        warnings.warn(
            f"extract_features did not return {missing}; filling with 0",
            RuntimeWarning,
            stacklevel=2,
        )

    # Reindex so the frame has exactly the expected columns in the expected order.
    frame = pd.DataFrame([features]).reindex(columns=FEATURE_COLUMNS, fill_value=0)

    prob = model.predict_proba(frame)[0][1]

    # NOTE(review): the label prefixes look like mis-decoded emoji - confirm
    # the file encoding before changing these runtime strings.
    if prob >= 0.5:
        return "‚ö†Ô∏è Phishing (ML)", prob
    else:
        return "‚úÖ Legitimate", prob

# Smoke test of the full path (blacklist lookup, then model): one well-known
# site and one suspicious-looking URL.
for demo_url in ("https://www.google.com", "http://secure-login-bank.xyz"):
    print(predict_url(demo_url))



('‚úÖ Legitimate', np.float64(0.0005390283725638406))
('‚ö†Ô∏è Phishing (ML)', np.float64(0.6610612835518214))
