In [2]:
!pip install datasets
!pip install xgboost
!pip install scikit-learn
!pip install pandas numpy



In [3]:
import pandas as pd
import numpy as np
import re
import math
from urllib.parse import urlparse
from collections import Counter
from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import joblib

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [4]:
def extract_features(url):
    """
    Extract handcrafted features from a URL for phishing detection.
    Compatible with both Colab training and Streamlit deployment.
    """
    features = {}

    try:
        # Parse the URL
        parsed = urlparse(url)
        hostname = parsed.netloc
        path = parsed.path
        query = parsed.query

        # 1. URL Length Features
        features['url_length'] = len(url)
        features['hostname_length'] = len(hostname)
        features['path_length'] = len(path)

        # 2. Count Special Characters
        features['num_dots'] = url.count('.')
        features['num_hyphens'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_slashes'] = url.count('/')
        features['num_question_marks'] = url.count('?')
        features['num_equals'] = url.count('=')
        features['num_at'] = url.count('@')
        features['num_ampersands'] = url.count('&')
        features['num_percent'] = url.count('%')

        # 3. Count Digits
        features['num_digits'] = sum(c.isdigit() for c in url)
        features['digit_ratio'] = sum(c.isdigit() for c in url) / max(len(url), 1)

        # 4. Special Character Ratio
        special_chars = len(re.findall(r'[^a-zA-Z0-9]', url))
        features['special_char_ratio'] = special_chars / max(len(url), 1)

        # 5. Number of Subdomains
        if hostname:
            subdomains = hostname.split('.')
            features['num_subdomains'] = max(len(subdomains) - 2, 0)
        else:
            features['num_subdomains'] = 0

        # 6. HTTPS Check
        features['is_https'] = 1 if parsed.scheme == 'https' else 0

        # 7. IP Address Detection
        ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
        features['has_ip_address'] = 1 if re.search(ip_pattern, hostname) else 0

        # 8. Suspicious TLD Check
        suspicious_tlds = ['.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click']
        features['suspicious_tld'] = 1 if any(url.endswith(tld) for tld in suspicious_tlds) else 0

        # 9. Query Parameters Count
        features['num_query_params'] = len(query.split('&')) if query else 0

        # 10. Shannon Entropy of Hostname
        if hostname:
            entropy = 0
            for count in Counter(hostname).values():
                probability = count / len(hostname)
                entropy -= probability * math.log2(probability)
            features['hostname_entropy'] = entropy
        else:
            features['hostname_entropy'] = 0

        # 11. Suspicious Keywords
        suspicious_keywords = ['login', 'signin', 'bank', 'account', 'update',
                              'verify', 'secure', 'password', 'confirm', 'admin']
        features['has_suspicious_keyword'] = 1 if any(kw in url.lower() for kw in suspicious_keywords) else 0

        # 12. Abnormal URL Features
        features['abnormal_url'] = 1 if hostname and hostname not in url else 0

        # 13. Double Slash in Path
        features['double_slash_redirecting'] = 1 if '//' in path else 0

    except Exception as e:
        print(f"Error parsing URL: {url}, Error: {e}")
        features = {
            'url_length': 0, 'hostname_length': 0, 'path_length': 0,
            'num_dots': 0, 'num_hyphens': 0, 'num_underscores': 0,
            'num_slashes': 0, 'num_question_marks': 0, 'num_equals': 0,
            'num_at': 0, 'num_ampersands': 0, 'num_percent': 0,
            'num_digits': 0, 'digit_ratio': 0, 'special_char_ratio': 0,
            'num_subdomains': 0, 'is_https': 0, 'has_ip_address': 0,
            'suspicious_tld': 0, 'num_query_params': 0, 'hostname_entropy': 0,
            'has_suspicious_keyword': 0, 'abnormal_url': 0, 'double_slash_redirecting': 0
        }

    return features

print("✅ Feature extraction function defined!")

✅ Feature extraction function defined!


In [5]:
print("Loading dataset from HuggingFace...")
dataset = load_dataset("shawhin/phishing-site-classification")

# Convert to pandas DataFrame
df = pd.DataFrame(dataset['train'])

print(f"✅ Dataset loaded successfully!")
print(f"Total samples: {len(df)}")
print(f"\nDataset columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

# Show label distribution (handle different column names)
label_col = 'label' if 'label' in df.columns else 'labels'
print(f"\nLabel distribution:")
print(df[label_col].value_counts())

Loading dataset from HuggingFace...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/21.4k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/450 [00:00<?, ? examples/s]

✅ Dataset loaded successfully!
Total samples: 2100

Dataset columns: ['text', 'labels']

First few rows:
                                                text  labels
0  http://bazurashop.com/idex.html?sfm_from_ifram...       1
1                            hollywoodland.org/?p=29       0
2  tunnekylmyysmiddletonii.02leds.com/me4xcdste0....       1
3     usa-people-search.com/Find-Carla-Brown-IA.aspx       0
4                 inspire-consultants.com.my/487ygfh       1

Label distribution:
labels
0    1054
1    1046
Name: count, dtype: int64


In [6]:
print("Cleaning and renaming columns...")

# Rename columns to standard names
if 'text' in df.columns:
    df = df.rename(columns={'text': 'url'})
elif 'url' not in df.columns:
    # Find the URL column
    url_col = [col for col in df.columns if 'url' in col.lower() or 'text' in col.lower()][0]
    df = df.rename(columns={url_col: 'url'})

if 'labels' in df.columns:
    df = df.rename(columns={'labels': 'label'})
elif 'label' not in df.columns:
    # Find the label column
    label_col = [col for col in df.columns if 'label' in col.lower() or 'target' in col.lower()][0]
    df = df.rename(columns={label_col: 'label'})

# Remove any rows with missing URLs
df = df.dropna(subset=['url'])

# Ensure labels are binary (0 and 1)
# Handle various label formats
if df['label'].dtype == 'object' or df['label'].dtype == 'string':
    df['label'] = df['label'].map({
        'legitimate': 0, 'phishing': 1,
        'good': 0, 'bad': 1,
        'benign': 0, 'malicious': 1,
        '0': 0, '1': 1,
        0: 0, 1: 1
    })
else:
    # Already numeric, ensure it's 0 or 1
    df['label'] = df['label'].astype(int)

# Remove any remaining NaN labels
df = df.dropna(subset=['label'])

# Verify labels are only 0 and 1
unique_labels = df['label'].unique()
print(f"Unique labels found: {unique_labels}")

print(f"\n✅ Data cleaned!")
print(f"Final dataset size: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nLabel distribution:")
print(f"  Legitimate (0): {sum(df['label'] == 0)}")
print(f"  Phishing (1): {sum(df['label'] == 1)}")


Cleaning and renaming columns...
Unique labels found: [1 0]

✅ Data cleaned!
Final dataset size: 2100
Columns: ['url', 'label']

Label distribution:
  Legitimate (0): 1054
  Phishing (1): 1046


In [7]:
print("Extracting features from all URLs...")
print("This may take a few minutes...")

features_list = []
for idx, url in enumerate(df['url']):
    if idx % 1000 == 0:
        print(f"Progress: {idx}/{len(df)} URLs processed", end='\r')
    features_list.append(extract_features(url))

print(f"\nProgress: {len(df)}/{len(df)} URLs processed - Complete!")

# Create features DataFrame
X = pd.DataFrame(features_list)
y = df['label'].values

print(f"\n✅ Feature extraction complete!")
print(f"Feature matrix shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")
print(f"\nFeature statistics:")
print(X.describe())

Extracting features from all URLs...
This may take a few minutes...
Progress: 0/2100 URLs processedProgress: 1000/2100 URLs processedProgress: 2000/2100 URLs processed
Progress: 2100/2100 URLs processed - Complete!

✅ Feature extraction complete!
Feature matrix shape: (2100, 24)
Features: ['url_length', 'hostname_length', 'path_length', 'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes', 'num_question_marks', 'num_equals', 'num_at', 'num_ampersands', 'num_percent', 'num_digits', 'digit_ratio', 'special_char_ratio', 'num_subdomains', 'is_https', 'has_ip_address', 'suspicious_tld', 'num_query_params', 'hostname_entropy', 'has_suspicious_keyword', 'abnormal_url', 'double_slash_redirecting']

Feature statistics:
        url_length  hostname_length  path_length     num_dots  num_hyphens  \
count  2100.000000      2100.000000  2100.000000  2100.000000  2100.000000   
mean     52.278571         0.006667    43.502381     2.248571     0.875238   
std      45.404382         0.305505 

In [8]:
print("\nSplitting data into train and test sets (80-20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✅ Data split complete!")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")
print(f"Training - Phishing: {sum(y_train)}, Legitimate: {len(y_train) - sum(y_train)}")
print(f"Testing - Phishing: {sum(y_test)}, Legitimate: {len(y_test) - sum(y_test)}")



Splitting data into train and test sets (80-20)...
✅ Data split complete!
Training samples: 1680 (80.0%)
Testing samples: 420 (20.0%)
Training - Phishing: 837, Legitimate: 843
Testing - Phishing: 209, Legitimate: 211


In [9]:
print("\n" + "="*60)
print("TRAINING RANDOM FOREST MODEL")
print("="*60)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("Training Random Forest...")
rf_model.fit(X_train, y_train)
print("✅ Random Forest training complete!")

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Calculate metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("\n" + "="*60)
print("RANDOM FOREST - MODEL EVALUATION")
print("="*60)
print(f"\n🎯 Accuracy: {rf_accuracy * 100:.2f}%\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Legitimate', 'Phishing']))
print("\nConfusion Matrix:")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(f"                 Predicted")
print(f"              Legit  Phishing")
print(f"Actual Legit   {cm_rf[0][0]:5d}    {cm_rf[0][1]:5d}")
print(f"     Phishing  {cm_rf[1][0]:5d}    {cm_rf[1][1]:5d}")

# Feature Importance
print("\n📊 Top 10 Most Important Features (Random Forest):")
feature_importance_rf = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

for idx, row in feature_importance_rf.head(10).iterrows():
    print(f"  {row['feature']:30s} {row['importance']:.4f}")


TRAINING RANDOM FOREST MODEL
Training Random Forest...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.4s


✅ Random Forest training complete!

RANDOM FOREST - MODEL EVALUATION

🎯 Accuracy: 77.62%

Classification Report:
              precision    recall  f1-score   support

  Legitimate       0.76      0.82      0.79       211
    Phishing       0.80      0.74      0.77       209

    accuracy                           0.78       420
   macro avg       0.78      0.78      0.78       420
weighted avg       0.78      0.78      0.78       420


Confusion Matrix:
                 Predicted
              Legit  Phishing
Actual Legit     172       39
     Phishing     55      154

📊 Top 10 Most Important Features (Random Forest):
  special_char_ratio             0.1426
  url_length                     0.1360
  path_length                    0.1138
  num_digits                     0.1028
  digit_ratio                    0.0971
  num_slashes                    0.0966
  has_suspicious_keyword         0.0783
  num_dots                       0.0751
  num_hyphens                    0.0725
  num_undersc

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


In [10]:
print("\n" + "="*60)
print("TRAINING XGBOOST MODEL")
print("="*60)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

print("Training XGBoost...")
xgb_model.fit(X_train, y_train)
print("✅ XGBoost training complete!")

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Calculate metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print("\n" + "="*60)
print("XGBOOST - MODEL EVALUATION")
print("="*60)
print(f"\n🎯 Accuracy: {xgb_accuracy * 100:.2f}%\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Legitimate', 'Phishing']))
print("\nConfusion Matrix:")
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print(f"                 Predicted")
print(f"              Legit  Phishing")
print(f"Actual Legit   {cm_xgb[0][0]:5d}    {cm_xgb[0][1]:5d}")
print(f"     Phishing  {cm_xgb[1][0]:5d}    {cm_xgb[1][1]:5d}")

# Feature Importance
print("\n📊 Top 10 Most Important Features (XGBoost):")
feature_importance_xgb = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

for idx, row in feature_importance_xgb.head(10).iterrows():
    print(f"  {row['feature']:30s} {row['importance']:.4f}")



TRAINING XGBOOST MODEL
Training XGBoost...
✅ XGBoost training complete!

XGBOOST - MODEL EVALUATION

🎯 Accuracy: 78.10%

Classification Report:
              precision    recall  f1-score   support

  Legitimate       0.76      0.82      0.79       211
    Phishing       0.81      0.74      0.77       209

    accuracy                           0.78       420
   macro avg       0.78      0.78      0.78       420
weighted avg       0.78      0.78      0.78       420


Confusion Matrix:
                 Predicted
              Legit  Phishing
Actual Legit     174       37
     Phishing     55      154

📊 Top 10 Most Important Features (XGBoost):
  has_suspicious_keyword         0.4420
  num_digits                     0.0705
  num_hyphens                    0.0580
  num_slashes                    0.0554
  num_underscores                0.0517
  num_equals                     0.0441
  num_at                         0.0404
  num_question_marks             0.0387
  num_dots                 

In [11]:
print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print(f"XGBoost Accuracy:       {xgb_accuracy * 100:.2f}%")

if rf_accuracy >= xgb_accuracy:
    best_model = rf_model
    best_model_name = "Random Forest"
    best_accuracy = rf_accuracy
    print(f"\n🏆 Best Model: Random Forest")
else:
    best_model = xgb_model
    best_model_name = "XGBoost"
    best_accuracy = xgb_accuracy
    print(f"\n🏆 Best Model: XGBoost")

print(f"Best Model Accuracy: {best_accuracy * 100:.2f}%")



MODEL COMPARISON
Random Forest Accuracy: 77.62%
XGBoost Accuracy:       78.10%

🏆 Best Model: XGBoost
Best Model Accuracy: 78.10%


In [12]:
print("\n" + "="*60)
print("SAVING MODEL AND FEATURES")
print("="*60)

# Save the best model
joblib.dump(best_model, 'phishing_detector_model.pkl')
print(f"✅ Model saved as 'phishing_detector_model.pkl'")

# Save feature column names (important for consistent prediction)
feature_columns = X.columns.tolist()
joblib.dump(feature_columns, 'feature_columns.pkl')
print(f"✅ Feature columns saved as 'feature_columns.pkl'")

# Save model info
model_info = {
    'model_name': best_model_name,
    'accuracy': best_accuracy,
    'n_features': len(feature_columns),
    'feature_names': feature_columns
}
joblib.dump(model_info, 'model_info.pkl')
print(f"✅ Model info saved as 'model_info.pkl'")

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)
print("\nFiles created:")
print("  1. phishing_detector_model.pkl")
print("  2. feature_columns.pkl")
print("  3. model_info.pkl")
print("\nDownload these files to use with Streamlit app!")
print("="*60)


SAVING MODEL AND FEATURES
✅ Model saved as 'phishing_detector_model.pkl'
✅ Feature columns saved as 'feature_columns.pkl'
✅ Model info saved as 'model_info.pkl'

✅ TRAINING COMPLETE!

Files created:
  1. phishing_detector_model.pkl
  2. feature_columns.pkl
  3. model_info.pkl

Download these files to use with Streamlit app!


In [13]:
from google.colab import files
files.download('phishing_detector_model.pkl')
files.download('feature_columns.pkl')
files.download('model_info.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>