In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [None]:
# 📌 Step 4: Load the dataset
# Read the phishing CSV into a DataFrame, then print its column index and
# its first five rows as plain text.
df = pd.read_csv(filepath_or_buffer="/content/dataset_phishing.csv")
print("Columns:", df.columns)
print(df.head(n=5))


Columns: Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_

In [None]:


# Show how many rows carry each label so the class balance is visible.
print(df['status'].value_counts())

# Encode: phishing = 1, legitimate = 0
# Series.map turns any value that is missing from the dict into NaN without
# raising, so verify every row was encoded before the column is used as the
# training target.
status_to_label = {'legitimate': 0, 'phishing': 1}
encoded_status = df['status'].map(status_to_label)
unmapped_rows = encoded_status.isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'status'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'status' column: {unexpected}")
df['phishing'] = encoded_status

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


In [None]:
# Preview the first five rows of the DataFrame (rendered as the cell output).
df.head(n=5)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status,phishing
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,1,0,45,-1,0,1,1,4,legitimate,0
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,0,0,77,5767,0,0,1,2,phishing,1
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,0,0,14,4004,5828815,0,1,0,phishing,1
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,0,0,62,-1,107721,0,0,3,legitimate,0
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,1,0,224,8175,8725,0,0,6,legitimate,0


In [None]:
# 📌 Step 5: Split features and target
# Keep only genuine model inputs in X:
#   - 'phishing' is the encoded target itself,
#   - 'status' is that same target as text, so leaving it in leaks the answer
#     to the model,
#   - 'url' is the raw URL string, not a numeric feature.
# Both 'status' and 'url' hold strings, which the classifier cannot be fitted on.
X = df.drop(columns=['url', 'status', 'phishing'])
y = df['phishing']

In [None]:
# 📌 Step 6: Train/test split
# Hold out 20% of the rows for testing. Stratifying on y and fixing the seed
# are both requested explicitly via keyword arguments.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# 📌 Step 7: Build the Random Forest model
# Hyperparameters are gathered in one mapping and unpacked into the constructor.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,  # use all CPU cores
}
model = RandomForestClassifier(**rf_params)

In [None]:
# 📌 Step 8: Train the model
# Fit on the training partition; as the last expression of the cell, the
# value returned by fit() is what the cell displays.
model.fit(X=X_train, y=y_train)

✅ Why Random Forest for phishing detection
1️⃣ Handles many features well
Phishing detection usually uses a mix of:

URL-based features (length, special characters)

Domain info (WHOIS, age)

Technical flags (HTTPS, redirects)

Random Forest works well with:

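Numerical features, plus categorical features once they are encoded as numbers (scikit-learn's implementation requires numeric input)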

High-dimensional data

Nonlinear relationships

2️⃣ Robust to overfitting

Unlike a single decision tree, Random Forest combines many trees.

Each tree is trained on a random (bootstrap) sample of the rows and considers a random subset of the features at each split.

This randomness + averaging → lower variance, better generalization.

3️⃣ Good default performance

For tabular data, it’s one of the strongest baseline models.

Often works “out of the box” with minimal tuning.

Rarely underperforms by a huge margin.

4️⃣ Easy to interpret feature importance

You can inspect which features matter for phishing detection.

E.g., is URL length the top factor? Is SSL presence more predictive?

5️⃣ Fast to train compared to complex models

Faster than XGBoost or deep learning for medium-sized datasets.

Parallelizable → uses all CPU cores (n_jobs=-1)

In [None]:
# 📌 Step 9: Evaluate
# Predict on the held-out rows, compute both metrics, then print them.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print("\nClassification Report:\n", report_text)
print("Accuracy:", test_accuracy)



Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     11600
           1       0.94      0.95      0.95      6130

    accuracy                           0.96     17730
   macro avg       0.96      0.96      0.96     17730
weighted avg       0.96      0.96      0.96     17730

Accuracy: 0.9642413987591653


⚡ Is this good?
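✅ YES! — A 96% accuracy with high F1 for both legitimate and phishing classes is very good for phishing detection.
⚠️ Caveat: the report above covers 17,730 test rows (11,600 in class 0 and 6,130 in class 1). A 20% split of the balanced 11,430-row dataset loaded in Step 4 would have about 2,286 rows, so this output appears to come from a different run — restart the kernel and re-run all cells before relying on these numbers.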
✅ Your recall for phishing (0.95) is key — you don’t want to miss phishing links.
✅ Your precision for phishing (0.94) means few false alarms.

In [None]:
# 📌 Step 10: Save the trained model
# Serialize the model object to disk with joblib, then confirm on stdout.
joblib.dump(value=model, filename="phishing_detector.pkl")
print("✅ Model saved as phishing_detector.pkl")

✅ Model saved as phishing_detector.pkl


In [None]:
# 📌 Install dependencies
!pip install -q scikit-learn pandas xgboost lightgbm joblib

# 📌 Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

import xgboost as xgb
import lightgbm as lgb
  # Pick your dataset_small.csv

# 📌 Load the dataset
# NOTE: the assignments in this section rebind df, X, y and the four split
# variables that earlier cells of this notebook also assign.
df = pd.read_csv(filepath_or_buffer="/content/dataset_full (1).csv")
print("Columns:", df.columns)
print(df.head(n=5))

# 📌 Features & target
# Take the label column first; every remaining column is used as a feature.
y = df['phishing']
X = df.drop(columns=['phishing'])

# 📌 Split train/test
# 20% of the rows are held out, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# ----------------------------------------------------
# Shared train -> evaluate -> save routine
# ----------------------------------------------------
def _fit_report_save(estimator, label, out_path, X_train, y_train, X_test, y_test):
    """Train one classifier, print its test metrics, and persist it.

    Both boosted models below go through exactly the same steps, so the
    steps live here once instead of being copy-pasted per model.

    Parameters
    ----------
    estimator : object
        Unfitted classifier exposing ``fit(X, y)`` and ``predict(X)``.
    label : str
        Display name inserted into every printed message.
    out_path : str
        File name handed to ``joblib.dump`` for the fitted estimator.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    The predictions returned by ``estimator.predict(X_test)``.
    """
    print(f"\n🚀 Training {label}...")
    estimator.fit(X_train, y_train)

    # Predict & evaluate on the held-out split
    predictions = estimator.predict(X_test)
    print(f"\n📊 {label} Classification Report:\n", classification_report(y_test, predictions))
    print(f"{label} Accuracy:", accuracy_score(y_test, predictions))

    # Persist the fitted estimator
    joblib.dump(estimator, out_path)
    print(f"✅ {label} model saved as {out_path}")
    return predictions


# ----------------------------------------------------
# ✅ XGBoost Model
# ----------------------------------------------------
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=15,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

y_pred_xgb = _fit_report_save(
    xgb_model, "XGBoost", "phishing_detector_xgb.pkl",
    X_train, y_train, X_test, y_test,
)

# ----------------------------------------------------
# ✅ LightGBM Model
# ----------------------------------------------------
lgb_model = lgb.LGBMClassifier(
    n_estimators=250,
    max_depth=15,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

y_pred_lgb = _fit_report_save(
    lgb_model, "LightGBM", "phishing_detector_lgb.pkl",
    X_train, y_train, X_test, y_test,
)



Columns: Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url',
       ...
       'qty_ip_resolved', 'qty_nameservers', 'qty_mx_servers', 'ttl_hostname',
       'tls_ssl_certificate', 'qty_redirects', 'url_google_index',
       'domain_google_index', 'url_shortened', 'phishing'],
      dtype='object', length=112)
   qty_dot_url  qty_hyphen_url  qty_underline_url  qty_slash_url  \
0            3               0                  0              1   
1            5               0                  1              3   
2            2               0                  0              1   
3            4               0                  2              5   
4            2               0                  0              0   

   qty_questionmark_url  qty_equal_url  qty_at_url  qty_and_url  \
0                     0              0           0            