# GuardNet: FIXED Pipeline Training (No Data Leakage)

This notebook trains phishing detection models using URL-only features with proper sklearn Pipelines to prevent data leakage.

---

## Cell 1: Install Dependencies

In [None]:
!pip install -q pandas numpy matplotlib seaborn scikit-learn

## Cell 2: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix, roc_curve, auc, precision_recall_curve,
    accuracy_score, precision_score, recall_score, f1_score, average_precision_score
)
import json
import pickle
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and convergence
# warnings are hidden as well - consider narrowing it.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet first, then the seaborn palette.
# NOTE(review): presumably the palette is set last so the style sheet's colour
# cycle does not replace it - keep this order unless verified otherwise.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation message so the reader can see the setup cell ran to completion.
print("‚úÖ All libraries imported successfully!")

## Cell 3: Upload Dataset

In [None]:
from google.colab import files
import os

# Ask the user for the dataset through Colab's upload widget.
print("üì§ Please upload your dataset file (PhiUSIIL_Phishing_URL_Dataset.csv)")
uploaded = files.upload()

# `uploaded` maps file names to their contents. If nothing was selected the mapping
# is empty, which previously surfaced as an opaque IndexError on the line below.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and select the dataset CSV.")

# Only the first uploaded file is used as the dataset.
dataset_file = next(iter(uploaded))
print(f"\n‚úÖ Dataset uploaded: {dataset_file}")
print(f"   File size: {os.path.getsize(dataset_file) / (1024*1024):.2f} MB")

## Cell 4: Load Data & Feature Separation

In [None]:
print("="*80)
print("GuardNet: FIXED Pipeline Training (No Data Leakage)")
print("="*80)

print("\n[1/12] Loading dataset...")
df = pd.read_csv(dataset_file)
print(f"Dataset shape: {df.shape}")
print(f"Total samples: {df.shape[0]:,}")

print("\n[2/12] Separating URL-only features from HTML features...")

# URL-only features (static, extracted from URL string only)
url_features = [
    'URLLength', 'DomainLength', 'IsDomainIP', 'TLDLength',
    'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
    'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL',
    'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
    'SpacialCharRatioInURL', 'IsHTTPS', 'URLSimilarityIndex', 'CharContinuationRate',
    'TLDLegitimateProb', 'URLCharProb'
]

# Keep only the requested columns that the CSV actually contains, but never drop
# the others silently: a partially matching dataset would otherwise train on fewer
# features than advertised without anyone noticing.
existing_url_features = [f for f in url_features if f in df.columns]
missing_url_features = [f for f in url_features if f not in df.columns]
print(f"  URL-only features: {len(existing_url_features)}")
if missing_url_features:
    print(f"  WARNING: {len(missing_url_features)} expected feature(s) not found in dataset: {missing_url_features}")
if not existing_url_features:
    # Fail here with a clear message instead of later inside the scaler/model.
    raise ValueError("None of the expected URL-only feature columns are present in the dataset.")

# Feature matrix and target used by every later cell.
X_url = df[existing_url_features]
y = df['label']

# Per-label sample counts; the notebook treats label 1 as legitimate and 0 as phishing.
class_counts = y.value_counts()
print(f"\nüìä Class Distribution:")
print(f"  Legitimate (1): {class_counts[1]:,} ({class_counts[1]/len(y)*100:.2f}%)")
print(f"  Phishing (0):   {class_counts[0]:,} ({class_counts[0]/len(y)*100:.2f}%)")

## Cell 5: Visualization - Class Distribution

In [None]:
print("\n[3/12] Creating class distribution visualization...")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders its rows by frequency, not by label. The counts are therefore
# re-ordered explicitly to [label 0, label 1] so they line up with the fixed class
# names and colours below (0 = Phishing, 1 = Legitimate). Previously the majority
# class was always drawn first and could be labelled with the wrong name.
class_names = ['Phishing', 'Legitimate']
ordered_counts = class_counts.reindex([0, 1], fill_value=0).values

colors = ['#ff6b6b', '#51cf66']
ax1.pie(ordered_counts, labels=class_names, autopct='%1.1f%%',
        colors=colors, startangle=90, textprops={'fontsize': 12, 'weight': 'bold'})
ax1.set_title('Dataset Class Distribution', fontsize=14, weight='bold', pad=20)

bars = ax2.bar(class_names, ordered_counts, color=colors, alpha=0.8, edgecolor='black')
ax2.set_ylabel('Number of Samples', fontsize=12, weight='bold')
ax2.set_title('Sample Count by Class', fontsize=14, weight='bold', pad=20)
ax2.grid(axis='y', alpha=0.3)
# Annotate each bar with its exact sample count.
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}', ha='center', va='bottom', fontsize=11, weight='bold')

plt.tight_layout()
plt.savefig('01_class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 01_class_distribution.png")

## Cell 6: Split Data & Create Pipelines

In [None]:
print("\n[4/12] Splitting dataset...")
# Stratified hold-out split: 20% of the rows become the test set, seeded for repeatability.
split_parts = train_test_split(X_url, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train_rows, n_features = X_train.shape
n_test_rows = X_test.shape[0]
print(f"  Training set: {n_train_rows:,} samples")
print(f"  Test set:     {n_test_rows:,} samples")
print(f"  Features:     {n_features} (URL-only)")

print("\n[5/12] Creating sklearn Pipelines (FIXED - no leakage)...")

# Each pipeline bundles its own scaler with the classifier, so the scaler is only
# ever fitted on whatever data the pipeline itself is fitted on.
lr_classifier = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', lr_classifier),
])

rf_settings = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(**rf_settings)),
])

print("   ‚úÖ Pipelines created (scaler + model)")

## Cell 7: Train Models

In [None]:
print("\n[6/12] Training models...")

# (start message, pipeline, completion message) for each model, fitted in this order.
training_plan = (
    ("  üîµ Training Logistic Regression Pipeline...", lr_pipeline, "     ‚úÖ LR Pipeline trained"),
    ("  üå≤ Training Random Forest Pipeline...", rf_pipeline, "     ‚úÖ RF Pipeline trained"),
)
for start_message, pipeline, done_message in training_plan:
    print(start_message)
    pipeline.fit(X_train, y_train)
    print(done_message)

# Persist the fitted pipelines (scaler + classifier together) as pickle files.
for pickle_path, pipeline in (('lr_pipeline.pkl', lr_pipeline), ('rf_pipeline.pkl', rf_pipeline)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(pipeline, f)
print("   ‚úÖ Pipelines saved")

## Cell 8: Generate Predictions & Calculate Metrics

In [None]:
def _evaluate_predictions(y_true, y_pred, y_proba):
    """Return accuracy, precision, recall, F1 and ROC-AUC for one model.

    Precision/recall/F1 use scikit-learn's default positive label (1), which this
    notebook labels as 'Legitimate'. AUC is the area under the ROC curve built
    from `y_proba`.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_proba)
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'AUC': auc(fpr, tpr),
    }


print("\n[7/12] Generating predictions...")
# Hard class predictions on the held-out test set.
y_pred_lr = lr_pipeline.predict(X_test)
y_pred_rf = rf_pipeline.predict(X_test)

# Second predict_proba column = score for the second class
# (assumes labels are {0, 1}, so this is the label-1 probability).
proba_lr_all = lr_pipeline.predict_proba(X_test)
proba_rf_all = rf_pipeline.predict_proba(X_test)
y_proba_lr = proba_lr_all[:, 1]
y_proba_rf = proba_rf_all[:, 1]

# "Hybrid" model: unweighted average of the two probabilities, thresholded at 0.5.
y_proba_hybrid = (y_proba_lr + y_proba_rf) / 2
y_pred_hybrid = (y_proba_hybrid >= 0.5).astype(int)

print("\n[8/12] Calculating performance metrics...")
# model name -> (hard predictions, label-1 probabilities); reused by the plotting cells.
models = {
    'Logistic Regression': (y_pred_lr, y_proba_lr),
    'Random Forest': (y_pred_rf, y_proba_rf),
    'Hybrid Model': (y_pred_hybrid, y_proba_hybrid)
}

metrics_results = {}
for model_name, (y_pred, y_proba) in models.items():
    model_scores = _evaluate_predictions(y_test, y_pred, y_proba)
    metrics_results[model_name] = model_scores
    print(f"\n  üìä {model_name}:")
    for metric, value in model_scores.items():
        print(f"    {metric:12s}: {value:.4f}")

## Cell 9: Visualization - Performance Metrics

In [None]:
print("\n[9/12] Creating performance metrics comparison...")
fig, ax = plt.subplots(figsize=(12, 6))

# Rows = models, columns = metrics.
metrics_df = pd.DataFrame(metrics_results).T
metric_names = metrics_df.columns
x = np.arange(len(metric_names))
width = 0.25

# One bar group per metric; the bars of the three models sit side by side,
# shifted by -width, 0 and +width around the tick position.
colors = ['#3498db', '#2ecc71', '#e74c3c']
for i, ((model, model_scores), color) in enumerate(zip(metrics_df.iterrows(), colors)):
    shifted_x = x + (i - 1) * width
    bars = ax.bar(shifted_x, model_scores, width,
                  label=model, color=color, alpha=0.8, edgecolor='black')
    # Print the score on top of each bar.
    for bar in bars:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, bar_top, f'{bar_top:.3f}',
                ha='center', va='bottom', fontsize=9, weight='bold')

ax.set_title('Model Performance Comparison (URL-only Features)', fontsize=14, weight='bold', pad=20)
ax.set_xlabel('Metrics', fontsize=12, weight='bold')
ax.set_ylabel('Score', fontsize=12, weight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metric_names, fontsize=11)
ax.set_ylim(0, 1.1)
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11, loc='lower right')

plt.tight_layout()
plt.savefig('02_performance_metrics.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 02_performance_metrics.png")

## Cell 10: Visualization - Confusion Matrices

In [None]:
print("\n[10/12] Creating confusion matrices...")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One heatmap per model, left to right in the order of the `models` dict.
tick_names = ['Phishing', 'Legitimate']
for idx, model_name in enumerate(models):
    y_pred = models[model_name][0]
    panel = axes[idx]
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel,
        cbar_kws={'label': 'Count'},
        annot_kws={'size': 14, 'weight': 'bold'},
    )
    panel.set_title(f'{model_name}\nConfusion Matrix', fontsize=12, weight='bold')
    panel.set_ylabel('True Label', fontsize=11, weight='bold')
    panel.set_xlabel('Predicted Label', fontsize=11, weight='bold')
    panel.set_xticklabels(tick_names)
    panel.set_yticklabels(tick_names)

plt.tight_layout()
plt.savefig('03_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 03_confusion_matrices.png")

## Cell 11: Visualization - ROC Curves

In [None]:
print("\n[11/12] Creating ROC curves...")
fig, ax = plt.subplots(figsize=(10, 8))

# Line colours, one per model; also reused by the precision-recall cell.
colors_roc = ['#3498db', '#2ecc71', '#e74c3c']
for color, (model_name, model_outputs) in zip(colors_roc, models.items()):
    y_proba = model_outputs[1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    curve_label = f'{model_name} (AUC = {roc_auc:.4f})'
    ax.plot(fpr, tpr, color=color, lw=2.5, label=curve_label)

# Diagonal reference line for a classifier with no skill.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.5000)')

ax.set_title('ROC Curves - URL-only Features', fontsize=14, weight='bold', pad=20)
ax.set_xlabel('False Positive Rate', fontsize=12, weight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, weight='bold')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.grid(alpha=0.3)
ax.legend(loc='lower right', fontsize=11)

plt.tight_layout()
plt.savefig('04_roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 04_roc_curves.png")

## Cell 12: Visualization - Precision-Recall Curves

In [None]:
print("\n[12/12] Creating Precision-Recall curves...")
fig, ax = plt.subplots(figsize=(10, 8))

# Uses the per-model colours (`colors_roc`) defined in the ROC cell.
for color, (model_name, model_outputs) in zip(colors_roc, models.items()):
    y_proba = model_outputs[1]
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    avg_precision = average_precision_score(y_test, y_proba)
    curve_label = f'{model_name} (AP = {avg_precision:.4f})'
    ax.plot(recall, precision, color=color, lw=2.5, label=curve_label)

ax.set_title('Precision-Recall Curves - URL-only Features', fontsize=14, weight='bold', pad=20)
ax.set_xlabel('Recall', fontsize=12, weight='bold')
ax.set_ylabel('Precision', fontsize=12, weight='bold')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.grid(alpha=0.3)
ax.legend(loc='lower left', fontsize=11)

plt.tight_layout()
plt.savefig('05_precision_recall_curves.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 05_precision_recall_curves.png")

## Cell 13: Visualization - Feature Importance

In [None]:
print("\nCreating feature importance plot...")
# Pull the fitted forest out of the pipeline to read its per-feature importances.
rf_model = rf_pipeline.named_steps['classifier']
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
})
# Keep (at most) the 20 highest-ranked features, most important first.
feature_importance = importance_table.sort_values('importance', ascending=False).head(20)

fig, ax = plt.subplots(figsize=(10, 8))
row_positions = range(len(feature_importance))
bars = ax.barh(row_positions, feature_importance['importance'],
               color='#2ecc71', alpha=0.8, edgecolor='black')
ax.set_yticks(row_positions)
ax.set_yticklabels(feature_importance['feature'], fontsize=10)
ax.set_xlabel('Importance Score', fontsize=12, weight='bold')
ax.set_title('Top 20 Feature Importance (Random Forest, URL-only)', fontsize=14, weight='bold', pad=20)
# Flip the axis so the most important feature is drawn at the top.
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

# Write the numeric importance at the end of every bar.
for bar in bars:
    width = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height()/2.
    ax.text(width, bar_middle, f'{width:.4f}',
            ha='left', va='center', fontsize=9, weight='bold')

plt.tight_layout()
plt.savefig('06_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 06_feature_importance.png")

## Cell 14: Visualization - Learning Curve (FIXED)

In [None]:
print("\nCreating learning curve (FIXED - using Pipeline)...")
# The whole pipeline (scaler + forest) is passed in, so it is re-fitted from scratch
# on every training subset of every CV fold.
# `shuffle=True` is required for `random_state` to have any effect: without it
# learning_curve ignores the seed and simply takes prefixes of each training fold.
train_sizes, train_scores, val_scores = learning_curve(
    rf_pipeline, X_train, y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5, n_jobs=-1, shuffle=True, random_state=42
)

# Aggregate over the CV folds (axis 1) for each training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
# Mean curves with a +/- one standard deviation band around each.
ax.plot(train_sizes, train_mean, 'o-', color='#3498db', lw=2.5, label='Training Score')
ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='#3498db')
ax.plot(train_sizes, val_mean, 'o-', color='#e74c3c', lw=2.5, label='Validation Score')
ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2, color='#e74c3c')

ax.set_xlabel('Training Set Size', fontsize=12, weight='bold')
ax.set_ylabel('Score', fontsize=12, weight='bold')
ax.set_title('Learning Curve (Random Forest, FIXED Pipeline)', fontsize=14, weight='bold', pad=20)
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
# NOTE(review): fixed y-range; scores below 0.7 would fall outside the visible area.
ax.set_ylim(0.7, 1.05)

plt.tight_layout()
plt.savefig('07_learning_curve.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 07_learning_curve.png")

## Cell 15: Visualization - Score Distribution

In [None]:
print("\nCreating score distribution plot...")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# y_proba_hybrid is a positional NumPy array while y_test is a pandas Series that still
# carries the original row index, so the class masks are converted to plain boolean
# arrays before being used to index the scores.
is_phishing = (y_test == 0).to_numpy()
is_legitimate = (y_test == 1).to_numpy()
phishing_scores = y_proba_hybrid[is_phishing]
legitimate_scores = y_proba_hybrid[is_legitimate]

# Left panel: overlaid histograms of the hybrid score, split by the true class.
axes[0].hist(phishing_scores, bins=50, alpha=0.7, color='#ff6b6b', label='Phishing (Actual)', edgecolor='black')
axes[0].hist(legitimate_scores, bins=50, alpha=0.7, color='#51cf66', label='Legitimate (Actual)', edgecolor='black')
axes[0].axvline(x=0.5, color='black', linestyle='--', lw=2, label='Decision Threshold')
axes[0].set_xlabel('Prediction Probability', fontsize=12, weight='bold')
axes[0].set_ylabel('Frequency', fontsize=12, weight='bold')
axes[0].set_title('Prediction Score Distribution (Hybrid Model)', fontsize=13, weight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(alpha=0.3)

# Right panel: the same two score sets as box plots.
# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`); setting the tick labels on the axis afterwards works on every version.
data_to_plot = [phishing_scores, legitimate_scores]
bp = axes[1].boxplot(data_to_plot, patch_artist=True, widths=0.6)
axes[1].set_xticklabels(['Phishing', 'Legitimate'])
colors = ['#ff6b6b', '#51cf66']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[1].axhline(y=0.5, color='black', linestyle='--', lw=2, label='Decision Threshold')
axes[1].set_ylabel('Prediction Probability', fontsize=12, weight='bold')
axes[1].set_title('Score Distribution by Actual Class', fontsize=13, weight='bold')
axes[1].legend(fontsize=11)
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('08_score_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 08_score_distribution.png")

## Cell 16: Cross-Validation (FIXED)

In [None]:
print("\nPerforming cross-validation (FIXED - using Pipeline)...")
# 5-fold CV on the training split only. Passing the full pipelines means the scaler
# is re-fitted inside every fold together with the classifier.
cv_scores_lr = cross_val_score(lr_pipeline, X_train, y_train, cv=5, n_jobs=-1)
cv_scores_rf = cross_val_score(rf_pipeline, X_train, y_train, cv=5, n_jobs=-1)

fig, ax = plt.subplots(figsize=(10, 6))
# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`); setting the tick labels on the axis afterwards works on every version.
bp = ax.boxplot([cv_scores_lr, cv_scores_rf],
                 patch_artist=True, widths=0.5)
ax.set_xticklabels(['Logistic Regression', 'Random Forest'])

colors = ['#3498db', '#2ecc71']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_ylabel('Accuracy Score', fontsize=12, weight='bold')
ax.set_title('5-Fold Cross-Validation Results (FIXED Pipeline)', fontsize=14, weight='bold', pad=20)
ax.grid(alpha=0.3, axis='y')

# Annotate each box with its mean fold score (boxes sit at x = 1, 2).
means = [cv_scores_lr.mean(), cv_scores_rf.mean()]
for i, mean in enumerate(means, 1):
    ax.text(i, mean, f'Œº = {mean:.4f}', ha='center', va='bottom',
            fontsize=10, weight='bold', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.savefig('09_cross_validation.png', dpi=300, bbox_inches='tight')
plt.show()
print("   ‚úÖ Saved: 09_cross_validation.png")

print(f"\n  CV Scores (Logistic Regression): {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")
print(f"  CV Scores (Random Forest):       {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std():.4f})")

## Cell 17: Save Report & Summary

In [None]:
print("\nSaving summary report...")
# Number of URL-only columns that were actually found in the dataset and used for
# training. Reported dynamically so the text can never disagree with `feature_count`.
n_features_used = len(existing_url_features)

# JSON-serialisable summary of the run: dataset stats, pipeline flags, test-set
# metrics per model and cross-validation mean/std.
report = {
    'dataset': {
        'total_samples': len(df),
        'features_used': f'URL-only ({n_features_used} features)',
        'feature_count': n_features_used,
        'phishing_samples': int(class_counts[0]),
        'legitimate_samples': int(class_counts[1])
    },
    'pipeline': {
        'fixed': True,
        'uses_sklearn_pipeline': True,
        'no_data_leakage': True
    },
    'models': {},
    'cross_validation': {
        'Logistic Regression': {'mean': float(cv_scores_lr.mean()), 'std': float(cv_scores_lr.std())},
        'Random Forest': {'mean': float(cv_scores_rf.mean()), 'std': float(cv_scores_rf.std())}
    }
}

# Cast metric values to built-in floats so json can serialise them.
for model_name, metrics in metrics_results.items():
    report['models'][model_name] = {k: float(v) for k, v in metrics.items()}

with open('training_report_fixed.json', 'w') as f:
    json.dump(report, f, indent=2)
print("   ‚úÖ Saved: training_report_fixed.json")

print("\n" + "="*80)
print("‚úÖ TRAINING COMPLETE (FIXED PIPELINE - NO DATA LEAKAGE)")
print("="*80)
print("\nüîß FIXES APPLIED:")
print("  ‚úÖ StandardScaler wrapped inside sklearn Pipeline")
print("  ‚úÖ Cross-validation uses Pipeline (no leakage)")
print("  ‚úÖ Learning curve uses Pipeline (no leakage)")
print(f"  ‚úÖ URL-only features ({n_features_used} features) for main evaluation")
print("\nüìù NOTE: Metrics may be lower than before - this is CORRECT.")
print("   Previous high scores were due to data leakage.")
print("   These scores represent TRUE generalization performance.")
print("="*80)

## Cell 18: Download All Files

In [None]:
from google.colab import files
import glob

print("üì• Downloading all generated files...\n")

# Everything to hand back to the user, in download order: every PNG in the working
# directory, then the two pickled pipelines, then the JSON report.
artifact_paths = [
    *glob.glob('*.png'),
    'lr_pipeline.pkl',
    'rf_pipeline.pkl',
    'training_report_fixed.json',
]
for artifact_path in artifact_paths:
    print(f"  Downloading: {artifact_path}")
    files.download(artifact_path)

print("\n‚úÖ All files downloaded!")