# 🛡️ GuardNet - Random Forest Training (URL-ONLY Features)

## ⚠️ PENTING: Notebook ini dioptimalkan untuk browser extension

Notebook ini melatih RF **HANYA dengan URL-based features** karena:
- CORS memblokir fetch content dari banyak website
- Content-based features (LineOfCode, NoOfImage, dll) tidak tersedia saat runtime
- Model yang menggunakan content features akan memberikan prediksi yang salah

---

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files
import warnings
# Suppress every warning category for the rest of the session to keep the
# notebook output compact.
# NOTE(review): this also hides numerical/deprecation warnings from pandas and
# scikit-learn — confirm that is intended before relying on a silent run.
warnings.filterwarnings('ignore')

# Confirmation banner shown once the import cell has run to completion.
print("‚úÖ Libraries imported successfully!")

## 📤 Step 1: Upload Dataset

In [None]:
print("üìÇ Please upload the dataset file (PhiUSIIL_Phishing_URL_Dataset.csv)...")
uploaded = files.upload()
filename = list(uploaded.keys())[0]
print(f"\n‚úÖ File uploaded: {filename}")

In [None]:
# Load the uploaded CSV, then report its dimensions and the class balance.
df = pd.read_csv(filename)
print(f"üìä Dataset shape: {df.shape}", "\nüìä Label distribution:", sep="\n")
print(df['label'].value_counts())
# Legend for the label encoding used in the dataset.
print("\n0 = Phishing, 1 = Legitimate")

## 🔧 Step 2: Define URL-ONLY Features

Hanya menggunakan fitur yang bisa diekstrak dari URL saja (tanpa fetch content).

In [None]:
# Features that can be derived from the URL string alone, i.e. without
# fetching the page content.
URL_ONLY_FEATURES = [
    'URLLength',              # Length of the URL
    'DomainLength',           # Length of the domain
    'IsDomainIP',             # Whether the domain is an IP address
    'URLSimilarityIndex',     # Similarity to popular brands
    'CharContinuationRate',   # Rate of repeated characters
    'TLDLegitimateProb',      # Probability that the TLD is legitimate
    'URLCharProb',            # URL character probability
    'TLDLength',              # Length of the TLD
    'NoOfSubDomain',          # Number of subdomains
    'HasObfuscation',         # Is obfuscation present?
    'NoOfObfuscatedChar',     # Number of obfuscated characters
    'ObfuscationRatio',       # Obfuscation ratio
    'NoOfLettersInURL',       # Number of letters in the URL
    'LetterRatioInURL',       # Ratio of letters in the URL
    'NoOfDigitsInURL',        # Number of digits in the URL (dataset spells it 'Degits')
    'DigitRatioInURL',        # Ratio of digits in the URL
    'NoOfEqualsInURL',        # Number of '=' in the URL
    'NoOfQMarkInURL',         # Number of '?' in the URL
    'NoOfAmpersandInURL',     # Number of '&' in the URL
    'NoOfOtherSpecialCharsInURL',  # Number of other special characters
    'SpacialCharRatioInURL',  # Special-character ratio (typo comes from the dataset)
    'IsHTTPS',                # Uses HTTPS?
]

# Feature name used in this notebook -> column name as spelled in the dataset.
FEATURE_MAP = {
    'NoOfDigitsInURL': 'NoOfDegitsInURL',  # Dataset typo
    'DigitRatioInURL': 'DegitRatioInURL',  # Dataset typo
    'SpacialCharRatioInURL': 'SpacialCharRatioInURL',  # Same spelling on both sides
}

print(f"üìã Using {len(URL_ONLY_FEATURES)} URL-only features")
print(f"\nüö´ Excluding content-based features:")
# One output line per group of content-based columns left out of training.
for excluded_line in (
    "   - LineOfCode, LargestLineLength",
    "   - HasTitle, DomainTitleMatchScore, URLTitleMatchScore",
    "   - HasFavicon, Robots, IsResponsive",
    "   - NoOfURLRedirect, NoOfSelfRedirect",
    "   - HasDescription, NoOfPopup, NoOfiFrame",
    "   - HasExternalFormSubmit, HasSocialNet",
    "   - HasSubmitButton, HasHiddenFields, HasPasswordField",
    "   - Bank, Pay, Crypto, HasCopyrightInfo",
    "   - NoOfImage, NoOfCSS, NoOfJS",
    "   - NoOfSelfRef, NoOfEmptyRef, NoOfExternalRef",
):
    print(excluded_line)

In [None]:
# Build the feature matrix X (one column per entry of URL_ONLY_FEATURES, in
# that order) and the target vector y from the loaded dataset.
#
# X is created on df's index up front so that a constant fill for a missing
# feature gets one value per row immediately, instead of producing a
# zero-length column that only becomes usable after a later re-alignment.
X = pd.DataFrame(index=df.index)

for feat in URL_ONLY_FEATURES:
    # Resolve the dataset's spelling of the column (typo handling).
    dataset_feat = FEATURE_MAP.get(feat, feat)

    if dataset_feat in df.columns:
        X[feat] = df[dataset_feat]
    elif feat in df.columns:
        # Fall back to the corrected spelling if the dataset happens to use it.
        X[feat] = df[feat]
    else:
        # Keep the column so the exported feature order stays fixed, but make
        # the gap visible in the output.
        print(f"‚ö†Ô∏è Feature not found: {feat} (mapped: {dataset_feat})")
        X[feat] = 0

# Target variable
y = df['label']

# Force every column to numeric; missing and non-numeric entries become 0.
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)

print(f"\n‚úÖ Features prepared: {X.shape}")
print(f"\nFeature columns: {list(X.columns)}")

## 🎯 Step 3: Train Random Forest

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"üìä Training set: {len(X_train)} samples")
print(f"üìä Test set: {len(X_test)} samples")

In [None]:
# Hyperparameters for the URL-only Random Forest.
rf_params = {
    'n_estimators': 15,          # Number of trees in the forest
    'max_depth': 6,              # Maximum depth of each tree
    'min_samples_leaf': 50,      # Larger leaves for better generalisation
    'min_samples_split': 100,    # More conservative splitting
    'random_state': 42,          # Reproducible training
    'n_jobs': -1,                # Use all available cores
    'class_weight': 'balanced',  # Compensate for class imbalance
}
rf_model = RandomForestClassifier(**rf_params)

print("üöÄ Training Random Forest (URL-only features)...")
rf_model.fit(X_train, y_train)
print("‚úÖ Training complete!")

# Show the class order of the fitted model; the JSON export and the
# probability printouts assume index 0 = Phishing and index 1 = Legitimate.
print(f"\nüìã Class order: {rf_model.classes_}")
for class_note in (
    "   classes_[0] = label 0 = Phishing",
    "   classes_[1] = label 1 = Legitimate",
):
    print(class_note)

In [None]:
# Score the fitted forest on the held-out test split.
y_pred = rf_model.predict(X_test)

# Display names for labels 0 and 1, in that order.
class_names = ['Phishing (0)', 'Legitimate (1)']

print("üìä Model Evaluation (URL-only features):", "=" * 50, sep="\n")
print(f"\n‚úÖ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nüìã Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))
print(f"\nüî¢ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Rank the features by the importance the fitted forest assigns to them.
importance_table = pd.DataFrame({
    'feature': URL_ONLY_FEATURES,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("üèÜ Feature Importance (URL-only):")
print("=" * 50)
# One line per feature: name, score and a text bar scaled to 50 units for an
# importance of 1.0.
for feat_name, score in zip(
    feature_importance['feature'], feature_importance['importance']
):
    bar = "‚ñà" * int(score * 50)
    print(f"{feat_name:25s} {score:.4f} {bar}")

## 📦 Step 4: Export Model to JSON

In [None]:
def tree_to_json(tree, feature_names, precision=4):
    """
    Convert a fitted sklearn decision tree into a nested dict for JavaScript.

    Internal nodes become
    ``{"featureIndex", "featureName", "threshold", "left", "right"}`` and
    leaves become ``{"value": [P(class_0), P(class_1)]}``, i.e.
    ``[P(Phishing), P(Legitimate)]`` for this notebook's label encoding.

    Args:
        tree: Object exposing a ``tree_`` attribute with the array fields
            ``feature``, ``threshold``, ``children_left``, ``children_right``
            and ``value`` (as a fitted sklearn tree estimator does).
        feature_names: Sequence mapping a feature index to its name; the name
            is only included to ease debugging on the JavaScript side.
        precision: Number of decimals kept for thresholds and probabilities.
            Defaults to 4, which matches the previous hard-coded rounding.

    Returns:
        A JSON-serialisable nested dict describing the tree from the root.
    """
    tree_ = tree.tree_
    # sklearn marks leaf nodes with feature == -2 (TREE_UNDEFINED).
    leaf_marker = -2

    def leaf_probabilities(node):
        # Plain Python floats keep json.dump independent of the numpy dtype.
        weights = [float(v) for v in tree_.value[node][0]]
        total = sum(weights)
        if not total > 0:
            # A zero (or NaN) total would otherwise emit NaN, which is not
            # valid JSON for the browser; fall back to an uninformative,
            # uniform distribution instead.
            return [round(1.0 / len(weights), precision) for _ in weights]
        # probs[0] = P(Phishing), probs[1] = P(Legitimate)
        return [round(w / total, precision) for w in weights]

    def recurse(node):
        feature_index = int(tree_.feature[node])
        if feature_index == leaf_marker:
            return {"value": leaf_probabilities(node)}

        return {
            "featureIndex": feature_index,
            "featureName": feature_names[feature_index],  # For debugging
            "threshold": round(float(tree_.threshold[node]), precision),
            "left": recurse(int(tree_.children_left[node])),
            "right": recurse(int(tree_.children_right[node])),
        }

    return recurse(0)


def export_rf_to_json(rf_model, feature_names, accuracy, output_path='rf_model.json'):
    """
    Write a fitted Random Forest to a JSON file consumable from JavaScript.

    Every estimator in ``rf_model.estimators_`` is converted with
    ``tree_to_json`` (progress is printed per tree) and stored together with
    model metadata and the feature name list.

    Args:
        rf_model: Fitted forest exposing ``estimators_`` and ``max_depth``.
        feature_names: Feature names, in the column order used for training.
        accuracy: Evaluation accuracy; stored rounded to 4 decimals.
        output_path: Destination file path.

    Returns:
        Tuple ``(output_path, file_size)`` with the file size in KB.
    """
    import os

    estimators = rf_model.estimators_
    n_trees = len(estimators)
    depth_limit = rf_model.max_depth

    trees_json = []
    for position, estimator in enumerate(estimators, start=1):
        trees_json.append(tree_to_json(estimator, feature_names))
        print(f"  Tree {position}/{n_trees} exported")

    # Descriptive metadata; the forest size and depth are repeated at the top
    # level of the payload below.
    model_info = {
        "name": "GuardNet Random Forest (URL-Only)",
        "version": "2.0.0",
        "description": "RF trained on URL-only features for browser compatibility",
        "n_estimators": n_trees,
        "max_depth": depth_limit,
        "n_features": len(feature_names),
        "accuracy": round(accuracy, 4),
        "feature_type": "url_only",
        "class_order": "[P(Phishing), P(Legitimate)]"
    }
    model_json = {
        "model_info": model_info,
        "feature_names": feature_names,
        "n_estimators": n_trees,
        "max_depth": depth_limit,
        "trees": trees_json
    }

    with open(output_path, 'w') as f:
        json.dump(model_json, f, indent=2)

    # Report the size on disk in kilobytes.
    return output_path, os.path.getsize(output_path) / 1024


print("üì¶ Exporting Random Forest to JSON...")
accuracy = accuracy_score(y_test, y_pred)
output_path, file_size = export_rf_to_json(rf_model, URL_ONLY_FEATURES, accuracy)
print(f"\n‚úÖ Model exported to: {output_path}")
print(f"üìÅ File size: {file_size:.2f} KB")

## 🧪 Step 5: Verify the Model

In [None]:
# Spot-check the fitted model on a few rows taken from the test split.
print("üß™ Testing model predictions...")
print("=" * 50)

# Rows at positions 0, 5 and 10 of the test split, each kept as a one-row
# frame so it can be passed straight to the model.
test_samples = [
    (X_test.iloc[pos:pos + 1], y_test.iloc[pos], f"Sample {number}")
    for number, pos in enumerate((0, 5, 10), start=1)
]

for features, actual_label, name in test_samples:
    proba = rf_model.predict_proba(features)[0]
    pred_label = rf_model.predict(features)[0]

    # Label 0 is phishing; anything else is reported as legitimate.
    actual_text = 'Phishing' if actual_label == 0 else 'Legitimate'
    predicted_text = 'Phishing' if pred_label == 0 else 'Legitimate'

    print(f"\n{name}:")
    print(f"  Actual: {actual_text}")
    print(f"  Predicted: {predicted_text}")
    print(f"  P(Phishing): {proba[0]:.4f}")
    print(f"  P(Legitimate): {proba[1]:.4f}")

## 📥 Step 6: Download Model

In [None]:
print("üì• Downloading rf_model.json...")
files.download('rf_model.json')
print("\n‚úÖ Download complete!")
print("\nüìù Next steps:")
print("  1. Letakkan file rf_model.json di folder: GuardNet Test 1.2/models/")
print("  2. Replace file rf_model.json yang sudah ada")
print("  3. Update sandbox.js untuk menggunakan URL-only features")
print("  4. Set RF_ENABLED: true di HYBRID_CONFIG")
print("  5. Reload extension di Chrome")