# GuardNet - Retrain Model (TF.js Compatible)

Notebook ini akan:
1. Membaca dataset PhiUSIIL
2. Menghitung 22 fitur URL **PERSIS** seperti sandbox.js, lalu menggabungkannya dengan 28 fitur konten dari dataset (total 50 fitur)
3. Train Logistic Regression
4. Export ke TensorFlow.js (format kompatibel)

**PENTING**: Menggunakan Keras 2.x untuk kompatibilitas dengan TensorFlow.js!

In [None]:
# === Cell 1: FORCE KERAS 2.x (MUST BE RUN FIRST!) ===
# The flag has to be in the environment BEFORE tensorflow is imported.
import os

# Request the legacy Keras 2.x implementation via the environment flag.
os.environ.update({'TF_USE_LEGACY_KERAS': '1'})

print('‚úÖ Keras 2.x legacy mode enabled')

In [None]:
# === Cell 2: Install Dependencies ===
# IPython shell escape (only works inside a notebook): installs the packages
# imported by the cells below.
!pip install tensorflowjs pandas scikit-learn tensorflow numpy

In [None]:
# === Cell 3: Import Libraries ===
import pandas as pd
import numpy as np
import json
import re
import math
from urllib.parse import urlparse
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

print(f"TensorFlow version: {tf.__version__}")

# Verify that tf.keras is really backed by Keras 2.x.
# Probing for tf.keras.layers.Dense cannot distinguish the versions (that
# attribute exists in Keras 3 as well), so inspect the reported version
# string instead. A missing attribute is treated as "not confirmed".
tf_keras_version = getattr(tf.keras, '__version__', 'unknown')
print(f"Keras version: {tf_keras_version}")
if str(tf_keras_version).startswith('2.'):
    print('‚úÖ Keras 2.x confirmed')
else:
    print('‚ö†Ô∏è Mungkin masih menggunakan Keras 3.x, restart runtime!')

In [None]:
# === Cell 4: Load Dataset ===
# OPTION 1: Upload directly to Colab (RECOMMENDED)
from google.colab import files
print('Upload file PhiUSIIL_Phishing_URL_Dataset.csv:')
uploaded = files.upload()
# Only the first uploaded entry is read; any additional uploads are ignored.
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

# OPTION 2: Google Drive (uncomment to use Drive)
# from google.colab import drive
# drive.mount('/content/drive')
# df = pd.read_csv('/content/drive/MyDrive/PhiUSIIL_Phishing_URL_Dataset.csv')

print(f"\nDataset shape: {df.shape}")
print(f"Label distribution:")
# NOTE(review): which class the value 1 in 'label' denotes is not visible
# here - confirm against the dataset documentation before reading the metrics.
print(df['label'].value_counts())

In [None]:
# === Cell 5: Feature Extraction Functions (EXACT COPY dari sandbox.js) ===

def count_char(s, char):
    """Return how many non-overlapping times `char` occurs in `s`."""
    occurrences = s.count(char)
    return occurrences

def calculate_entropy(s):
    """Return the Shannon entropy (in bits) of the symbols in `s`.

    An empty (falsy) input yields 0.
    """
    if not s:
        return 0

    total = len(s)

    # Tally symbols in first-occurrence order.
    counts = dict.fromkeys(s, 0)
    for symbol in s:
        counts[symbol] += 1

    # Accumulate -p * log2(p) one symbol at a time.
    entropy = 0
    for occurrences in counts.values():
        probability = occurrences / total
        entropy -= probability * math.log2(probability)
    return entropy

# TLDs that extract_features_aligned scores with the higher TLDLegitimateProb
# value (0.9); every other TLD gets 0.3.
COMMON_TLDS = ['com', 'org', 'net', 'edu', 'gov', 'io', 'co', 'id']

def extract_features_aligned(url_str):
    """Compute the 22 URL-derived features for a single URL.

    The feature order matches the numbered comments below and must stay in
    sync with the column names this list is later labelled with. According to
    the notebook header the formulas are meant to mirror sandbox.js, so the
    heuristics (fixed 80/50 similarity, 0.9/0.3 TLD score, ...) are kept as-is.

    Args:
        url_str: The URL to analyse.

    Returns:
        A list of 22 numbers. If the URL cannot be parsed, a list of 22 zeros.
    """
    try:
        parsed = urlparse(url_str)
        hostname = parsed.hostname or ''
    except Exception:
        # Best effort: an unparsable value becomes an all-zero row.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate, so a long extraction
        # loop can still be interrupted.
        return [0] * 22

    features = []

    # 1. URLLength
    url_length = len(url_str)
    features.append(url_length)

    # 2. DomainLength
    domain_length = len(hostname)
    features.append(domain_length)

    # 3. IsDomainIP (dotted-quad pattern only; octet ranges are not validated)
    is_ip = 1 if re.match(r'^(?:\d{1,3}\.){3}\d{1,3}$', hostname) else 0
    features.append(is_ip)

    # 4. URLSimilarityIndex (sandbox.js logic: fixed heuristic, not a real similarity)
    url_similarity = 80 if (url_length < 50 and domain_length < 20) else 50
    features.append(url_similarity)

    # 5. CharContinuationRate: longest run of one repeated character / URL length
    max_seq = 0
    curr_seq = 1
    for i in range(1, len(url_str)):
        if url_str[i] == url_str[i-1]:
            curr_seq += 1
        else:
            max_seq = max(max_seq, curr_seq)
            curr_seq = 1
    max_seq = max(max_seq, curr_seq)  # account for a run that ends the string
    char_continuation_rate = max_seq / url_length if url_length > 0 else 0
    features.append(char_continuation_rate)

    # 6. TLDLegitimateProb
    tld = hostname.split('.')[-1] if hostname else ''
    tld_prob = 0.9 if tld in COMMON_TLDS else 0.3
    features.append(tld_prob)

    # 7. URLCharProb: inverse of (entropy + 1), so it falls as entropy rises
    url_entropy = calculate_entropy(url_str)
    url_char_prob = 1.0 / (url_entropy + 1)
    features.append(url_char_prob)

    # 8. TLDLength
    features.append(len(tld))

    # 9. NoOfSubDomain: hostname labels beyond the last two
    parts = hostname.split('.') if hostname else []
    num_subdomains = max(0, len(parts) - 2)
    features.append(num_subdomains)

    # 10-11. HasObfuscation / NoOfObfuscatedChar: percent-encoded bytes (%XX).
    # One scan is enough; the flag is simply "count is non-zero".
    num_obfuscated = len(re.findall(r'%[0-9A-Fa-f]{2}', url_str))
    has_obfuscation = 1 if num_obfuscated else 0
    features.append(has_obfuscation)
    features.append(num_obfuscated)

    # 12. ObfuscationRatio
    obfuscation_ratio = num_obfuscated / url_length if url_length > 0 else 0
    features.append(obfuscation_ratio)

    # 13. NoOfLettersInURL
    num_letters = len(re.findall(r'[a-zA-Z]', url_str))
    features.append(num_letters)

    # 14. LetterRatioInURL
    letter_ratio = num_letters / url_length if url_length > 0 else 0
    features.append(letter_ratio)

    # 15. NoOfDigitsInURL
    num_digits = len(re.findall(r'\d', url_str))
    features.append(num_digits)

    # 16. DigitRatioInURL
    digit_ratio = num_digits / url_length if url_length > 0 else 0
    features.append(digit_ratio)

    # 17. NoOfEqualsInURL
    features.append(count_char(url_str, '='))

    # 18. NoOfQMarkInURL
    features.append(count_char(url_str, '?'))

    # 19. NoOfAmpersandInURL
    features.append(count_char(url_str, '&'))

    # 20. NoOfOtherSpecialCharsInURL: everything that is not alphanumeric or
    # whitespace (this also counts '=', '?' and '&' again)
    num_special = len(re.findall(r'[^a-zA-Z0-9\s]', url_str))
    features.append(num_special)

    # 21. SpecialCharRatioInURL
    special_ratio = num_special / url_length if url_length > 0 else 0
    features.append(special_ratio)

    # 22. IsHTTPS (case-sensitive prefix check on the raw string)
    is_https = 1 if url_str.startswith('https://') else 0
    features.append(is_https)

    return features

print("‚úÖ Feature extraction functions defined.")

In [None]:
# === Cell 6: Define Feature Names ===
# Labels for the 22 values returned by extract_features_aligned, in order.
# NOTE(review): 'Degits' / 'Spacial' look misspelled but presumably match the
# dataset's own column names - verify before "fixing" them.
URL_FEATURES = [
    'URLLength',
    'DomainLength',
    'IsDomainIP',
    'URLSimilarityIndex',
    'CharContinuationRate',
    'TLDLegitimateProb',
    'URLCharProb',
    'TLDLength',
    'NoOfSubDomain',
    'HasObfuscation',
    'NoOfObfuscatedChar',
    'ObfuscationRatio',
    'NoOfLettersInURL',
    'LetterRatioInURL',
    'NoOfDegitsInURL',
    'DegitRatioInURL',
    'NoOfEqualsInURL',
    'NoOfQMarkInURL',
    'NoOfAmpersandInURL',
    'NoOfOtherSpecialCharsInURL',
    'SpacialCharRatioInURL',
    'IsHTTPS',
]

# Page-content columns that are read straight from the dataset.
CONTENT_FEATURES = [
    'LineOfCode',
    'LargestLineLength',
    'HasTitle',
    'DomainTitleMatchScore',
    'URLTitleMatchScore',
    'HasFavicon',
    'Robots',
    'IsResponsive',
    'NoOfURLRedirect',
    'NoOfSelfRedirect',
    'HasDescription',
    'NoOfPopup',
    'NoOfiFrame',
    'HasExternalFormSubmit',
    'HasSocialNet',
    'HasSubmitButton',
    'HasHiddenFields',
    'HasPasswordField',
    'Bank',
    'Pay',
    'Crypto',
    'HasCopyrightInfo',
    'NoOfImage',
    'NoOfCSS',
    'NoOfJS',
    'NoOfSelfRef',
    'NoOfEmptyRef',
    'NoOfExternalRef',
]

# Final model input order: URL features first, then content features.
ALL_FEATURES = [*URL_FEATURES, *CONTENT_FEATURES]
print(f"Total features: {len(ALL_FEATURES)}")

In [None]:
# === Cell 7: Extract URL Features ===
print("Computing URL features with aligned extraction...")

# Iterate over the 'URL' column only. DataFrame.iterrows() builds a complete
# Series for every row, which is needlessly slow when one column is read.
# `idx` is still the DataFrame index label, so the progress output is unchanged.
url_features_list = []
for idx, url in df['URL'].items():
    url_features_list.append(extract_features_aligned(url))
    if idx % 10000 == 0:
        print(f"Processed {idx}/{len(df)} rows...")

# One row per URL, columns labelled in extraction order.
url_features_df = pd.DataFrame(url_features_list, columns=URL_FEATURES)
print(f"\n‚úÖ URL features computed: {url_features_df.shape}")

In [None]:
# === Cell 8: Combine Features ===
content_df = df[CONTENT_FEATURES].copy()

# Align both frames on a fresh 0..n-1 index before joining them column-wise,
# then replace any missing values with 0.
X_combined = pd.concat(
    [frame.reset_index(drop=True) for frame in (url_features_df, content_df)],
    axis=1,
).fillna(0)

X = X_combined.values
y = df['label'].values

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Class distribution: 0={(y == 0).sum()}, 1={(y == 1).sum()}")

In [None]:
# === Cell 9: Train-Test Split ===
# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

for split_label, split_data in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_data.shape}")

In [None]:
# === Cell 10: StandardScaler ===
# Fit on the training split only; the test split reuses those statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the scaling parameters together with the feature order.
# Note that the 'std' entry stores scaler.scale_.
scaler_params = dict(
    mean=scaler.mean_.tolist(),
    std=scaler.scale_.tolist(),
    feature_names=ALL_FEATURES,
)

with open('scaler_params.json', 'w') as f:
    f.write(json.dumps(scaler_params, indent=2))

print("‚úÖ Scaler parameters saved")

In [None]:
# === Cell 11: Train Logistic Regression ===
# fit() returns the estimator itself, so construction and training are chained.
sklearn_lr = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard predictions plus the probability of class 1 for the held-out split.
y_pred = sklearn_lr.predict(X_test_scaled)
y_proba = sklearn_lr.predict_proba(X_test_scaled)[:, 1]

print(f"\n=== Model Evaluation ===")
print(f"Accuracy  : {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision : {precision_score(y_test, y_pred):.5f}")
print(f"Recall    : {recall_score(y_test, y_pred):.5f}")
print(f"F1-Score  : {f1_score(y_test, y_pred):.5f}")
print(f"ROC-AUC   : {roc_auc_score(y_test, y_proba):.5f}")

In [None]:
# === Cell 12: Build Keras 2.x Model ===
# IMPORTANT: this uses the Keras 2.x API (InputLayer(input_shape=...)).

# Take the input width from the fitted classifier instead of hard-coding 50,
# so the network always matches the weights copied into it below.
n_features = sklearn_lr.coef_.shape[1]

keras_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(n_features,)),
    tf.keras.layers.Dense(1, activation='sigmoid', name='dense')
])

# Transfer weights from sklearn: a single sigmoid unit with the same kernel
# and bias reproduces the logistic-regression probability.
sklearn_weights = sklearn_lr.coef_.T   # transposed to (n_features, 1) for the Dense kernel
sklearn_bias = sklearn_lr.intercept_
# Look the layer up by name so the assignment does not depend on whether the
# InputLayer is included in `keras_model.layers`.
keras_model.get_layer('dense').set_weights([sklearn_weights, sklearn_bias])

# No Keras training happens afterwards; compile() is called before export.
keras_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

keras_model.summary()
print("\n‚úÖ Keras 2.x model created")

In [None]:
# === Cell 13: Verify Keras Model ===
# Threshold the Keras probabilities at 0.5 and compare the resulting labels
# with the sklearn predictions on the same test split.
keras_proba = keras_model.predict(X_test_scaled)
y_pred_keras = (keras_proba > 0.5).astype(int).flatten()
match_rate = np.mean(y_pred_keras == y_pred)
print(f"Prediction match rate with sklearn: {match_rate*100:.2f}%")

In [None]:
# === Cell 14: Export to TensorFlow.js ===
import tensorflowjs as tfjs

# Make sure the output directory exists, then write the converted model into it.
os.makedirs('tfjs_model', exist_ok=True)
tfjs.converters.save_keras_model(keras_model, 'tfjs_model')

print("\n‚úÖ TensorFlow.js model saved")
print("\nFiles created:")
# List every exported file with its size on disk.
for f in os.listdir('tfjs_model'):
    size = os.stat(f'tfjs_model/{f}').st_size
    print(f"  - {f} ({size} bytes)")

In [None]:
# === Cell 15: Verify model.json format ===
with open('tfjs_model/model.json', 'r') as f:
    model_config = json.load(f)

# Report the Keras version recorded in the exported topology.
topology = model_config.get('modelTopology', {})
keras_version = topology.get('keras_version', 'unknown')
print(f"Keras version in export: {keras_version}")

# Inspect the first layer's config: 'batch_input_shape' is treated as the
# compatible (Keras 2.x) layout, 'batch_shape' as the Keras 3.x layout.
layers = topology.get('model_config', {}).get('config', {}).get('layers', [])
if layers:
    first_layer = layers[0]
    config = first_layer.get('config', {})
    if 'batch_input_shape' in config:
        print("‚úÖ Format compatible: batch_input_shape found")
    elif 'batch_shape' in config:
        print("‚ö†Ô∏è Keras 3.x format detected: batch_shape - RESTART RUNTIME!")
    else:
        print(f"Layer config: {config}")
else:
    # Without this branch an unexpected topology layout would print nothing,
    # which is easy to mistake for a passed check.
    print("WARNING: no layers found in model.json topology - format could not be verified")

In [None]:
# === Cell 16: Copy scaler and create zip ===
import shutil
# Put the scaler parameters next to the exported model files so that they end
# up in the same archive.
shutil.copy('scaler_params.json', 'tfjs_model/scaler_params.json')

# IPython shell escape (only works inside a notebook): zip the contents of
# tfjs_model/ into guardnet_model_v2.zip in the parent directory.
!cd tfjs_model && zip -r ../guardnet_model_v2.zip .

print("\n" + "="*50)
print("‚úÖ SEMUA FILE SIAP!")
print("="*50)
# List everything that is now in the export directory.
for f in os.listdir('tfjs_model'):
    print(f"  üìÑ {f}")

In [None]:
# === Cell 17: Download ===
from google.colab import files
# Hand the archive created in the previous cell to the Colab download helper.
files.download('guardnet_model_v2.zip')

# Follow-up instructions for installing the model into the extension.
print("\n" + "="*60)
print("LANGKAH SELANJUTNYA:")
print("="*60)
print("1. Extract guardnet_model_v2.zip")
print("2. Copy SEMUA file ke folder GuardNet/models/:")
print("   - model.json")
print("   - group1-shard1of1.bin")
print("   - scaler_params.json")
print("3. Reload extension di chrome://extensions")
print("4. Test!")