In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from joblib import dump
import os

# Load the dataset
df = pd.read_csv("dataset_full.csv")

# Separate features and target
X = df.drop(columns=["phishing"])
y = df["phishing"]

# Normalize the feature set
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Step 1: Train XGBoost for feature importance & transformation
xgb_model = XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict with XGBoost (used as intermediate features)
X_train_xgb = xgb_model.apply(X_train)
X_test_xgb = xgb_model.apply(X_test)

# Reshape the output for LSTM input: (samples, timesteps, features)
X_train_lstm = np.reshape(X_train_xgb, (X_train_xgb.shape[0], X_train_xgb.shape[1], 1))
X_test_lstm = np.reshape(X_test_xgb, (X_test_xgb.shape[0], X_test_xgb.shape[1], 1))

# Step 2: LSTM model for final classification
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], 1), return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(32, activation='relu'))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Train the LSTM model
lstm_model.fit(X_train_lstm, y_train, epochs=5, batch_size=64, validation_data=(X_test_lstm, y_test))

# Evaluate
y_pred = (lstm_model.predict(X_test_lstm) > 0.5).astype(int)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Save models
os.makedirs("saved_models", exist_ok=True)
xgb_model_path = "saved_models/xgboost_model.joblib"
lstm_model_path = "saved_models/lstm_model.h5"
scaler_path = "saved_models/scaler.joblib"

dump(xgb_model, xgb_model_path)
dump(scaler, scaler_path)
lstm_model.save(lstm_model_path)

print(f"\n✅ Models saved successfully to 'saved_models/' folder.")


Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


Epoch 1/5
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 39ms/step - accuracy: 0.7454 - loss: 0.4267 - val_accuracy: 0.8222 - val_loss: 0.3331
Epoch 2/5
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 37ms/step - accuracy: 0.8661 - loss: 0.2837 - val_accuracy: 0.8856 - val_loss: 0.2439
Epoch 3/5
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 37ms/step - accuracy: 0.8976 - loss: 0.2323 - val_accuracy: 0.9110 - val_loss: 0.2041
Epoch 4/5
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 35ms/step - accuracy: 0.9032 - loss: 0.2236 - val_accuracy: 0.8866 - val_loss: 0.2520
Epoch 5/5
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 36ms/step - accuracy: 0.9063 - loss: 0.2125 - val_accuracy: 0.9092 - val_loss: 0.2108
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step





Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.93     11600
           1       0.91      0.82      0.86      6130

    accuracy                           0.91     17730
   macro avg       0.91      0.89      0.90     17730
weighted avg       0.91      0.91      0.91     17730

Accuracy Score: 0.9091934574168077

✅ Models saved successfully to 'saved_models/' folder.
