In [1]:
# Step 1: Download dataset using kagglehub
import kagglehub

path = kagglehub.dataset_download("sid321axn/malicious-urls-dataset")
print("Dataset downloaded at:", path)

# Step 2: Load and preprocess data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, GlobalMaxPooling1D

df = pd.read_csv(f"{path}/malicious_phish.csv")
df.dropna(inplace=True)

X = df['url']
y = df['type']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Step 3: Build and train LSTM model
model_lstm = Sequential([
    Embedding(5000, 64, input_length=100),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])
model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Step 4: Build and train CNN model
model_cnn = Sequential([
    Embedding(5000, 64, input_length=100),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])
model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cnn.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Step 5: Evaluation
print("\nLSTM Evaluation:")
y_pred_lstm = model_lstm.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred_lstm, target_names=le.classes_))

print("\nCNN Evaluation:")
y_pred_cnn = model_cnn.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred_cnn, target_names=le.classes_))


Downloading from https://www.kaggle.com/api/v1/datasets/download/sid321axn/malicious-urls-dataset?dataset_version_number=1...


100%|██████████| 16.9M/16.9M [00:00<00:00, 42.1MB/s]

Extracting files...





Dataset downloaded at: /root/.cache/kagglehub/datasets/sid321axn/malicious-urls-dataset/versions/1




Epoch 1/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1303s[0m 80ms/step - accuracy: 0.9222 - loss: 0.2115 - val_accuracy: 0.9568 - val_loss: 0.1164
Epoch 2/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1187s[0m 73ms/step - accuracy: 0.9593 - loss: 0.1085 - val_accuracy: 0.9599 - val_loss: 0.1072
Epoch 3/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1231s[0m 73ms/step - accuracy: 0.9632 - loss: 0.0963 - val_accuracy: 0.9614 - val_loss: 0.1025
Epoch 4/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1286s[0m 79ms/step - accuracy: 0.9651 - loss: 0.0907 - val_accuracy: 0.9621 - val_loss: 0.1010
Epoch 5/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1257s[0m 77ms/step - accuracy: 0.9662 - loss: 0.0873 - val_accuracy: 0.9622 - val_loss: 0.1028
Epoch 1/5
[1m16280/16280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m521s[0m 32ms/step - accuracy: 0.9332 - loss: 0.1817 - val_accuracy: 0.9582 - val_