In [27]:

import os
import json
from pathlib import Path
import pandas as pd
import numpy as np

from string import printable
from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from keras.preprocessing import sequence
from keras.models import model_from_json
from keras.regularizers import l2

In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, Activation, Lambda, Flatten, Input, ELU, LSTM, Embedding
from tensorflow.keras.layers import Conv2D, MaxPooling1D, MaxPooling2D, BatchNormalization, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K


In [29]:
def evaluate_result(y_true, y_pre):
    accuracy = accuracy_score(y_true, y_pre)
    precision = precision_score(y_true, y_pre)
    recall = recall_score(y_true, y_pre)
    f1 = f1_score(y_true, y_pre)
    auc = roc_auc_score(y_true, y_pre)

    print("Accuracy Score is: ", accuracy)
    print("Precision Score is :", precision)
    print("Recall Score is :", recall)
    print("F1 Score: ", f1)
    print("AUC Score: ", auc)

In [30]:
def to_y(labels):
    y = []
    for i in range(len(labels)):
        label = labels[i]
        if label < 0.5:
            y.append(0)

        else:
            y.append(1)

    return y

In [49]:
from keras.layers import Input, Embedding, Dropout, Convolution1D, ELU, Lambda, Dense, BatchNormalization, concatenate
from keras.optimizers import Adam
from keras import regularizers
import keras.backend as K
from keras.models import Model


def sum_1d(X):
    return tf.reduce_sum(X, axis=1)


def get_conv_layer(emb, kernel_size=5, filters=256):
    """
    Создаёт свёрточный слой с последующим объединением по осям и dropout.
    
    Параметры:
        emb (tensor): Входной тензор.
        kernel_size (int): Размер ядра свёртки.
        filters (int): Количество фильтров в свёрточном слое.
    
    Возвращает:
        tensor: Выходной тензор после применения свёртки и Dropout.
    """
    conv = Convolution1D(kernel_size=kernel_size, filters=filters, padding='same')(emb)
    conv = ELU()(conv)
    conv = Lambda(sum_1d, output_shape=(filters,))(conv)
    conv = Dropout(0.5)(conv)
    return conv


def build_convfully_model(max_len=75, emb_dim=32, max_vocab_len=100, reg_strength=1e-4):
    """
    Создаёт и возвращает модель ConvFully.
    
    Параметры:
        max_len (int): Максимальная длина последовательности.
        emb_dim (int): Размерность слоя Embedding.
        max_vocab_len (int): Максимальное количество токенов в словаре.
        reg_strength (float): Сила регуляризации.

    Возвращает:
        keras.Model: Скомпилированная модель ConvFully.
    """
    W_reg = regularizers.l2(reg_strength)

    # Входной слой
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding слой
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Свёрточные слои с различными размерами ядер
    conv1 = get_conv_layer(emb, kernel_size=2, filters=256)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=256)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=256)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=256)

    # Объединение всех свёрточных слоёв
    merged = concatenate([conv1, conv2, conv3, conv4], axis=1)

    # Полносвязные слои
    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    hidden2 = BatchNormalization()(hidden2)
    hidden2 = Dropout(0.5)(hidden2)

    # Выходной слой
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    # Создание модели
    model = Model(inputs=[main_input], outputs=[output])

    # Оптимизатор
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [32]:
def build_convlstm_model(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, reg_strength=1e-4):
    """
    Создаёт и возвращает модель ConvLSTM.

    Параметры:
        max_len (int): Максимальная длина последовательности.
        emb_dim (int): Размерность слоя Embedding.
        max_vocab_len (int): Максимальное количество токенов в словаре.
        lstm_output_size (int): Размер выходного слоя LSTM.
        reg_strength (float): Сила регуляризации.

    Возвращает:
        keras.Model: Скомпилированная модель ConvLSTM.
    """
    # Регуляризатор
    W_reg = l2(reg_strength)

    # Входной слой
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding слой
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Сверточный слой
    conv = Convolution1D(kernel_size=5, filters=256, padding='same')(emb)
    conv = ELU()(conv)

    # MaxPooling слой
    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM слой
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    # Выходной слой
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Создание модели
    model = Model(inputs=[main_input], outputs=[output])

    # Оптимизатор
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [33]:
def build_simple_lstm_model(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, reg_strength=1e-4):
    """
    Создаёт и возвращает модель LSTM.
    
    Параметры:
        max_len (int): Максимальная длина последовательности.
        emb_dim (int): Размерность слоя Embedding.
        max_vocab_len (int): Максимальное количество токенов в словаре.
        lstm_output_size (int): Размер выходного слоя LSTM.
        reg_strength (float): Сила регуляризации.

    Возвращает:
        keras.Model: Скомпилированная модель LSTM.
    """
    # Регуляризатор
    W_reg = l2(reg_strength)

    # Входной слой
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding слой
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.2)(emb)

    # LSTM слой
    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)

    # Выходной слой
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Создание модели
    model = Model(inputs=[main_input], outputs=[output])

    # Оптимизатор
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def print_layers_dims(model):
    l_layers = model.layers
    for i in range(len(l_layers)):
        print(l_layers[i])
        print('Input Shape: ', l_layers[i].input_shape,
              'Output Shape: ', l_layers[i].output_shape)

In [None]:
def save_model(model, fileModelJSON, fileWeights):

    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)

    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)


def load_model(fileModelJSON, fileWeights):
    with open(fileModelJSON, 'r') as f:
        model_json = json.load(f)
        model = model_from_json(model_json)

    model.load_weights(fileWeights)
    return model

In [None]:
import kagglehub

path = kagglehub.dataset_download("sid321axn/malicious-urls-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Ксения\.cache\kagglehub\datasets\sid321axn\malicious-urls-dataset\versions\1


In [37]:
data_file_path = os.path.join("C:/Users/Ксения/.cache/kagglehub/datasets/sid321axn/malicious-urls-dataset/versions/1", 'malicious_phish.csv')
df = pd.read_csv(data_file_path)
df.head(10)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign
7,yourbittorrent.com/?q=anthony-hamilton-soulife,benign
8,http://www.pashminaonline.com/pure-pashminas,defacement
9,allmusic.com/album/crazy-from-the-heat-r16990,benign


In [38]:
df['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [39]:
type_to_label = {
    'benign': 0,       # Not Malicious
    'defacement': 1,   # Malicious
    'phishing': 1,     # Malicious
    'malware': 1       # Malicious
}

df['isMalicious'] = df['type'].map(type_to_label)

df['isMalicious'].value_counts()

isMalicious
0    428103
1    223088
Name: count, dtype: int64

In [40]:
url_lengths = df['url'].str.len()

min_length = url_lengths.min()
mean_length = url_lengths.mean()
max_length = url_lengths.max()

print(f"Минимальная длина URL: {min_length}")
print(f"Средняя длина URL: {mean_length:.2f}")
print(f"Максимальная длина URL: {max_length}")

Минимальная длина URL: 1
Средняя длина URL: 60.16
Максимальная длина URL: 2175


In [None]:
# Преобразуем необработанную строку URL в список листов, где символы, содержащиеся в printable хранятся в закодированном виде как целое число
url_int_tokens = [
    [printable.index(x) for x in url if x in printable] for url in df.url]

# Обрезаем строку URL до max_len или добиваем нулями, если короче
max_len = 75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape,
        'Vector dimension of target: ', target.shape)

X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X, target, test_size=0.25, random_state=42)

Matrix dimensions of X:  (651191, 75) Vector dimension of target:  (651191,)


In [42]:
epochs_num = 10
batch_size = 32

In [46]:
model_name = "simple_lstm"
model = build_simple_lstm_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

# print_layers_dims(model)
# save model
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
save_model(model, json_file_path, weight_file_path)

Epoch 1/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 17ms/step - accuracy: 0.8349 - loss: 0.3987
Epoch 2/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 18ms/step - accuracy: 0.8921 - loss: 0.2684
Epoch 3/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 19ms/step - accuracy: 0.9106 - loss: 0.2372
Epoch 4/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 19ms/step - accuracy: 0.9264 - loss: 0.2070
Epoch 5/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 19ms/step - accuracy: 0.9350 - loss: 0.1835
Epoch 6/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 20ms/step - accuracy: 0.9384 - loss: 0.1735
Epoch 7/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 20ms/step - accuracy: 0.9402 - loss: 0.1678
Epoch 8/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 20ms/step - accuracy: 0.9424

FileNotFoundError: [Errno 2] No such file or directory: 'cache/malicious_url\\simple_lstm.json'

In [47]:
# model_name = "simple_lstm"
# json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
# weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
# model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = to_y(y_pred)
# print(pred)
#print(classification_report(target_test, pred, digits=5))
evaluate_result(target_test, pred)

[1m5088/5088[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5ms/step
Accuracy Score is:  0.9474317866312854
Precision Score is : 0.9600940346752865
Recall Score is : 0.8825658664841795
F1 Score:  0.9196989884962561
AUC Score:  0.9317881118118537


In [50]:
model_name = "conv_fully"

model = build_convfully_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

pred = model.predict(X_test)
print(classification_report(target_test, pred, digits=5))

# print_layers_dims(model)
# save model
# json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
# weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
# save_model(model, json_file_path, weight_file_path)

Epoch 1/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 24ms/step - accuracy: 0.8070 - loss: 0.4668
Epoch 2/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m408s[0m 27ms/step - accuracy: 0.9057 - loss: 0.2516
Epoch 3/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 29ms/step - accuracy: 0.9159 - loss: 0.2240
Epoch 4/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 30ms/step - accuracy: 0.9212 - loss: 0.2124
Epoch 5/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m466s[0m 31ms/step - accuracy: 0.9247 - loss: 0.2039
Epoch 6/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 32ms/step - accuracy: 0.9275 - loss: 0.1976
Epoch 7/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 32ms/step - accuracy: 0.9301 - loss: 0.1925
Epoch 8/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m491s[0m 32ms/step - accuracy: 0.9319

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [51]:
# model_name = "conv_fully"
# json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
# weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
# model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = to_y(y_pred)
# print(pred)
# print(classification_report(target_test, pred, digits=5))
evaluate_result(target_test, pred)

[1m5088/5088[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 7ms/step
Accuracy Score is:  0.9449440410815858
Precision Score is : 0.950139200371201
Recall Score is : 0.8850330457958905
F1 Score:  0.9164312420165404
AUC Score:  0.9304953425009993


In [52]:
model_name = "conv_lstm"

model = build_convlstm_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

pred = model.predict(X_test)
print(classification_report(target_test, pred, digits=5))

# print_layers_dims(model)
# save model
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
save_model(model, json_file_path, weight_file_path)

Epoch 1/10




[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 13ms/step - accuracy: 0.8525 - loss: 0.3527
Epoch 2/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 14ms/step - accuracy: 0.9247 - loss: 0.2041
Epoch 3/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 13ms/step - accuracy: 0.9354 - loss: 0.1770
Epoch 4/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 14ms/step - accuracy: 0.9433 - loss: 0.1601
Epoch 5/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 13ms/step - accuracy: 0.9476 - loss: 0.1499
Epoch 6/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 13ms/step - accuracy: 0.9503 - loss: 0.1425
Epoch 7/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 13ms/step - accuracy: 0.9519 - loss: 0.1390
Epoch 8/10
[1m15263/15263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 13ms/step - accuracy: 0.9534 - loss: 0.

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [53]:
# model_name = "conv_lstm"
# json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
# weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
# model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = to_y(y_pred)
# print(pred)
# print(classification_report(target_test, pred, digits=5))
evaluate_result(target_test, pred)

[1m5088/5088[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step
Accuracy Score is:  0.9636236317399477
Precision Score is : 0.9608859654012673
Recall Score is : 0.9312611428262709
F1 Score:  0.9458416402977704
AUC Score:  0.9558187898173343
