In [8]:
import pickle
import sys
# Put the parent directory on the import path; the later `from utils import ...`
# cells presumably rely on this to locate the project's `utils` package.
sys.path.append("..")

In [9]:
import pandas as pd

# `data` is the labelled URL table the word vectorizer is built from later on;
# `test_data` is the separate ISCXURL2016 file that the model is evaluated on.
train_csv, test_csv = "../data/data_processed.csv", "../data/ISCXURL2016/all_concated.csv"
data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [11]:
# Summary of the training frame: count / unique / top / freq for each column.
data.describe()

Unnamed: 0,url,label
count,420463,420464
unique,409943,2
top,103.234.36.75/rd927.ex,good
freq,27,344821


In [13]:
# Word-level vectorizer built from the URLs of the training frame.
from utils import vectorization

# Cast to unicode first so every entry (including missing values) is a string.
train_urls = data['url'].values.astype('U')
word_vectorizer = vectorization.get_word_vectorizer_v2(train_urls)

In [14]:
from utils.data_util import postpad_to

word_tokenizer = word_vectorizer.build_tokenizer()


def _encode_url(url):
    """Turn one URL into a list of vocabulary ids shifted by two.

    Tokens missing from the vocabulary map to 1 (-1 + 2); known tokens map to
    their vocabulary index + 2, which leaves id 0 unused by real tokens.
    """
    token_ids = []
    for token in word_tokenizer(url):
        token_ids.append(word_vectorizer.vocabulary_.get(token, -1) + 2)
    return token_ids


# postpad_to presumably pads (or trims) every id sequence to length 200 — TODO confirm.
test_data_x = postpad_to(test_data['url'].map(_encode_url), 200)

In [15]:
# Binary ground truth: 1 for rows labelled "bad", 0 for every other label.
test_data_y = (test_data["label"] == "bad").astype(int).tolist()

In [16]:
import tensorflow.keras as k

def create_conv_subnet(input_layer, conv_kernel_sizes, prefix=''):
    """Build parallel Conv1D branches over one input and reduce them to a dense feature vector.

    For every entry of ``conv_kernel_sizes`` a ``Conv1D`` (32 filters, ``'same'``
    padding, ReLU) followed by a default ``MaxPool1D`` is applied to
    ``input_layer``. The branch outputs are concatenated along axis 2, flattened,
    passed through a 0.5 dropout and projected by a 512-unit ReLU ``Dense`` layer.

    Args:
        input_layer: Keras tensor fed to every convolution branch.
        conv_kernel_sizes: Iterable of kernel sizes, one branch per entry. Each
            size is embedded in a layer name, so duplicate sizes would produce
            duplicate layer names.
        prefix: String prepended to the names of the named layers.

    Returns:
        The output tensor of the ``{prefix}_dense`` layer.

    Raises:
        ValueError: If ``conv_kernel_sizes`` is empty.
    """
    kernel_sizes = list(conv_kernel_sizes)
    if not kernel_sizes:
        raise ValueError('conv_kernel_sizes must contain at least one kernel size')

    convolutions = []
    for kernel_size in kernel_sizes:
        x = k.layers.Conv1D(
            filters=32,
            kernel_size=kernel_size,
            padding='same',
            activation='relu',
            name=f'{prefix}_conv_{kernel_size}'
        )(input_layer)
        x = k.layers.MaxPool1D()(x)
        convolutions.append(x)

    if len(convolutions) == 1:
        # Nothing to merge with a single branch; some Keras releases also
        # reject concatenate() on a one-element list, so skip the call.
        x = convolutions[0]
    else:
        x = k.layers.concatenate(convolutions, axis=2)
    x = k.layers.Flatten()(x)
    x = k.layers.Dropout(0.5, name=f'{prefix}_dropout')(x)
    x = k.layers.Dense(512, name=f'{prefix}_dense', activation='relu')(x)
    return x

def create_url_net(input_length, emb_dim, conv_kernel_sizes, vocab_size=None):
    """Assemble the word-level URL classifier as a Keras functional model.

    The network is: integer input named ``'word'`` -> ``Embedding`` ->
    ``create_conv_subnet(..., 'word')`` -> 128-unit ReLU ``Dense`` ->
    single-unit sigmoid ``Dense`` named ``'dense_comb_out'``.

    Args:
        input_length: Number of token ids per sample (the input shape).
        emb_dim: Width of each embedding vector.
        conv_kernel_sizes: Kernel sizes forwarded to ``create_conv_subnet``.
        vocab_size: Number of vocabulary entries. The embedding table gets
            ``vocab_size + 2`` rows, matching the ``+2`` id offset applied when
            URLs are encoded in this notebook. When omitted, the notebook-level
            global ``LW`` is used, which keeps existing three-argument calls
            working.

    Returns:
        An uncompiled ``k.models.Model`` with one input and one sigmoid output.
    """
    if vocab_size is None:
        # Backward-compatible fallback to the notebook global; LW must be
        # defined before the first call that omits vocab_size.
        vocab_size = LW

    word_input = k.layers.Input(shape=[input_length], name='word')

    # NOTE(review): mask_zero=True makes the embedding emit a mask, but the
    # downstream Conv1D layers are not mask-consuming — confirm it has an effect.
    x = create_conv_subnet(
        k.layers.Embedding(vocab_size + 2, emb_dim, mask_zero=True)(word_input),
        conv_kernel_sizes,
        'word'
    )

    x = k.layers.Dense(128, activation='relu', name='dense_1')(x)
    x = k.layers.Dense(1, activation='sigmoid', name='dense_comb_out')(x)

    model = k.models.Model(inputs=[word_input], outputs=[x])
    return model

# Vocabulary size; create_url_net reads this global to size its embedding table.
LW = len(word_vectorizer.vocabulary_)

# Architecture settings for the word-level CNN: 200 ids per URL, 16-d embeddings,
# convolution branches with kernel sizes 3 and 5.
url_net_config = dict(input_length=200, emb_dim=16, conv_kernel_sizes=[3, 5])
cnn_word_model = create_url_net(**url_net_config)

In [17]:
# Rebind cnn_word_model to the model stored at ./cnn_word_level; this replaces the
# network constructed in the previous cell (presumably the saved one was trained elsewhere).
cnn_word_model = k.models.load_model('./cnn_word_level')

In [18]:
# Score every encoded test URL; the array echoed below holds one value per row.
test_pred_y = cnn_word_model.predict(test_data_x)

In [19]:
# Echo the raw prediction array for a quick visual check.
test_pred_y

array([[0.0581749 ],
       [0.06142358],
       [0.08769293],
       ...,
       [0.18632708],
       [0.37794793],
       [0.86762923]], dtype=float32)

In [20]:
# Sanity check of the container type returned by predict().
type(test_pred_y)

numpy.ndarray

In [21]:
# Hard labels at the 0.5 cut-off: 1 where the score is strictly above 0.5, else 0.
test_pred_y_round = (test_pred_y > 0.5).astype(int).ravel().tolist()

In [22]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the labels produced at the 0.5 cut-off.
report_at_half = classification_report(test_data_y, test_pred_y_round)
print(report_at_half)

              precision    recall  f1-score   support

           0       0.23      0.63      0.33     35378
           1       0.80      0.42      0.55    129988

    accuracy                           0.46    165366
   macro avg       0.52      0.52      0.44    165366
weighted avg       0.68      0.46      0.50    165366



In [23]:
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, accuracy_score
import numpy as np

In [None]:
# ROC analysis on the raw scores, plus the area under that curve.
fpr, tpr, thresholds = roc_curve(test_data_y, test_pred_y)
auc_ = auc(fpr, tpr)

# Choose the cut-off where TPR - FPR (Youden's J statistic) is largest.
youden_j = tpr - fpr
best_threshold = thresholds[youden_j.argmax()]
print("best threshold: ", best_threshold)

In [None]:
import matplotlib.pyplot as plt

model_name = "CNN Word"
model_full_name = "CNN with Word Level Embedding"

# Draw through explicit figure/axes handles rather than the implicit pyplot state.
fig = plt.figure(1)
ax = fig.gca()
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal reference line
ax.plot(fpr, tpr, label=f'{model_name} (area = {auc_:.3f})')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title(f'ROC curve for {model_full_name}')
ax.legend(loc='best')

# Persist the figure before showing it; the file name is derived from model_name.
fig.savefig(f'../plots/{model_name.replace(" ", "_").lower()}_roc.pdf')
plt.show()

In [None]:
# Re-score at the ROC-derived operating point instead of a hand-copied constant, so the
# report follows `best_threshold` whenever the model or the data change.
# `>=` matches roc_curve, which treats a score at or above the threshold as positive.
# NOTE(review): the threshold was selected on this same test set, so the figures
# printed here are optimistic; confirm whether a held-out validation split should be used.
test_pred_y_round = (test_pred_y >= best_threshold).astype(int).ravel().tolist()
print(classification_report(test_data_y, test_pred_y_round))