In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import json 
import re
import random
random.seed(28)
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import keras
from keras.layers import *
from keras.models import Sequential
from tensorflow.keras.layers import *
from keras.callbacks import ModelCheckpoint

import tensorflow as tf
# from tensorflow.keras import layers
# from keras.models import load_model
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# tf.test.is_gpu_available(
#     cuda_only=False,
#     min_cuda_compute_capability=None
# )

In [None]:
def load_data(legit_path='/kaggle/input/phising/phising/legitimate_url.json',
              phishing_path='/kaggle/input/phising/phising/phishing_url.json'):
    """Load the legitimate and phishing URL files into one labelled DataFrame.

    Parameters
    ----------
    legit_path : str
        Path to the JSON file holding the legitimate URLs. Its parsed content
        is handed to ``pd.DataFrame(..., columns=['url'])``.
    phishing_path : str
        Path to the JSON file holding the phishing URLs (same treatment).

    Returns
    -------
    pandas.DataFrame
        Columns ``url`` and ``is_phishing`` (0 for rows read from
        ``legit_path``, 1 for rows read from ``phishing_path``), legitimate
        rows first. The two parts are concatenated without re-indexing, so
        index labels may repeat; use ``.iloc`` for positional access.
    """
    # Context managers guarantee the handles are closed, even if parsing fails.
    with open(legit_path, 'r') as legit_file:
        legit_data = json.load(legit_file)
    with open(phishing_path, 'r') as phishing_file:
        phishing_data = json.load(phishing_file)

    legit_df = pd.DataFrame(legit_data, columns=['url'])
    legit_df['is_phishing'] = 0

    phishing_df = pd.DataFrame(phishing_data, columns=['url'])
    phishing_df['is_phishing'] = 1

    return pd.concat([legit_df, phishing_df], axis=0)


In [None]:
# Labelled URL table: one row per URL, with columns `url` and `is_phishing`.
df_url = load_data()

In [None]:
# Number of rows per label (0 = legitimate, 1 = phishing).
df_url['is_phishing'].value_counts()

In [None]:
#df['url'].iloc[27777]

In [None]:
# Plain Python list of the URL column, used to build the character vocabulary.
urls = list(df_url['url'].values)

In [None]:
# Build the character vocabulary and find the longest URL.
#
# Index 0 is reserved for padding: the feature matrix is zero-initialised and
# URLs shorter than `max_url_seq_length` leave trailing zeros, so real
# characters must start at 1 to stay distinguishable from padding.
char2idx = dict()
max_url_seq_length = 0

for url in tqdm(urls):
    max_url_seq_length = max(max_url_seq_length, len(url))
    for c in url:
        if c not in char2idx:
            char2idx[c] = len(char2idx) + 1

# +1 so that an embedding table sized by `num_input_tokens` also has a row
# for the padding index 0 (valid ids are 0 .. len(char2idx)).
num_input_tokens = len(char2idx) + 1
idx2char = {idx: c for c, idx in char2idx.items()}

# Bundle everything needed to encode/decode URLs consistently later on.
config = dict()
config['num_input_tokens'] = num_input_tokens
config['char2idx'] = char2idx
config['idx2char'] = idx2char
config['max_url_seq_length'] = max_url_seq_length

In [None]:
import keras
data_size = df_url.shape[0]
# One row per URL, one column per character position. Token ids are small
# non-negative integers, so store them as int32 rather than NumPy's float64
# default: half the memory, and no float-to-int cast at the embedding lookup.
X = np.zeros(shape=(data_size, max_url_seq_length), dtype=np.int32)
# One-hot labels: column 0 = legitimate, column 1 = phishing.
Y = keras.utils.to_categorical(df_url['is_phishing'])


In [None]:
# Write each URL's character ids into its row of X; positions beyond the end
# of a URL keep the value X was initialised with.
for row, url in tqdm(enumerate(df_url['url']), total=data_size):
    X[row, :len(url)] = [char2idx[ch] for ch in url]
    

In [None]:
# Show the encoded feature matrix as a quick sanity check.
X

In [None]:
# Re-bind the vocabulary settings from `config` so the cells below use the
# stored values.
num_input_tokens, char2idx, idx2char, max_url_seq_length = (
    config[key]
    for key in ('num_input_tokens', 'char2idx', 'idx2char', 'max_url_seq_length')
)

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the labels, with a
# fixed seed so the split is reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

#### Bi-LSTM Model

In [4]:
# Width of each character embedding vector (the Embedding layer's output_dim
# in make_bidirectional_lstm_model).
EMBEDDING_SIZE = 100

In [None]:
def make_bidirectional_lstm_model(num_input_tokens, max_len):
    """Build the (uncompiled) character-level bidirectional LSTM classifier.

    Parameters
    ----------
    num_input_tokens : int
        Size of the embedding table (``input_dim`` of the Embedding layer);
        must be larger than the largest token id fed to the model.
    max_len : int
        Number of character positions per input sequence.

    Returns
    -------
    tf.keras.Sequential
        Embedding -> SpatialDropout1D(0.2) -> Bidirectional(LSTM(64)) ->
        Dense(2, softmax). The caller is responsible for ``compile``.
    """
    # Reference the layers through `tf.keras` explicitly: the notebook has no
    # binding for a bare `layers` name on a fresh kernel.
    keras_layers = tf.keras.layers

    model = tf.keras.Sequential()
    # An explicit Input fixes the sequence length so the model is built
    # immediately (e.g. for model.summary()).
    model.add(tf.keras.Input(shape=(max_len,)))
    model.add(keras_layers.Embedding(input_dim=num_input_tokens,
                                     output_dim=EMBEDDING_SIZE))
    model.add(keras_layers.SpatialDropout1D(0.2))
    # The standard LSTM layer picks the cuDNN kernel automatically on GPU with
    # these default arguments and still runs on CPU, unlike the GPU-only
    # compat.v1 CuDNNLSTM.
    model.add(keras_layers.Bidirectional(keras_layers.LSTM(units=64)))
    model.add(keras_layers.Dense(2, activation='softmax'))
    return model


In [None]:
# `num_input_tokens` is already bound from the vocabulary step; only the
# sequence length needs an alias for the model builder.
max_len = max_url_seq_length

In [None]:
# Instantiate the Bi-LSTM classifier for this vocabulary size and URL length.
model = make_bidirectional_lstm_model(num_input_tokens, max_len)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
lr_opt = 1e-3
epochs = 5

# `lr=` and `decay=` are deprecated Adam arguments that recent Keras releases
# reject. InverseTimeDecay with decay_steps=1 reproduces the legacy `decay`
# behaviour: lr_t = lr_opt / (1 + decay_rate * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=lr_opt,
    decay_steps=1,
    decay_rate=lr_opt / epochs,
)
opt = Adam(learning_rate=lr_schedule)

In [None]:
# Compile with the optimizer configured above (`opt`) instead of the string
# 'adam', which would silently discard the chosen learning rate and decay.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the model architecture to the Kaggle working directory.
# NOTE(review): plot_model typically needs pydot and graphviz installed — confirm
# they are available in the target environment.
plot_model(model, to_file='/kaggle/working/model.png')

In [None]:
# Training hyper-parameters passed to model.fit.
batch_size = 128
# Note: this rebinds `epochs`, which was set to 5 when the optimizer was configured.
epochs = 2

In [None]:
# Train the model; the held-out split is passed as validation data so its
# loss and accuracy are reported after every epoch.
history = model.fit(
    Xtrain,
    Ytrain,
    validation_data=(Xtest, Ytest),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

In [None]:
#model.save("/kaggle/working/lstm_model.h5")

In [None]:
#Plotting
# plt.figure()
# plt.plot(np.arange(0, 2), history.history["loss"], label="train_loss")
# plt.plot(np.arange(0, 2), history.history["val_loss"], label="val_loss")
# plt.plot(np.arange(0, 2), history.history["accuracy"], label="train_acc")
# plt.plot(np.arange(0, 2), history.history["val_accuracy"], label="val_acc")
# plt.title("Training and Validation Loss / Accuracy on the Phishing URL Dataset")
# plt.xlabel("No of epochs")
# plt.ylabel("Loss & Accuracy")
# plt.legend(loc="lower left")

In [None]:
# Predicted class id per test row: index of the larger softmax output.
predIdxs = np.argmax(model.predict(Xtest), axis=1)

In [None]:
# Collapse the one-hot test labels to class ids for the metric cells below.
# Guarded so the cell can be re-run: a second argmax over axis=1 on the
# already 1-D labels would raise.
if np.ndim(Ytest) > 1:
    Ytest = np.argmax(Ytest, axis=1)

In [None]:
import sklearn

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(
    Ytest,
    predIdxs,
    target_names=['no_phishing', 'phishing'],
)
print(report)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
print(sklearn.metrics.confusion_matrix(Ytest, predIdxs))

In [None]:
from sklearn.metrics import roc_curve
