In [1]:
import pandas as pd 
import numpy as np
import os
from sklearn.model_selection import train_test_split
from string import printable 
from keras.utils import pad_sequences
import tensorflow as tf
import json
from tensorflow.keras.models import model_from_json
from pathlib import Path
import tensorflow as tf
from keras.models import Model
from keras import regularizers, Sequential
from keras.layers import Dense, Dropout, Activation, Lambda, Flatten, Input, ELU, LSTM, Embedding, BatchNormalization, Conv1D, concatenate, MaxPooling1D
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras.utils import np_utils
from keras import backend as K
from sklearn.preprocessing import LabelEncoder

In [5]:
def read_data():
  df = pd.read_csv("Phishing_dataset.csv")
  df.drop(df.columns[0], axis=1, inplace=True)
  url_int_tokens = [
      [printable.index(x) + 1 for x in url if x in printable] for url in df.iloc[:, 0]
  ]

  max_len = 75
  X = pad_sequences(url_int_tokens, maxlen=max_len)
  le1 = LabelEncoder()

  df['Label'] = le1.fit_transform(df['Label'])
  target = np.array(df['Label'])
  x_train, x_test, target_train, target_test = train_test_split(X, target, test_size=0.25, random_state=42)

  return x_train, x_test, target_train, target_test


In [6]:
x_train, x_test, target_train, target_test = read_data()

In [8]:
max_len = 75
emb_dim = 32
max_vocab_len = 101
lstm_output_size = 32
W_reg = regularizers.l2(1e-4)
epochs_num = 10
batch_size = 32

In [9]:
def save_model(model, fileModelJSON, fileWeights):
  if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
  json_string = model.to_json()
  with open(fileModelJSON, 'w') as f:
      json.dump(json_string, f)

  if Path(fileWeights).is_file():
      os.remove(fileWeights)
  model.save_weights(fileWeights)


def load_model(fileModelJSON, fileWeights):
    with open(fileModelJSON, 'r') as f:
        model_json = json.load(f)
        model = model_from_json(model_json)

    model.load_weights(fileWeights)
    return model

# CNN

In [10]:
model1 = Sequential()
model1.add(Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg))
model1.add(Conv1D(kernel_size=2, filters=256, padding='same', activation='elu'))
model1.add(Conv1D(kernel_size=3, filters=256, padding='same', activation='elu'))
model1.add(Conv1D(kernel_size=4, filters=256, padding='same', activation='elu'))
model1.add(Conv1D(kernel_size=5, filters=256, padding='same', activation='elu'))
model1.add(Dense(1024))
model1.add(ELU())
model1.add(BatchNormalization())
model1.add(Dense(1024))
model1.add(ELU())
model1.add(BatchNormalization())
model1.add(Dense(1, activation='sigmoid'))

model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
y_train = np.asarray(target_train).astype('float32').reshape((-1,1))
y_test = np.asarray(target_test).astype('float32').reshape((-1, 1))
model1.fit(x_train, y_train, epochs=epochs_num, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdddf351b90>

In [11]:
loss, accuracy = model1.evaluate(x_test, y_test, verbose=0)
print("Final cross validation accuracy =", accuracy)

Final cross validation accuracy = 0.6055307388305664


# Convolutional LSTM

In [12]:
model2 = Sequential()
model2.add(Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg))
model2.add(Conv1D(kernel_size=5, filters=256, padding='same', activation='elu'))
model2.add(MaxPooling1D(pool_size=4))
model2.add(Dropout(0.5))
model2.add(LSTM(lstm_output_size))
model2.add(Dropout(0.5))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.fit(x_train, target_train, epochs=epochs_num, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdddf06d1d0>

In [13]:
loss, accuracy = model2.evaluate(x_test, y_test, verbose=0)
print("Final cross validation accuracy =", accuracy)

Final cross validation accuracy = 0.9819999933242798


# Simple LSTM

In [14]:
model3 = Sequential()
model3.add(Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg))
model3.add(LSTM(lstm_output_size))
model3.add(Dense(1, activation='sigmoid'))

print(model3.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 75, 32)            3232      
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
Total params: 11,585
Trainable params: 11,585
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
x_valid, y_valid = x_train[:batch_size], target_train[:batch_size]
x_train2, y_train2 = x_train[batch_size:], target_train[batch_size:]
model3.fit(x_train2, y_train2, validation_data=(x_valid, y_valid), batch_size=batch_size, epochs=epochs_num)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fddda8ca8d0>

In [16]:
loss, accuracy = model3.evaluate(x_test, target_test, verbose=0)
print("Final cross validation accuracy =", accuracy)

Final cross validation accuracy = 0.885200023651123
