In [1]:
# Importing the Required Libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [1]:
# Training split: read the CSV and discard the index column that was saved with it
df_train = pd.read_csv(
    "../input/dataset-of-malicious-and-benign-webpages/Webpages_Classification_train_data.csv/Webpages_Classification_train_data.csv"
).drop(columns = "Unnamed: 0")

# Lookup table holding country names and their ISO alpha-3 codes
count = pd.read_csv('../input/iso-alpha-3/tableconvert_csv_pkcsig.csv')

# Test split: read and cleaned the same way as the training split
df_test = pd.read_csv(
    "../input/dataset-of-malicious-and-benign-webpages/Webpages_Classification_test_data.csv/Webpages_Classification_test_data.csv"
).drop(columns = "Unnamed: 0")

In [1]:
# Preview the first rows of the training split
df_train.head()

In [1]:
# Preview the first rows of the test split
df_test.head()

In [1]:
# Preview the ISO alpha-3 lookup table
count.head()

In [1]:
# Report the (rows, columns) shape of each split
print ("The shape of the train dataset : ", df_train.shape)
print ("The shape of the test dataset : ", df_test.shape)

In [1]:
# Adding a feature which is the ISO_ALPHA_3 code of the countries
# Lookup: country name -> ISO alpha-3 code, built from the `count` table
countries = dict(zip(count['Country'], count['Alpha-3 code']))

# Assign the mapped column back instead of calling replace(inplace=True) on a
# column selection: the in-place form mutates a temporary object, which pandas
# warns about and which has no effect when copy-on-write is enabled.
# Names missing from the lookup are left unchanged by replace().
df_train['iso_3'] = df_train['geo_loc'].replace(countries)
df_train.head()

In [1]:
# Relabel the protocol flag before the Exploratory Data Analysis so plots read HTTPS / HTTP
# The result is assigned back to the column; replace(inplace=True) on a column
# selection is a chained in-place update that pandas does not guarantee to apply.
df_train['https'] = df_train['https'].replace({'yes' : 'HTTPS', 'no' : 'HTTP'})
df_train.head()

In [1]:
# Class with some preprocessing functions to create some features
class preproc:
    """Namespace of stateless helpers that derive features from raw columns.

    Both helpers are static methods, so they can be called on the class
    (`preproc.count_special(text)`) or on an instance.
    """

    @staticmethod
    def count_special(string):
        """Count the special characters in `string`.

        A character is counted when it is not lowercase, not uppercase, not a
        digit and not a plain space.

        Parameters
        ----------
        string : str
            Text to scan.

        Returns
        -------
        int
            Number of special characters found (0 for an empty string).
        """
        return sum(
            1
            for char in string
            if not (char.islower() or char.isupper() or char.isdigit()) and char != ' '
        )

    @staticmethod
    def network_type(ip):
        """Return the network part and class letter of a dotted IPv4 address.

        Parameters
        ----------
        ip : str
            Address written as dot-separated integers, e.g. "172.16.0.1".

        Returns
        -------
        tuple[str, str]
            (network part, class letter). A first octet of 0-127 gives class
            "A" with one octet kept, 128-191 gives class "B" with two octets
            kept, and anything else is reported as class "C" with three octets.

        Raises
        ------
        ValueError
            If any dot-separated piece is not an integer.
        """
        ip_str = ip.split(".")
        octets = [int(x) for x in ip_str]
        first = octets[0]

        if 0 <= first <= 127:
            return (ip_str[0], "A")
        if 128 <= first <= 191:
            return (".".join(ip_str[0:2]), "B")
        # Everything above 191 falls through to class C
        return (".".join(ip_str[0:3]), "C")

#### For more information on network classes, see the Oracle documentation on [Network Classes](https://docs.oracle.com/cd/E19504-01/802-5753/6i9g71m2o/index.html#planning3-fig-11)

In [1]:
# Derive the network part and the class letter from every IP address
network_info = df_train['ip_add'].apply(preproc.network_type)
df_train['net_part'], df_train['net_type'] = zip(*network_info)

# Count the special characters found in each page's content
df_train['special_char'] = df_train['content'].apply(preproc.count_special)
df_train.head()

In [1]:
# Number of characters in each page's content
df_train['content_len'] = df_train['content'].apply(len)
df_train.head()

In [1]:
# Human-readable class names for the plots.
# Assigned back to the column: replace(inplace=True) on `df_train.label` is a
# chained in-place update that pandas does not guarantee to apply.
df_train['label'] = df_train['label'].replace({'bad' : 'Malicious', 'good' : 'Benign'})
df_train.head()

## Exploratory Data Analysis

### Distribution of Webpage types

In [1]:
# Setting the parameters
plt.rcParams['figure.figsize'] = [18, 8]
sns.set(style = 'white', font_scale = 1.3)
fig, ax = plt.subplots(1, 2)

# Class counts, most frequent first; shared by both panels so that bar order,
# pie order and colours all refer to the same class
types = df_train['label'].value_counts()
labels = list(types.index)
aggregate = list(types.values)

# Bar graph. The column is passed by name through `x`; passing the Series
# positionally is deprecated in seaborn and rejected by newer releases.
bar = sns.countplot(x = 'label', data = df_train, order = labels, ax = ax[0], palette = ['coral', 'mediumorchid'])
bar.set(xlabel = 'Webpage Type', ylabel = 'Count')
bar.set_title("Distribution of Malicious and Benign Webpage", bbox={'facecolor':'0.8', 'pad':5})

# Plotting the Piechart to see the percentage distribution of the Webpages
plt.rcParams.update({'font.size': 16})
explode = (0, 0.1)  # pull the second (less frequent) slice out slightly
ax[1].pie(aggregate, labels = labels, autopct='%1.2f%%', shadow=True, explode = explode, colors = ['coral', 'mediumorchid'])
# plt.title / plt.legend act on the current axes, which is the pie panel here
plt.title("Pie Chart for Malicious and Benign Webpage", bbox={'facecolor':'0.8', 'pad':5})
plt.legend(labels, loc = 'best')
plt.tight_layout()
plt.show()

We can see that the dataset is highly imbalanced, so choosing the evaluation metrics is an important step. Accuracy alone would be misleading: a model that predicts every webpage as Benign would still score a very high accuracy. We will therefore focus on the confusion matrix, F1 score, precision and recall.

### Choropleth Map showing the Malicious Webpages

In [1]:
# ISO alpha-3 codes are used as the location keys of the choropleth plots
df = df_train.loc[df_train.label == 'Malicious', :]

# Number of malicious webpages per country code.
# The counts column is named 'iso_3' explicitly: pandas >= 2.0 would otherwise
# call it 'count', while older versions name it after the source column.
val = df.iso_3.value_counts().rename('iso_3').to_frame()
val

In [1]:
# Choropleth Map of the number of malicious webpages per country
fig = go.Figure(data = go.Choropleth(
    locations = val.index,
    # First (only) column of `val` holds the counts; positional access works
    # whatever name the installed pandas version gives that column
    z = val.iloc[:, 0],
#     text = val.index,
    colorscale = 'reds',
    autocolorscale = False,
    reversescale = False,
    marker_line_color = 'darkgray',
    marker_line_width = 0.5,
    colorbar_title = 'Number of Webpages',
))

fig.update_layout(
    title_text = 'Malicious Webpages Around the World',
    geo = dict(
        showframe = False,
        showcoastlines = False,
#         projection_type = 'equirectangular'
    ),
    # Data-source credit placed below the map, in paper coordinates
    annotations = [dict(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Source: <a href="https://data.mendeley.com/datasets/gdx3pkwp47/2">\
            Dataset [Mendeley Data] </a>',
        showarrow = False
    )]
)

fig.show()

### Choropleth Map showing the Benign Webpages

In [1]:
# Rows of the benign class only
df = df_train.loc[df_train.label == 'Benign', :]

# Number of benign webpages per country code.
# The counts column is named 'iso_3' explicitly: pandas >= 2.0 would otherwise
# call it 'count', while older versions name it after the source column.
val = df.iso_3.value_counts().rename('iso_3').to_frame()
val

In [1]:
# Choropleth Map of the number of benign webpages per country
fig = go.Figure(data = go.Choropleth(
    locations = val.index,
    # First (only) column of `val` holds the counts; positional access works
    # whatever name the installed pandas version gives that column
    z = val.iloc[:, 0],
#     text = val.index,
    colorscale = 'blues',
    autocolorscale = False,
    reversescale = False,
    marker_line_color = 'darkgray',
    marker_line_width = 0.5,
    colorbar_title = 'Number of Webpages',
))

fig.update_layout(
    title_text = 'Benign Webpages Around the World',
    geo = dict(
        showframe = False,
        showcoastlines = False,
#         projection_type = 'equirectangular'
    ),
    # Data-source credit placed below the map, in paper coordinates
    annotations = [dict(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Source: <a href="https://data.mendeley.com/datasets/gdx3pkwp47/2">\
            Dataset [Mendeley Data] </a>',
        showarrow = False
    )]
)

fig.show()

### URL Length (url_len) Distribution

In [1]:
# One frame per class, so each can be plotted on its own
df_mal = df_train[df_train.label == 'Malicious']
df_ben = df_train[df_train.label == 'Benign']

In [1]:
plt.rcParams['figure.figsize'] = [15, 7]

fig, ax = plt.subplots(2, 2)

# Top row: count histograms per class.
# histplot replaces the deprecated distplot(hist=True, kde=False) call.
mal = sns.histplot(x = df_mal['url_len'], color = 'r', kde = False, ax = ax[0,0])
mal.set(title = 'Malicious Webpage URL length distribution')
ben = sns.histplot(x = df_ben['url_len'], color = 'b', kde = False, ax = ax[0,1])
ben.set(title = 'Benign Webpage URL length distribution')

# Bottom row: kernel density estimates of the same columns
sns.kdeplot(x = df_mal['url_len'], color = 'r', fill = True, ax = ax[1,0])
sns.kdeplot(x = df_ben['url_len'], color = 'b', fill = True, ax = ax[1,1])

plt.tight_layout()
plt.show()

### Bar graph showing the most used TLD in URLs over Malicious and Benign Webpages

In [1]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Ten most frequent TLDs per class, kept as Series (index = TLD, values = count).
# Reading the counts through `.values` avoids relying on the column name that
# pd.DataFrame(value_counts()) produces, which differs between pandas versions.
mal_tld = df_mal.tld.value_counts().head(10)
ben_tld = df_ben.tld.value_counts().head(10)

mal = go.Bar(y = mal_tld.values, x = mal_tld.index.tolist(), text = mal_tld.values, marker_color = 'indianred')
ben = go.Bar(y = ben_tld.values, x = ben_tld.index.tolist(), text = ben_tld.values, marker_color = 'lightsalmon')

# Two stacked bar charts: malicious on top, benign below
fig = make_subplots(
    rows = 2, cols = 1, subplot_titles = ("Most occuring Top Level Domain in Malicious Webpages", "Most occuring Top Level Domain in Benign Webpages"),
    specs = [[{'type' : 'bar'}], [{'type' : 'bar'}]]
)

fig.append_trace(mal, row = 1, col = 1)
fig.append_trace(ben, row = 2, col = 1)
# Show each bar's count above it, abbreviated to two significant digits
fig.update_traces(texttemplate = '%{text:.2s}', textposition = 'outside')
fig.update_layout(uniformtext_minsize = 10, uniformtext_mode = 'hide', width = 1200, height = 900)

fig.show()

### Content Length Distribution over Malicious and Benign Webpages

In [1]:
plt.rcParams['figure.figsize'] = [20, 10]
sns.set(style = 'whitegrid')

fig, ax = plt.subplots(1, 2)
# Columns are referenced by name together with `data`; passing the Series
# positionally next to `data=` and the deprecated `vertical` flag are dropped.
kde = sns.kdeplot(x = 'content_len', data = df_train, hue = 'label', fill = True, palette = ['seagreen', 'darkred'], ax = ax[0])
kde.set(title = 'Kernel Distribution Estimation of Content Length', xlabel = 'Content Length')

vio = sns.violinplot(x = 'label', y = 'content_len', data = df_train, palette = ['seagreen', 'darkred'], ax = ax[1])
vio.set(title = 'Violin Plot of Content Length', ylabel = 'Content Length', xlabel = 'Webpage Type')

# Mean content length per class, printed next to the figure
print ("Mean Content Length of Malicious Webpage : ", df_train.loc[df_train.label == 'Malicious', 'content_len'].mean())
print ("Mean Content Length of Benign Webpage    : ", df_train.loc[df_train.label == 'Benign', 'content_len'].mean())
print ()

Interesting: malicious webpages tend to have a larger content length than benign ones.

### js_len and js_obf_len Distribution over Malicious and Benign Webpages

In [1]:
# js_obf_len
fig, ax = plt.subplots(2, 2)

# Top row: count histograms per class (histplot replaces the deprecated distplot)
mal = sns.histplot(x = df_mal.js_obf_len, kde = False, color = 'r', ax = ax[0,0])
mal.set(title = 'Obf JS Length Distribution for Malicious Webpages')
ben = sns.histplot(x = df_ben.js_obf_len, kde = False, color = 'b', ax = ax[0,1])
ben.set(title = 'Obf JS Length Distribution for Benign Webpages')

# Bottom row: density estimates. Only the per-class vector is passed; combining
# it with data=df_train mixed two different frames and is rejected by seaborn
# versions whose first positional parameter is `data`.
sns.kdeplot(x = df_mal.js_obf_len, fill = True, color = 'r', ax = ax[1,0])
sns.kdeplot(x = df_ben.js_obf_len, fill = True, color = 'b', ax = ax[1,1]);

Here, it can be seen that malicious webpages contain more obfuscated JavaScript code than benign ones.

In [1]:
# js_len
fig, ax = plt.subplots(2, 2)

# Top row: count histograms per class (histplot replaces the deprecated distplot)
mal = sns.histplot(x = df_mal.js_len, kde = False, color = 'r', ax = ax[0,0])
mal.set(title = 'JS Length Distribution for Malicious Webpages')
ben = sns.histplot(x = df_ben.js_len, kde = False, color = 'b', ax = ax[0,1])
ben.set(title = 'JS Length Distribution for Benign Webpages')

# Bottom row: density estimates. Only the per-class vector is passed; combining
# it with data=df_train mixed two different frames and is rejected by seaborn
# versions whose first positional parameter is `data`.
sns.kdeplot(x = df_mal.js_len, fill = True, color = 'r', ax = ax[1,0])
sns.kdeplot(x = df_ben.js_len, fill = True, color = 'b', ax = ax[1,1]);

In [1]:
# Violin plots of both JavaScript length features, split by webpage type
plt.rcParams['figure.figsize'] = [18, 8]
fig, ax = plt.subplots(1, 2)

# Left panel: obfuscated JS length
PAG = sns.violinplot(x = 'label', y = 'js_obf_len', data = df_train, ax = ax[0])
PAG.set(title = 'Violin Plot for Obf JS length', xlabel = 'Webpage Type')

# Right panel: total JS length
PAG_ = sns.violinplot(x = 'label', y = 'js_len', data = df_train, ax = ax[1])
PAG_.set(title = 'Violin Plot for JS length', xlabel = 'Webpage Type');

### HTTPS and HTTP protocols distribution

In [1]:
plt.rcParams['figure.figsize'] = [16, 8]
sns.set(style = 'whitegrid', font_scale = 1.2)

fig, ax = plt.subplots(1, 2)
# Settings shared by both panels so the bars are directly comparable
protocol_plot = dict(x = 'https', order = ['HTTPS', 'HTTP'], palette = 'Set2')

bar_1 = sns.countplot(data = df_mal, ax = ax[0], **protocol_plot)
bar_1.set(title = 'Bargraph for HTTPS Vs HTTP for Malicious Webpages', xlabel = 'Protocol')

bar_2 = sns.countplot(data = df_ben, ax = ax[1], **protocol_plot)
bar_2.set(title = 'Bargraph for HTTPS Vs HTTP for Benign Webpages', xlabel = 'Protocol');

Benign webpages are more inclined to use HTTPS, which is more secure than HTTP, whereas malicious webpages use HTTP more often.

### Average Content length of the webpages around the world

In [1]:
# Mean content length per country code, as a one-column frame indexed by iso_3
con = df_train.groupby('iso_3')['content_len'].mean().to_frame()
con.head()

In [1]:
# Choropleth Map: countries are located by the index of `con` and coloured by
# the 'content_len' column, using the Viridis sequential colour scale
fig = px.choropleth(con, locations = con.index.tolist(),
                    color = "content_len",
                    color_continuous_scale = px.colors.sequential.Viridis)
fig.show()

### Network types over Malicious and Benign Webpages

In [1]:
plt.rcParams['figure.figsize'] = [16, 8]
sns.set(style = 'whitegrid', font_scale = 1.2)

fig, ax = plt.subplots(1, 2)
# Settings shared by both panels so the bars are directly comparable
network_plot = dict(x = 'net_type', order = ['A', 'B', 'C'], palette = 'YlOrBr')

bar_1 = sns.countplot(data = df_mal, ax = ax[0], **network_plot)
bar_1.set(title = 'Network types in Malicious Webpages', xlabel = 'Network Type')

bar_2 = sns.countplot(data = df_ben, ax = ax[1], **network_plot)
bar_2.set(title = 'Network types in Benign Webpages', xlabel = 'Network Type');

### Heatmap of Pearson Correlation

In [1]:
# Object-typed columns need a numeric encoding before modelling; the columns
# listed in `not_encoded` are left out of that encoding step
a = df_train.select_dtypes('object').columns.tolist()
not_encoded = ('content', 'url', 'ip_add', 'label', 'net_part', 'iso_3')
ls = list(filter(lambda column: column not in not_encoded, a))
ls

In [1]:
# le_dict keeps the fitted LabelEncoder of every feature so that the same
# encoder instance can be reused for the test dataset
le_dict = {}

for feature in ls:
    le = LabelEncoder()
    le_dict[feature] = le
    df_train[feature] = le.fit_transform(df_train[feature])

# Binary target: 1 = Malicious, 0 = Benign.
# Assigned back to the column (replace(inplace=True) on `df_train.label` is a
# chained in-place update) and cast explicitly so the column is integer-typed
# regardless of how the installed pandas infers the dtype after replace().
df_train['label'] = df_train['label'].replace({'Malicious' : 1, 'Benign' : 0}).astype(int)
df_train.head()

In [1]:
# The Final Features which are going to be used for training.
# .copy() makes the selection an independent frame, so later column assignments
# modify it directly instead of raising SettingWithCopyWarning on a slice.
df_train = df_train[['url_len', 'geo_loc', 'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'label', 'net_type', 'special_char', 'content_len']].copy()
df_train.head()

In [1]:
# Pearson Correlation Heatmap
# Assignment (=), not comparison (==): the comparison only evaluated to a
# boolean and left the figure size unchanged.
plt.rcParams['figure.figsize'] = [18, 16]
sns.set(font_scale = 1)

sns.heatmap(df_train.corr(method = 'pearson'), annot = True, cmap = "YlGnBu");

## Preprocessing

In [1]:
# Standardise 'content_len' and 'special_char' in the training data and keep
# each fitted scaler in ss_dict so it can be reused on the test data
ss_dict = {}

for feature in ['content_len', 'special_char']:
    column = df_train[feature].values.reshape(-1, 1)  # scaler expects a 2-D array
    scaler = StandardScaler().fit(column)
    ss_dict[feature] = scaler
    df_train[feature] = scaler.transform(column).ravel()

df_train.head()

In [1]:
# Preprocessing the test dataset.
# Both columns are assigned back explicitly: replace(inplace=True) on an
# attribute-selected column is a chained in-place update that pandas does not
# guarantee to apply.

# Replacing yes to HTTPS and no to HTTP
df_test['https'] = df_test['https'].replace({'yes' : 'HTTPS', 'no' : 'HTTP'})

# Replacing the label
df_test['label'] = df_test['label'].replace({'bad' : 'Malicious', 'good' : 'Benign'})

In [1]:
# Derive the network part and the class letter from every IP address
network_info = df_test['ip_add'].apply(preproc.network_type)
df_test['net_part'], df_test['net_type'] = zip(*network_info)

# Count the special characters found in each page's content
df_test['special_char'] = df_test['content'].apply(preproc.count_special)

In [1]:
# Number of characters in each page's content
df_test['content_len'] = df_test['content'].apply(len)

In [1]:
# Using the same label encoders for the features as used in the training dataset.
# The encoders are applied, never refitted: refitting on the test split would
# assign codes from the test categories alone, so the same category could get a
# different integer than it had during training.
for feature in ls:
    if df_test[feature].dtype != object:
        # Column is already numeric (cell re-run); leave it untouched
        continue

    le = le_dict[feature]
    # category -> integer code, exactly as learned on the training split
    known = {category: code for code, category in enumerate(le.classes_)}
    encoded = df_test[feature].map(known)

    # Categories never seen in training have no code; flag them and use -1
    n_unseen = int(encoded.isna().sum())
    if n_unseen:
        print (f"{feature} : {n_unseen} test rows hold categories unseen in training, encoded as -1")
    df_test[feature] = encoded.fillna(-1).astype(int)

# Binary target: 1 = Malicious, 0 = Benign (assigned back, cast to integer)
df_test['label'] = df_test['label'].replace({'Malicious' : 1, 'Benign' : 0}).astype(int)
df_test.head()

In [1]:
# Standardise 'content_len' and 'special_char' in the testing data with the
# scalers stored in ss_dict (transform only, no refit)
for feature in ['content_len', 'special_char']:
    scaled = ss_dict[feature].transform(df_test[feature].values.reshape(-1, 1))
    df_test[feature] = scaled.ravel()

df_test.head()

In [1]:
# Keep the same columns, in the same order, as the training frame
final_columns = ['url_len', 'geo_loc', 'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'label', 'net_type', 'special_char', 'content_len']
df_test = df_test[final_columns]
df_test.head()

## DNN Model using Pytorch

In [1]:
# Configuration Class
class config:
    """Hyper-parameters and runtime settings read by the training cells."""

    # Samples per training batch
    BATCH_SIZE = 128
    # First CUDA device when one is available, CPU otherwise
    DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    # Step size handed to the optimizer
    LEARNING_RATE = 2e-5
    # Number of passes over the training loader
    EPOCHS = 20

In [1]:
# Making the custom dataset for pytorch
class MaliciousBenignData(Dataset):
    """Dataset serving (features, label) pairs from a preprocessed DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'label' column; every other column is used as a feature.
    """

    def __init__(self, df):
        self.df = df
        # Feature matrix: all columns except the target
        self.input = self.df.drop(columns = ['label']).values
        self.target = self.df.label
        # Plain array of the targets so samples are fetched by position.
        # Indexing the Series with [idx] is a label lookup and fails or returns
        # the wrong row when the frame does not have a default 0..n-1 index.
        self._targets = self.target.to_numpy()

    def __len__(self):
        """Return the number of rows in the frame."""
        return (len(self.df))

    def __getitem__(self, idx):
        """Return the (features, label) tensors of the row at position `idx`."""
        return (torch.tensor(self.input[idx]), torch.tensor(self._targets[idx]))

In [1]:
# Creating the dataloader for pytorch
def create_dataloader(df, batch_size, shuffle = False):
    """Wrap a preprocessed frame in a MaliciousBenignData dataset and a DataLoader.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed to MaliciousBenignData.
    batch_size : int
        Number of samples per batch.
    shuffle : bool, default False
        Whether the loader reshuffles the samples at every epoch.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader over the dataset, loading in the main process (num_workers = 0).
    """
    cls = MaliciousBenignData(df)
    return DataLoader(
        cls,
        batch_size = batch_size,
        shuffle = shuffle,
        num_workers = 0
    )

# Training batches are reshuffled every epoch so they do not follow the row
# order of the CSV file
df_train_loader = create_dataloader(df_train, batch_size = config.BATCH_SIZE, shuffle = True)
df_test_loader = create_dataloader(df_test, batch_size = 1) # Here for testing using the batch size as 1

In [1]:
# DataLoader components: display the attributes stored on the training loader
# (inspection only; nothing downstream reads this output)
df_train_loader.__dict__

In [1]:
# Making the DNN model
class dnn(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Layer sizes: in_features -> 64 -> 128 -> 128 -> 1. The first two hidden
    layers are each followed by ReLU, batch normalisation and dropout; the
    third by ReLU only. No sigmoid is applied to the output.

    Parameters
    ----------
    in_features : int, default 10
        Number of input features per sample.
    """

    def __init__(self, in_features = 10):
        super().__init__()

        self.fc1 = nn.Linear(in_features, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 128)
        self.out = nn.Linear(128, 1)

        self.dropout1 = nn.Dropout(p = 0.2)
        self.dropout2 = nn.Dropout(p = 0.3)
        self.batchn1 = nn.BatchNorm1d(num_features = 64)
        self.batchn2 = nn.BatchNorm1d(num_features = 128)

    def forward(self, inputs):
        """Return logits of shape (batch, 1) for `inputs` of shape (batch, in_features)."""
        # Block 1: linear -> ReLU -> batch norm -> dropout
        t = self.dropout1(self.batchn1(F.relu(self.fc1(inputs))))
        # Block 2: same pattern with the wider layer
        t = self.dropout2(self.batchn2(F.relu(self.fc2(t))))
        # Block 3: linear -> ReLU, then the single-logit output layer
        t = F.relu(self.fc3(t))
        return self.out(t)

In [1]:
# Build the network and move it to config.DEVICE (GPU when available, else CPU)
model = dnn().to(config.DEVICE)
print (model)

In [1]:
# Criterian and the Optimizer for the model
# BCEWithLogitsLoss is applied to the raw logits returned by the model
criterian = nn.BCEWithLogitsLoss()
# Adam over all model parameters, with the learning rate taken from config
optimizer = optim.Adam(model.parameters(), lr = config.LEARNING_RATE)

In [1]:
# Simple Binary Accuracy Function
def binary_acc(predictions, y_test):
    """Return the batch accuracy in percent, rounded to a whole number.

    `predictions` are raw logits; they go through a sigmoid and are rounded to
    0/1 before being compared element-wise with `y_test`. The number of matches
    is divided by the first dimension of `y_test`.
    """
    hard_labels = torch.sigmoid(predictions).round()
    n_correct = hard_labels.eq(y_test).sum().float()
    percent = (n_correct / y_test.shape[0]) * 100
    return percent.round()

In [1]:
# Training function

def train_model(model, device, data_loader, optimizer, criterian, epochs = None):
    """Train `model` in place and print the mean loss and accuracy of each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is switched to training mode.
    device : torch.device
        Device the batches are moved to before the forward pass.
    data_loader : iterable with a length
        Yields (X, y) batches; y is reshaped to a column and cast to float32.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model parameters.
    criterian : callable
        Loss taking (predictions, targets).
    epochs : int, optional
        Number of passes over `data_loader`; config.EPOCHS when omitted.

    Returns
    -------
    None
    """
    n_epochs = config.EPOCHS if epochs is None else epochs
    # Guard the per-epoch averages against an empty loader
    n_batches = max(len(data_loader), 1)

    # Putting the model in training mode
    model.train()

    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        epoch_acc = 0
        for X, y in data_loader:

            X = X.to(device)
            # Column vector of float targets on the right device. Converting
            # the existing tensor avoids the copy-construct warning raised by
            # torch.tensor(<tensor>).
            y = y.unsqueeze(1).to(device = device, dtype = torch.float32)

            # Zeroing the gradient
            optimizer.zero_grad()

            predictions = model(X.float())

            loss = criterian(predictions, y)
            acc = binary_acc(predictions, y)

            loss.backward() # Calculate Gradient
            optimizer.step() # Updating Weights

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        # Averages over batches (accuracy is the mean of the per-batch values)
        print (f"Epoch -- {epoch} | Loss : {epoch_loss/n_batches: .5f} | Accuracy : {epoch_acc/n_batches: .5f}")

In [1]:
# Evaluation Function

def eval_model(model, device, data_loader):
    """Run `model` over `data_loader` and collect true and predicted labels.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning one logit per sample; it is switched to eval mode.
    device : torch.device
        Device the feature batches are moved to.
    data_loader : iterable
        Yields (X_test, y_test) batches of any batch size.

    Returns
    -------
    tuple[list, list]
        (true labels, predicted labels), one entry per sample in loader order.
        Predictions are 0/1 integers obtained by rounding the sigmoid output.
    """
    # Putting the model in evaluation mode
    model.eval()

    y_pred = []
    y_test_al = []

    with torch.no_grad():
        for X_test, y_test in data_loader:
            X_test = X_test.to(device)

            predictions = model(X_test.float())
            pred = torch.round(torch.sigmoid(predictions))

            # Flatten every batch so all of its samples are kept; taking only
            # the first element of each batch would drop data for batch sizes
            # larger than 1.
            y_test_al.extend(y_test.reshape(-1).tolist())
            y_pred.extend(pred.reshape(-1).to(torch.int64).tolist())

    return (y_test_al, y_pred)

#### All Done, Training Time!!

In [1]:
# Training the Model on the training loader; per-epoch loss and accuracy are printed
train_model(model, config.DEVICE, df_train_loader, optimizer, criterian)

In [1]:
# Evaluating the model and getting the predictions
# eval_model returns (true labels, predicted labels) for the test loader
y_test, preds = eval_model(model, config.DEVICE, df_test_loader)
# Show the first ten predicted labels as a quick sanity check
print ('Predictions : ', preds[0:10])

In [1]:
# Classification Report computed by scikit-learn from the true and predicted test labels
cls_report = metrics.classification_report(y_test, preds)

# Overall accuracy in percent, followed by the full report
print ("")
print (f"Accuracy : {metrics.accuracy_score(y_test, preds)*100 : .3f} %") 
print ("")
print ("Classification Report : ")
print (cls_report)

In [1]:
plt.rcParams['figure.figsize'] = [10, 7]
sns.set(font_scale = 1.2)

# Confusion Matrix (rows = true labels, columns = predicted labels)
cm = metrics.confusion_matrix(y_test, preds)

# Plotting the Confusion Matrix
# fmt = 'd' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts into scientific notation.
ax = sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'YlGnBu')
ax.set(title = "Confusion Matrix", xlabel = 'Predicted Labels', ylabel = 'True Labels');

#### Saving the model!!

In [1]:
# Persist only the learned parameters (state_dict) to 'model.pth'
torch.save(model.state_dict(), 'model.pth')