# **Classification of DNS over HTTPS (DoH) Traffic**

In [66]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import torch

from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score, average_precision_score, precision_recall_curve

## Load all dataframes

In [2]:
# Directory that holds the three CSV files; edit this one constant to point
# the notebook at a different copy of the data.
DATA_DIR = '/content/drive/MyDrive/DCN'

path1 = f'{DATA_DIR}/l1-nondoh.csv'     # loaded into df1 (shown as "Non-DoH")
path2 = f'{DATA_DIR}/l2-benign.csv'     # loaded into df2 (shown as "Benign DoH")
path3 = f'{DATA_DIR}/l2-malicious.csv'  # loaded into df3 (shown as "Malicious DoH")

In [3]:
# Read the three CSV files in order: path1 -> df1, path2 -> df2, path3 -> df3.
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (path1, path2, path3))

In [4]:
# Show the first two rows of df1 under a heading.
print('Non-DoH')
df1.head(2)

Non-DoH


Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,72.21.91.42,192.168.20.191,443,51041,2020-01-14 15:49:01,4.54404,11256331,2477164.0,159324,35062.191354,427496.394703,653.832084,1106.811615,1514.0,1514,-1.868316,-0.622772,0.590735,1.72011,1.31153,2.257566,2.246046,0.136072,0.026351,1.617572,0.580949,1.107852e-07,0.000333,2.7e-05,1.6e-05,1.4e-05,0.096659,0.038228,12.454823,NonDoH
1,192.168.20.191,195.201.169.48,51043,443,2020-01-14 15:49:03,8.171041,930,113.8166,11936,1460.768585,434289.817175,659.00669,677.157895,328.0,1514,1.589474,-1.269854,0.973195,7.408303,2.72182,1.559659,0.212176,0.212108,1.4852,0.495092,1.745138,3.224052,1.795565,0.886579,0.103698,0.0009,1.308024,0.493259,2.025273,NonDoH


In [5]:
# Show the first two rows of df2 under a heading.
print('Benign DoH')
df2.head(2)

Benign DoH


Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,Benign
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,Benign


In [6]:
# Show the first two rows of df3 under a heading.
print('Malicious DoH')
df3.head(2)

Malicious DoH


Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.205,8.8.4.4,41018,443,2020-03-25 04:40:42,120.731389,46446,384.705257,85465,707.893786,5668.351021,75.288452,117.04614,107.0,68,0.400306,0.651443,0.643237,1290.726629,35.926684,57.744948,57.464928,47.056291,0.023383,0.297513,0.622162,7e-05,0.008369,0.01188,0.015244,0.015221,-1.205904,-0.39922,0.704513,Malicious
1,8.8.4.4,192.168.20.205,443,41018,2020-03-25 04:42:42,120.052751,470548,3919.510349,227661,1896.341384,8083.727768,89.909553,128.347243,107.0,68,0.712291,0.671199,0.700518,687.865454,26.227189,79.114598,79.944482,98.139201,-0.094926,-0.725377,0.331509,0.023756,0.15413,0.037553,4.9e-05,3.9e-05,0.729988,0.243394,4.104293,Malicious


### Non-DoH dataframe

In [7]:
# Preview the object-dtype (string) columns of df1.
object_cols = df1.select_dtypes('object')
print(object_cols.head())

print('---------------------------------')

def unique_vals(column):
    """Return the number of distinct values in df1[column] (missing values count as one)."""
    return df1[column].nunique(dropna=False)

# One distinct-value count per object column, in column order.
unique_elems = [unique_vals(name) for name in object_cols.columns]

for name, count in zip(object_cols.columns, unique_elems):
    print(name, '-', count)


         SourceIP   DestinationIP            TimeStamp   Label
0     72.21.91.42  192.168.20.191  2020-01-14 15:49:01  NonDoH
1  192.168.20.191  195.201.169.48  2020-01-14 15:49:03  NonDoH
2  192.168.20.191    96.17.115.57  2020-01-14 15:49:03  NonDoH
3  192.168.20.191   96.114.14.140  2020-01-14 15:49:03  NonDoH
4  192.168.20.191   23.78.199.198  2020-01-14 15:49:03  NonDoH
---------------------------------
SourceIP - 6755
DestinationIP - 33718
TimeStamp - 132811
Label - 1


In [8]:
# NOTE: this cell consumes the raw SourceIP / DestinationIP / TimeStamp columns
# and drops them at the end, so df1 must be reloaded before it can be re-run.

# Split every dotted address into four octet columns. The vectorised
# `.str.split` replaces a per-row Python loop; `.values` makes the assignment
# purely positional, so index labels play no part.
src_octets = ['SourceIP' + str(i + 1) for i in range(4)]
dst_octets = ['DestinationIP' + str(i + 1) for i in range(4)]
df1[src_octets] = df1['SourceIP'].str.split('.', expand=True).values
df1[dst_octets] = df1['DestinationIP'].str.split('.', expand=True).values

# Parse the timestamp once, then expose each calendar/clock field as a column
# (Year, Month, Day, Hour, Minute, Second - in that order).
df1['TimeStamp'] = pd.to_datetime(df1['TimeStamp'], format = '%Y-%m-%d %H:%M:%S')
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df1[part.capitalize()] = getattr(df1['TimeStamp'].dt, part)

# Every row of this frame gets label 0.
df1['Label'] = 0

# Drop the raw columns that were expanded above and cast everything that is
# left (including the octet strings) to float64 in a single call.
df1 = df1.drop(columns = ['SourceIP', 'DestinationIP', 'TimeStamp']).astype('float64')

df1.head(3)

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label,SourceIP1,SourceIP2,SourceIP3,SourceIP4,DestinationIP1,DestinationIP2,DestinationIP3,DestinationIP4,Year,Month,Day,Hour,Minute,Second
0,443.0,51041.0,4.54404,11256331.0,2477164.0,159324.0,35062.191354,427496.394703,653.832084,1106.811615,1514.0,1514.0,-1.868316,-0.622772,0.590735,1.72011,1.31153,2.257566,2.246046,0.136072,0.026351,1.617572,0.580949,1.107852e-07,0.000333,2.7e-05,1.6e-05,1.4e-05,0.096659,0.038228,12.454823,0.0,72.0,21.0,91.0,42.0,192.0,168.0,20.0,191.0,2020.0,1.0,14.0,15.0,49.0,1.0
1,51043.0,443.0,8.171041,930.0,113.8166,11936.0,1460.768585,434289.817175,659.00669,677.157895,328.0,1514.0,1.589474,-1.269854,0.973195,7.408303,2.72182,1.559659,0.212176,0.212108,1.4852,0.495092,1.745138,3.224052,1.795565,0.886579,0.103698,0.0009,1.308024,0.493259,2.025273,0.0,192.0,168.0,20.0,191.0,195.0,201.0,169.0,48.0,2020.0,1.0,14.0,15.0,49.0,3.0
2,51021.0,443.0,0.033776,162.0,4796.305,138.0,4085.741355,86.4,9.29516,60.0,54.0,54.0,1.936492,0.645497,0.154919,0.000182,0.013489,0.026978,0.033684,0.0,-1.491488,1.999993,0.500002,0.0,0.0,0.033683,0.033683,0.033683,-10.0,-10.0,0.0,0.0,192.0,168.0,20.0,191.0,96.0,17.0,115.0,57.0,2020.0,1.0,14.0,15.0,49.0,3.0


### DoH dataframe

In [9]:
# Stack df2 and df3 row-wise into a single frame.
df4 = pd.concat([df2, df3])

# Preview the object-dtype (string) columns of the combined frame.
doh_object_cols = df4.select_dtypes('object')
print(doh_object_cols.head())

print('---------------------------------')

def unique_vals(column):
    """Return the number of distinct values in df4[column] (missing values count as one).

    This re-binds the name `unique_vals`; from here on it reads df4.
    """
    return df4[column].nunique(dropna=False)

# One distinct-value count per object column, in column order.
unique_elems = [unique_vals(name) for name in doh_object_cols.columns]

for name, count in zip(doh_object_cols.columns, unique_elems):
    print(name, '-', count)


          SourceIP    DestinationIP            TimeStamp   Label
0   192.168.20.191  176.103.130.131  2020-01-14 15:49:11  Benign
1   192.168.20.191  176.103.130.131  2020-01-14 15:50:52  Benign
2   192.168.20.191  176.103.130.131  2020-01-14 15:52:55  Benign
3   192.168.20.191  176.103.130.131  2020-01-14 15:54:56  Benign
4  176.103.130.131   192.168.20.191  2020-01-14 15:56:46  Benign
---------------------------------
SourceIP - 21
DestinationIP - 21
TimeStamp - 144547
Label - 2


In [10]:
# NOTE: this cell consumes the raw SourceIP / DestinationIP / TimeStamp columns
# and drops them at the end, so df4 must be rebuilt before it can be re-run.

# Split every dotted address into four octet columns. The vectorised
# `.str.split` replaces a per-row Python loop; `.values` makes the assignment
# purely positional, so index labels play no part.
src_octets = ['SourceIP' + str(i + 1) for i in range(4)]
dst_octets = ['DestinationIP' + str(i + 1) for i in range(4)]
df4[src_octets] = df4['SourceIP'].str.split('.', expand=True).values
df4[dst_octets] = df4['DestinationIP'].str.split('.', expand=True).values

# Parse the timestamp once, then expose each calendar/clock field as a column
# (Year, Month, Day, Hour, Minute, Second - in that order).
df4['TimeStamp'] = pd.to_datetime(df4['TimeStamp'], format = '%Y-%m-%d %H:%M:%S')
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df4[part.capitalize()] = getattr(df4['TimeStamp'].dt, part)

# Binary target: rows labelled 'Benign' become 1, every other label becomes 0.
df4['Label'] = (df4['Label'] == 'Benign') * 1

# Drop the raw columns that were expanded above and cast everything that is
# left (including the octet strings) to float64 in a single call.
df4 = df4.drop(columns = ['SourceIP', 'DestinationIP', 'TimeStamp']).astype('float64')

df4.head(3)

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label,SourceIP1,SourceIP2,SourceIP3,SourceIP4,DestinationIP1,DestinationIP2,DestinationIP3,DestinationIP4,Year,Month,Day,Hour,Minute,Second
0,50749.0,443.0,95.08155,62311.0,655.342703,65358.0,687.388878,7474.676771,86.456213,135.673751,102.0,54.0,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,1.0,192.0,168.0,20.0,191.0,176.0,103.0,130.0,131.0,2020.0,1.0,14.0,15.0,49.0,11.0
1,50749.0,443.0,122.309318,93828.0,767.136973,101232.0,827.672018,10458.118598,102.264943,141.245474,114.0,54.0,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,1.0,192.0,168.0,20.0,191.0,176.0,103.0,130.0,131.0,2020.0,1.0,14.0,15.0,50.0,52.0
2,50749.0,443.0,120.958413,38784.0,320.639127,38236.0,316.108645,7300.293933,85.441758,133.715278,89.0,54.0,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,1.0,192.0,168.0,20.0,191.0,176.0,103.0,130.0,131.0,2020.0,1.0,14.0,15.0,52.0,55.0


### Merging the dataframes

In [11]:
# Task 1: a copy of df4 with every Label overwritten to 1, stacked under df1.
df5 = df4.assign(Label = 1)
task1 = pd.concat([df1, df5])

# Task 2: an independent copy of df4 with its own Label values left intact.
task2 = df4.copy()

## Base Model

In [12]:
# List the column names of the merged task-1 frame.
print(task1.columns)

Index(['SourcePort', 'DestinationPort', 'Duration', 'FlowBytesSent',
       'FlowSentRate', 'FlowBytesReceived', 'FlowReceivedRate',
       'PacketLengthVariance', 'PacketLengthStandardDeviation',
       'PacketLengthMean', 'PacketLengthMedian', 'PacketLengthMode',
       'PacketLengthSkewFromMedian', 'PacketLengthSkewFromMode',
       'PacketLengthCoefficientofVariation', 'PacketTimeVariance',
       'PacketTimeStandardDeviation', 'PacketTimeMean', 'PacketTimeMedian',
       'PacketTimeMode', 'PacketTimeSkewFromMedian', 'PacketTimeSkewFromMode',
       'PacketTimeCoefficientofVariation', 'ResponseTimeTimeVariance',
       'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeMean',
       'ResponseTimeTimeMedian', 'ResponseTimeTimeMode',
       'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeSkewFromMode',
       'ResponseTimeTimeCoefficientofVariation', 'Label', 'SourceIP1',
       'SourceIP2', 'SourceIP3', 'SourceIP4', 'DestinationIP1',
       'DestinationIP2', 'DestinationIP3', 

In [13]:
# Columns of task1 that contain at least one missing value.
task1.columns[task1.isnull().sum() != 0]

Index(['ResponseTimeTimeMedian', 'ResponseTimeTimeSkewFromMedian'], dtype='object')

In [15]:
# Shuffle the row positions with a dedicated, seeded generator so that the
# train/validation split is identical on every run of the notebook.
SPLIT_SEED = 0
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
shuffled = torch.randperm(len(task1), generator = split_rng)

# Hold out one tenth of the rows, rounded up, for validation. This is the
# same count the original `-len(shuffled)//10` slicing produced.
n_valid = (len(shuffled) + 9) // 10
train_ind = shuffled[:len(shuffled) - n_valid]
valid_ind = shuffled[len(shuffled) - n_valid:]

# Features: everything except the target and the two columns that the null
# check on task1 reported as containing missing values.
NULL_COLUMNS = ['ResponseTimeTimeMedian', 'ResponseTimeTimeSkewFromMedian']
xx = torch.tensor(task1.drop(['Label'] + NULL_COLUMNS, axis = 1).values.astype(np.float32))
train_xx = xx[train_ind]
valid_xx = xx[valid_ind]

# Targets as float32 column vectors of shape (n_rows, 1).
yy = torch.tensor(task1['Label'].values.astype(np.float32))
train_yy = yy[train_ind].reshape((len(train_ind), 1))
valid_yy = yy[valid_ind].reshape((len(valid_ind), 1))


In [16]:
# Report the (rows, features) shape of the training and validation tensors.
print('Train Shape:', train_xx.shape)
print('Valid Shape:', valid_xx.shape)

Train Shape: torch.Size([1050422, 43])
Valid Shape: torch.Size([116714, 43])


In [77]:
def evaluate(predict, target, shall_i_print = True, shall_i_roc = False):
    """Score binary logits against 0/1 targets and return the per-class F1.

    Args:
        predict: tensor of raw logits. A sigmoid is applied and the result is
            rounded to obtain hard 0/1 predictions.
        target: tensor of 0/1 labels holding one value per prediction
            (flattened together with `predict` before scoring).
        shall_i_print: when true, print the full classification report.
        shall_i_roc: when true, draw a ROC curve from the sigmoid
            probabilities and `target`.

    Returns:
        (f_0, f_1): F1 score of class 0 and of class 1, rounded to two
        decimals.
    """
    # Work on detached CPU copies so autograd state and device do not matter.
    probs = torch.sigmoid(predict).detach().cpu()
    hard_pred = torch.round(probs).float().numpy().ravel()
    truth = target.detach().cpu().float().numpy().ravel()

    # Ask for the scores as numbers instead of parsing the printed table:
    # splitting the text on '.' turned an F1 of 1.00 into 0.0.
    scores = classification_report(truth, hard_pred, labels=[0, 1], output_dict=True)
    f_0 = round(float(scores['0']['f1-score']), 2)
    f_1 = round(float(scores['1']['f1-score']), 2)

    if shall_i_print:
        print(classification_report(truth, hard_pred, labels=[0, 1]))

    if shall_i_roc:
        # Use this call's own targets and probabilities; the previous code
        # referenced names that are not defined inside the function.
        fpr, tpr, _ = roc_curve(truth, probs.numpy().ravel())
        plt.figure(figsize=(7, 7))
        plt.plot(fpr, tpr)
        plt.scatter(fpr, tpr)
        plt.title("ROC Curve")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")

    return f_0, f_1

In [78]:
def accuracy(predict, target):
    """Return the fraction of correct binary predictions, rounded to 3 decimals.

    Args:
        predict: tensor of raw logits. A sigmoid is applied and the result is
            rounded to obtain hard 0/1 predictions.
        target: tensor of 0/1 labels with the same number of elements as
            `predict`; it is reshaped to `predict`'s shape before comparing.

    Returns:
        Accuracy as a Python float in [0, 1].
    """
    hard_pred = torch.round(torch.sigmoid(predict)).float()
    # Align shapes explicitly so an (N, 1) prediction compared with an (N,)
    # target cannot silently broadcast into an N x N comparison.
    target = target.float().reshape(hard_pred.shape)

    # Tensor-level reduction; the builtin sum() walked the tensor in Python.
    n_correct = (hard_pred == target).sum().item()

    return float(np.round(n_correct / hard_pred.numel(), 3))

In [79]:
class BaseModel_LR(torch.nn.Module):
    """Baseline model made of a single linear layer with one output unit.

    The forward pass returns the raw linear output; no activation is applied
    inside the module.
    """

    def __init__(self, num_feat):
        """Create the layer that maps `num_feat` input features to one value."""
        super(BaseModel_LR, self).__init__()
        self.linear = torch.nn.Linear(in_features = num_feat, out_features = 1)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        return self.linear(x)

In [81]:
num_feat = xx.shape[1]
BaseModel = BaseModel_LR(num_feat)

criterion = torch.nn.BCEWithLogitsLoss()
# 1e-3 is the same value that was previously spelled 10e-4.
optimizer = torch.optim.Adam(BaseModel.parameters(), lr = 1e-3)

NUM_EPOCHS = 3001   # full-batch gradient steps
LOG_EVERY = 300     # report metrics every this many epochs

# NOTE(review): the recorded first-epoch loss is in the thousands, which
# suggests the raw features are on very different scales - consider
# standardising them before training.
for epoch in range(NUM_EPOCHS):

    # One full-batch optimisation step on the training set.
    BaseModel.train()
    optimizer.zero_grad()

    train_pred = BaseModel(train_xx)
    train_loss = criterion(train_pred, train_yy)

    train_loss.backward()
    optimizer.step()

    # Validation is only needed when it is reported (and once at the very end
    # so valid_pred / valid_loss reflect the final weights). Running it under
    # no_grad avoids building an autograd graph for the validation set.
    if epoch % LOG_EVERY == 0 or epoch == NUM_EPOCHS - 1:
        BaseModel.eval()
        with torch.no_grad():
            valid_pred = BaseModel(valid_xx)
            valid_loss = criterion(valid_pred, valid_yy)

        print('Epoch:', epoch, 'Training Loss:', np.round(train_loss.item(), 3), 'Training F1 scores:', evaluate(train_pred, train_yy, False),
              'Validation Loss:', np.round(valid_loss.item(), 3), 'Validation F1 scores:', evaluate(valid_pred, valid_yy, False))


Epoch: 0 Training Loss: 8259.397 Training F1 scores: (0.64, 0.23) Validation Loss: 7878.286 Validation F1 scores: (0.65, 0.22)
-------------------------------------------------
Epoch: 300 Training Loss: 42.741 Training F1 scores: (0.9, 0.68) Validation Loss: 34.467 Validation F1 scores: (0.9, 0.68)
-------------------------------------------------
Epoch: 600 Training Loss: 4.625 Training F1 scores: (0.97, 0.92) Validation Loss: 3.504 Validation F1 scores: (0.98, 0.92)
-------------------------------------------------
Epoch: 900 Training Loss: 1.524 Training F1 scores: (0.98, 0.93) Validation Loss: 1.319 Validation F1 scores: (0.98, 0.93)
-------------------------------------------------
Epoch: 1200 Training Loss: 1.327 Training F1 scores: (0.98, 0.94) Validation Loss: 1.115 Validation F1 scores: (0.98, 0.94)
-------------------------------------------------
Epoch: 1500 Training Loss: 1.672 Training F1 scores: (0.98, 0.94) Validation Loss: 1.983 Validation F1 scores: (0.98, 0.93)
------