COMP90073 Assignment 2 - Task 1

Name : Ribhav Shridhar

Student ID : 1037144

Preprocessing and feature engineering of the test/validation NetFlow data

In [None]:
# Importing required packages
import datetime

import h5py
import numpy as np
import pandas as pd
import seaborn as sns  # used for the correlation heatmap near the end of the notebook
from scipy.stats import mode

Setting up window width and stride

In [None]:
# Sliding-window configuration. Both values are combined with the flow-start
# time after it has been scaled by 1e-9 (presumably nanoseconds -> seconds).
window_stride = 60   # offset between the starts of two consecutive windows
window_width = 120   # length of a single window

**Reading the validation dataset.**

In [None]:
# Load the NetFlow records. With read_csv defaults the first CSV row is used as
# the header; the column labels are overwritten a few cells further down.
valid_data = pd.read_csv(filepath_or_buffer="bin_labelled_test.csv")

Reading the labels for the validation dataset

In [None]:
# Load the label array stored as .npy and wrap it in a DataFrame.
valid_label = np.load(file="valid_label.npy")
valid_label_df = pd.DataFrame(valid_label)

Giving column names from the assignment spec for better understanding and readability

In [None]:
# Readable names for the 15 NetFlow fields, one per line for easier review.
valid_data.columns = [
    'Date_Flow_Start',
    'Duration',
    'Protocol',
    'Src_IP',
    'Src_Port',
    'Direction',
    'Dst_IP',
    'Dst_Port',
    'State',
    'Source_Service',
    'Dest_Service',
    'Total_Packets',
    'BiDirection_Bytes',
    'SrcToDst_Bytes',
    'Label',
]

In [None]:
# Overwrite the 'Label' column with the labels loaded from valid_label.npy.
# The labels live in `valid_label_df` (column 0); the previously used name
# `label_df` is never defined and raised NameError on a fresh kernel.
# The assignment aligns on the index, so both frames are expected to carry the
# default 0..n-1 index.
valid_data['Label'] = valid_label_df[0]

**Preprocessing and feature engineering referenced from - https://github.com/antoinedelplace/Cyberattack-Detection**

In [None]:
def normalize_column(dt, column):
    """Standardize one or more columns of ``dt`` in place (z-score).

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame whose selected columns are overwritten with
        ``(value - mean) / std``.
    column : str or list of str
        A single column label or a list of column labels.

    Returns
    -------
    None
        ``dt`` is modified in place. The mean and standard deviation that were
        computed are printed.

    Notes
    -----
    A standard deviation of zero (constant column) or NaN (fewer than two
    rows) is replaced by 1, so such a column becomes all zeros instead of
    NaN/inf.
    """
    mean = dt[column].mean()
    std = dt[column].std()
    print(mean, std)
    if isinstance(std, pd.Series):
        # Several columns were selected: patch only the degenerate entries.
        safe_std = std.mask((std == 0) | std.isna(), 1.0)
    else:
        safe_std = 1.0 if (pd.isna(std) or std == 0) else std
    dt[column] = (dt[column] - mean) / safe_std

Generating features for set 1

In [None]:
# Convert the flow-start timestamps to floating-point numbers: the datetime is
# cast to int64 and scaled by 1e-9 (assumes nanosecond resolution, i.e. the
# result is in seconds -- TODO confirm for the pandas version in use).
valid_data['Date_Flow_Start'] = pd.to_datetime(valid_data['Date_Flow_Start']).astype(np.int64)*1e-9
datetime_start = valid_data['Date_Flow_Start'].min()

# A flow belongs to every window i with Window_lower <= i < Window_upper_excl.
elapsed = valid_data['Date_Flow_Start'] - datetime_start
# The clipped Series is assigned back explicitly. Calling
# `valid_data['Window_lower'].clip(..., inplace=True)` acts on a column
# selection and does not reliably update the DataFrame (it is a no-op under
# pandas Copy-on-Write).
valid_data['Window_lower'] = ((elapsed - window_width) / window_stride + 1).clip(lower=0)
valid_data['Window_upper_excl'] = elapsed / window_stride + 1
# Casting to int truncates the fractional part of both bounds.
valid_data = valid_data.astype({"Window_lower": int, "Window_upper_excl": int})
valid_data = valid_data.drop('Date_Flow_Start', axis=1)

# Per-flow labels, kept under their own name for the save cells below.
valid_label = valid_data['Label']

In [None]:
# Feature set 1: for every sliding window, aggregate the flows of each source IP.
#
# Changes compared with the previous version of this cell:
#   * reads `valid_data` (the old name `data` was never defined -> NameError);
#   * collects the per-window frames in a list and concatenates once, because
#     DataFrame.append was removed in pandas 2.0 and is quadratic in a loop;
#   * flattens the two-level aggregate column labels before the join, because
#     recent pandas versions refuse to join frames whose columns have a
#     different number of levels. The resulting labels ('Duration_sum',
#     'Label_<lambda>', ...) are the same ones the next cell produces.
nb_windows = valid_data['Window_upper_excl'].max()
print(nb_windows)

window_frames = []
for i in range(0, nb_windows):
    in_window = (valid_data['Window_lower'] <= i) & (valid_data['Window_upper_excl'] > i)
    gb = valid_data.loc[in_window].groupby('Src_IP')
    stats = gb.agg({'Src_Port':'nunique',
                    'Dst_IP':'nunique',
                    'Dst_Port':'nunique',
                    'Duration':['sum', 'mean', 'std', 'max', 'median'],
                    'BiDirection_Bytes':['sum', 'mean', 'std', 'max', 'median'],
                    'SrcToDst_Bytes':['sum', 'mean', 'std', 'max', 'median'],
                    'Label':lambda x: mode(x)[0]})  # most frequent label of the group
    stats.columns = ["_".join(c) if isinstance(c, tuple) else c for c in stats.columns]
    window_frames.append(
        gb.size().to_frame(name='counts').join(stats).reset_index().assign(window_id=i)
    )

# pd.concat raises on an empty list, so fall back to an empty frame.
X = pd.concat(window_frames) if window_frames else pd.DataFrame()

In [None]:
# Collapse tuple column labels such as ('Duration', 'sum') into 'Duration_sum';
# plain string labels pass through unchanged.
X.columns = [label if not isinstance(label, tuple) else "_".join(label) for label in X.columns]
# Replace missing aggregates with the sentinel value -1.
X = X.fillna(-1)

In [None]:
# Standardize every feature column; the source IP, the label and the window
# index are excluded from normalization.
columns_to_normalize = X.columns.tolist()
for excluded in ('Src_IP', 'Label_<lambda>', 'window_id'):
    columns_to_normalize.remove(excluded)

normalize_column(X, columns_to_normalize)

# Abbreviated preview: shape, contents and dtypes of the normalized frame.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 22):
    for preview in (X.shape, X, X.dtypes):
        print(preview)

Saving data 

In [None]:
# Persist feature set 1.
np.save("test_set1_IPs.npy", X['Src_IP'])
# `labels` was never defined in this notebook (NameError). `valid_label`, the
# per-flow label Series, is the closest defined object.
# NOTE(review): confirm whether the per-flow labels or the per-window
# 'Label_<lambda>' column is what downstream code expects in this file.
np.save("test_set1_Labels.npy", valid_label)
# Drop the source IP column; the HDF5 copy is read back later to build set 3.
X = X.drop('Src_IP', axis=1)
X.to_hdf('test_set1.h5', key="data", mode="w")
X.to_csv("test_set1.csv")  # Export the preprocessed data set to CSV

Generating features for set 2

In [None]:
def RU(df):
    """Entropy of the values in ``df`` divided by the log of the sample count.

    Parameters
    ----------
    df : pandas.Series
        Observations whose value distribution is measured (anything offering
        ``shape`` and ``value_counts`` works).

    Returns
    -------
    float
        ``-sum(p * log10(p)) / log10(n)``, where ``p`` are the relative
        frequencies of the distinct values and ``n`` is the number of
        observations. A single observation returns 1.0 by convention.
    """
    n_obs = df.shape[0]
    if n_obs == 1:
        # log10(1) == 0 would make the ratio undefined.
        return 1.0
    proba = df.value_counts() / n_obs
    entropy_terms = proba * np.log10(proba)
    return -entropy_terms.sum() / np.log10(n_obs)

In [None]:
# Start feature set 2 from an empty frame and recompute the number of windows.
# `valid_data` is the preprocessed flow table; the previously used name `data`
# was never defined and raised NameError.
X = pd.DataFrame()
nb_windows = valid_data['Window_upper_excl'].max()

In [None]:
# Feature set 2: RU() of source port, destination IP and destination port for
# each source IP in each sliding window.
# Reads `valid_data` (the old name `data` was undefined) and concatenates the
# per-window frames once at the end, since DataFrame.append was removed in
# pandas 2.0 and is quadratic inside a loop.
window_frames = []
for i in range(0, nb_windows):
    in_window = (valid_data['Window_lower'] <= i) & (valid_data['Window_upper_excl'] > i)
    gb = valid_data.loc[in_window].groupby('Src_IP')
    window_frames.append(gb.agg({'Src_Port':[RU],
                                 'Dst_IP':[RU],
                                 'Dst_Port':[RU]}).reset_index())

# pd.concat raises on an empty list, so fall back to an empty frame.
X = pd.concat(window_frames) if window_frames else pd.DataFrame()
print(X.shape)  # final shape, printed once instead of on every iteration

In [None]:
# Join two-level column labels with '_' (e.g. ('Src_Port', 'RU') -> 'Src_Port_RU');
# labels that are not tuples are kept unchanged.
X.columns = [label if not isinstance(label, tuple) else "_".join(label) for label in X.columns]

In [None]:
# Standardize every column except the source-IP identifier.
columns_to_normalize = X.columns.tolist()
columns_to_normalize.remove('Src_IP_')
normalize_column(X, columns_to_normalize)

In [None]:
# Abbreviated preview: shape, contents and dtypes of feature set 2.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 22):
    for preview in (X.shape, X, X.dtypes):
        print(preview)

Saving the data set

In [None]:
# Persist feature set 2.
np.save("test_set2_IPs.npy", X['Src_IP_'])
# `labels` was never defined in this notebook (NameError). `valid_label`, the
# per-flow label Series, is the closest defined object.
# NOTE(review): confirm that per-flow labels are what downstream code expects here.
np.save("test_set2_labels.npy", valid_label)
# Drop the source IP column; the HDF5 copy is read back later to build set 3.
X = X.drop('Src_IP_', axis=1)
X.to_hdf('test_set2.h5', key="data", mode="w")
X.to_csv("test_set2.csv")  # Export the preprocessed data set to CSV

Set 3 is generated by combining the features generated in both set 1 and set 2, and then selecting among them on the basis of their correlation

In [None]:
# Reload feature set 1 and replace its index with a fresh 0..n-1 range.
X = pd.read_hdf('test_set1.h5', key='data').reset_index(drop=True)

In [None]:
# Reload feature set 2 and replace its index with a fresh 0..n-1 range.
X2 = pd.read_hdf('test_set2.h5', key='data').reset_index(drop=True)

In [None]:
# Put the set-2 columns next to the set-1 columns (index-aligned join), drop
# the window index and write the combined table to CSV.
X = X.join(X2).drop('window_id', axis=1)
X.to_csv("test_set3.csv")

In [None]:
# Reload the combined feature table from the working directory, which is where
# it was written, instead of a hard-coded Colab Drive path.
df = pd.read_csv("test_set3.csv", index_col=False)
df = df.drop(['Unnamed: 0'], axis=1)  # index column added by to_csv
c = df.corr()  # pairwise correlation between the columns
sns.heatmap(c)

# For every pair (i, j) with i < j and correlation >= 0.9, discard column j.
# Only positive correlations are tested, as in the original logic.
# NOTE(review): every column of the file takes part in this filter, including
# a label column if one is present -- confirm this is intended.
columns = np.full((c.shape[0],), True, dtype=bool)
for i in range(c.shape[0]):
    for j in range(i+1, c.shape[0]):
        if columns[j] and c.iloc[i,j] >= 0.9:
            columns[j] = False
# The mask has one entry per row/column of `c`, so it is applied to c.columns.
selected_columns = c.columns[columns]

In [None]:
# Keep only the selected columns and overwrite the set-3 CSV with the result.
df_new = df.loc[:, selected_columns]
df_new.to_csv("test_set3.csv")  # Export the preprocessed data to CSV