COMP90073 Assignment 2 - Task 1

Name - Ribhav Shridhar

Student ID - 1037144


Preprocessing the netFlow Data and generating 3 different feature sets

In [None]:
import pandas as pd
import numpy as np

Ingesting the data set - Training 

The columns are then given descriptive names for better understanding and readability.


In [None]:
# Column labels for the headerless NetFlow export, listed in file order.
netflow_columns = [
    'Date_Flow_Start', 'Duration', 'Protocol', 'Src_IP', 'Src_Port',
    'Direction', 'Dst_IP', 'Dst_Port', 'State', 'Source_Service',
    'Dest_Service', 'Total_Packets', 'BiDirection_Bytes', 'SrcToDst_Bytes',
]

# The CSV is read without a header row; the labels are attached afterwards.
training_data = pd.read_csv("training_data.csv", header=None)
training_data.columns = netflow_columns

**Preprocessing and feature engineering referenced from - https://github.com/antoinedelplace/Cyberattack-Detection**

Generating features for the 1st set



In [None]:
# Sliding-window parameters. They are combined with timestamps expressed in
# seconds (see the conversion below), so both values are in seconds.
window_width = 120
window_stride = 60

# Convert the flow start time to seconds: datetime -> int64 nanoseconds -> * 1e-9.
training_data['Date_Flow_Start'] = pd.to_datetime(training_data['Date_Flow_Start']).astype(np.int64) * 1e-9
datetime_start = training_data['Date_Flow_Start'].min()

# A flow is later assigned to every window i with Window_lower <= i < Window_upper_excl.
training_data['Window_lower'] = (training_data['Date_Flow_Start'] - datetime_start - window_width) / window_stride + 1

# Series.clip returns a new Series; the result must be assigned back, otherwise
# the clamp to the first window (index 0) silently has no effect.
training_data['Window_lower'] = training_data['Window_lower'].clip(lower=0)
training_data['Window_upper_excl'] = (training_data['Date_Flow_Start'] - datetime_start) / window_stride + 1

# Window bounds are used as integer indices from here on.
training_data = training_data.astype({"Window_lower": int, "Window_upper_excl": int})
# The raw timestamp is not used after the window bounds are computed.
training_data = training_data.drop(columns='Date_Flow_Start')

# Accumulator for the per-window features and the number of windows to scan.
X = pd.DataFrame()
nb_windows = training_data['Window_upper_excl'].max()

In [None]:
# Aggregations computed per source IP inside each time window.
window_aggregations = {
    'Src_Port': 'nunique',
    'Dst_IP': 'nunique',
    'Dst_Port': 'nunique',
    'Duration': ['sum', 'mean', 'std', 'max', 'median'],
    'BiDirection_Bytes': ['sum', 'mean', 'std', 'max', 'median'],
    'SrcToDst_Bytes': ['sum', 'mean', 'std', 'max', 'median'],
}

# Collect one frame per window and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
window_frames = []
for i in range(0, nb_windows):
    in_window = (training_data['Window_lower'] <= i) & (training_data['Window_upper_excl'] > i)
    gb = training_data.loc[in_window].groupby('Src_IP')

    counts = gb.size().to_frame(name='counts')
    stats = gb.agg(window_aggregations)
    # Turn the two-level (column, statistic) labels into a flat index of tuples
    # so both frames have single-level columns before joining. The tuple
    # labels themselves are kept unchanged.
    stats.columns = stats.columns.to_flat_index()

    window_frames.append(counts.join(stats).reset_index().assign(window_id=i))

# Leave X untouched when there is no window to aggregate.
if window_frames:
    X = pd.concat(window_frames)

In [None]:
X.to_csv("X.csv")  # Checkpoint the raw window aggregates before post-processing

# `training_data` is deliberately NOT deleted here: the second feature set
# further down reads it again (window count and per-window grouping), so
# deleting it would raise a NameError when the notebook is run top to bottom.

# Flatten tuple column labels such as ('Duration', 'mean') into 'Duration_mean';
# plain string labels are kept as they are.
X.columns = ["_".join(label) if isinstance(label, tuple) else label for label in X.columns]

# Replace missing aggregates (e.g. the std of a group with a single flow) by -1.
X = X.fillna(-1)

In [None]:
# Every column is normalized except the source-IP key and the window index.
columns_to_normalize = X.columns.tolist()
for excluded in ('Src_IP', 'window_id'):
    columns_to_normalize.remove(excluded)

Normalizing the data set to make data meaningful for modeling

In [None]:
def normalize_column(dt, column):
    """Standardise one or more columns of ``dt`` in place (z-score).

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame whose columns are overwritten with ``(value - mean) / std``.
    column : label or list of labels
        A single column label or a list of column labels to normalize.

    Notes
    -----
    The mean and standard deviation are printed before the columns are
    rewritten. A column with zero standard deviation (all values equal) is
    only centred, so it becomes all zeros instead of NaN from a 0/0 division.
    """
    mean = dt[column].mean()
    std = dt[column].std()
    print(mean, std)

    # Avoid dividing by a zero spread; NaN spreads are left untouched.
    if isinstance(std, pd.Series):
        scale = std.where(std != 0, 1.0)
    else:
        scale = 1.0 if std == 0 else std

    dt[column] = (dt[column] - mean) / scale

In [None]:
# Standardise the selected feature columns of X in place; the Src_IP and
# window_id columns were removed from the list beforehand and stay untouched.
normalize_column(X, columns_to_normalize)

In [None]:
# Show the shape, a truncated view of the frame and the column dtypes,
# limiting the display to 10 rows and 22 columns for this cell only.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 22):
    for preview in (X.shape, X, X.dtypes):
        print(preview)

Dropping the Source IP from the dataset and saving them in hdf for future reference

In [None]:
# Keep the source IPs aside, then continue with the feature columns only.
src_ips = X['Src_IP']
X = X.drop(columns='Src_IP')

X.to_hdf('set1.h5', key="data", mode="w")  # Features without Src_IP, reused later to build set 3
np.save("set1_IP.npy", src_ips)            # Source IPs stored separately for future reference
X.to_csv("training_data_set1.csv")         # Export the pre-processed data set to CSV

Generating features of the 2nd set

In [None]:
def RU(df):
    """Return the normalized (base-10) entropy of the values in ``df``.

    ``df`` is expected to expose ``shape`` and ``value_counts()`` (e.g. a
    pandas Series). The entropy of the empirical value distribution is
    divided by ``log10`` of the number of observations. A single observation
    returns 1.0 directly.
    """
    n_obs = df.shape[0]
    if n_obs == 1:
        return 1.0

    # Relative frequency of each distinct value.
    freqs = df.value_counts() / n_obs
    weighted_logs = freqs * np.log10(freqs)
    return -weighted_logs.sum() / np.log10(n_obs)

In [None]:
# Start a fresh, empty accumulator for the second feature set.
X = pd.DataFrame()
# Number of windows to scan, taken from the largest exclusive upper window bound.
# NOTE(review): this requires `training_data` (with its window columns) to still
# be defined at this point of the notebook.
nb_windows = training_data['Window_upper_excl'].max()

In [None]:
# Collect one frame per window and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
ru_frames = []
rows_so_far = 0
for i in range(0, nb_windows):
    in_window = (training_data['Window_lower'] <= i) & (training_data['Window_upper_excl'] > i)
    gb = training_data.loc[in_window].groupby('Src_IP')

    # RU (normalized entropy) of ports and destinations per source IP.
    window_ru = gb.agg({'Src_Port': [RU],
                        'Dst_IP': [RU],
                        'Dst_Port': [RU]}).reset_index()
    ru_frames.append(window_ru)

    # Progress output: the shape the accumulated frame has reached so far.
    rows_so_far += window_ru.shape[0]
    print((rows_so_far, window_ru.shape[1]))

# Leave X untouched when there is no window to aggregate.
if ru_frames:
    X = pd.concat(ru_frames)

In [None]:
# Collapse tuple column labels into single strings joined by "_";
# labels that are not tuples are kept unchanged.
flat_labels = []
for label in X.columns:
    flat_labels.append("_".join(label) if isinstance(label, tuple) else label)
X.columns = flat_labels

In [None]:
# Every column is normalized except the source-IP key (named 'Src_IP_' in this frame).
columns_to_normalize = X.columns.tolist()
columns_to_normalize.remove('Src_IP_')

In [None]:
# Standardise the selected feature columns of X in place; the 'Src_IP_' column
# was removed from the list beforehand and stays untouched.
normalize_column(X, columns_to_normalize)

In [None]:
# Show the shape, a truncated view of the frame and the column dtypes,
# limiting the display to 10 rows and 22 columns for this cell only.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 22):
    for preview in (X.shape, X, X.dtypes):
        print(preview)

In [None]:
# Keep the source IPs aside, then continue with the feature columns only.
src_ips = X['Src_IP_']
X = X.drop(columns='Src_IP_')

X.to_hdf('set2.h5', key="data", mode="w")  # Features without the source IP, reused later to build set 3
np.save("set2_IP.npy", src_ips)            # Source IPs stored separately for future reference
X.to_csv("training_data_set2.csv")         # Export the pre-processed data set to CSV




Set 3 is generated by combining the features from both set 1 and set 2 and then selecting among them on the basis of their correlation.

In [None]:
# Reload the set-1 features and renumber the rows from 0.
X = pd.read_hdf('set1.h5', key='data').reset_index(drop=True)

In [None]:
# Reload the set-2 features and renumber the rows from 0.
X2 = pd.read_hdf('set2.h5', key='data').reset_index(drop=True)

In [None]:
# Put the set-2 columns next to the set-1 columns (matched on the row index),
# then discard the window index column.
# NOTE(review): this presumes both sets list their rows in the same order - confirm.
X = X.join(X2).drop(columns='window_id')

In [None]:
# Write the combined set-1 + set-2 features to CSV (the row index is written as the first column).
X.to_csv("training_data_set3.csv")

In [None]:
# Reload the combined feature set from the CSV written earlier in this notebook.
# It is read from the working directory, where it was saved, instead of a
# machine-specific Google Drive mount that this notebook never sets up.
data = pd.read_csv("training_data_set3.csv", index_col=False)
# The saved row index comes back as an unnamed first column; drop it.
data = data.drop(columns=['Unnamed: 0'])

In [None]:
# Pairwise correlation matrix of the combined features. The frame loaded in
# this notebook is named `data`; the name `df` is never defined.
c = data.corr()

In [None]:
# Visualise the correlation matrix as a heatmap.
# NOTE(review): `sns` is presumably seaborn, but no import of it is visible in
# this notebook - confirm and add `import seaborn as sns` to the import cell.
sns.heatmap(c)

In [None]:
# A column is dropped when any earlier column (strict upper triangle of the
# correlation matrix) has a correlation of at least 0.9 with it.
too_similar = np.triu(c.values >= 0.9, k=1)
columns = ~too_similar.any(axis=0)
selected_columns = data.columns[columns]

In [None]:
# Keep only the columns that passed the correlation filter. The frame loaded in
# this notebook is named `data`; the name `df` is never defined.
data_new = data[selected_columns]

In [None]:
# Export the correlation-filtered feature set to CSV. This uses the same file
# name as the unfiltered set 3 saved earlier in the notebook.
data_new.to_csv("training_data_set3.csv") # Exporting the pre-processed data set to csv