# Network Forensic Data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# First glance at the data

We will perform a preliminary analysis of the data set containing intrusion detection information.

In [18]:
data_path = 'UNSW-NB15_1.csv'
data = pd.read_csv(data_path, na_values=['?'])

feat_path = 'NUSW-NB15_features.csv'
features = pd.read_csv(feat_path,sep=",", encoding='cp1252')
attacks_names = features['Name']
data.columns = attacks_names

# How many records and attributes are there?
print('There are ' + str(data.shape[0]) + ' records')
print('There are ' + str(data.columns.size) + ' attributes:')

# What are the attributes?
data_types = data.dtypes
print(data_types)

# Let's have a look at the first couple of records
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


There are 700000 records
There are 49 attributes:
Name
srcip                object
sport                object
dstip                object
dsport               object
proto                object
state                object
dur                 float64
sbytes                int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans_depth           int64
res_bdy_len           int64
Sjit                float64
Djit                float64
Stime                 int64
Ltime                 int64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              f

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
1,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
2,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
3,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
4,59.166.0.0,32119,149.171.126.9,111,udp,CON,0.078339,568,312,31,...,0,2,4,2,3,1,1,2,,0


The describe method shows basic statistical characteristics of each numerical feature in a data frame (int64 and float64 types): number of non-missing values, mean, standard deviation, range, median, 0.25 and 0.75 quartiles.

In [19]:
data.describe()

Name,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,...,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,0.863392,5009.643,48330.44,38.513859,31.980787,6.330453,21.326301,6024278.0,3011026.0,41.7467,...,0.019623,0.031116,6.380244,6.034167,4.153237,4.668596,1.685499,1.401911,2.188107,0.031736
std,25.935783,44898.71,186051.7,39.996983,30.450778,18.855991,64.733418,51814950.0,4499955.0,80.198772,...,0.1387,0.278212,5.075149,4.635611,3.74044,4.189669,1.871053,1.426014,2.2112,0.175296
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,0.003931,424.0,304.0,31.0,29.0,0.0,0.0,78858.79,76404.35,4.0,...,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0
50%,0.027594,2024.0,3380.0,31.0,29.0,4.0,5.0,536271.8,668675.9,16.0,...,0.0,0.0,5.0,5.0,3.0,4.0,1.0,1.0,1.0,0.0
75%,0.346519,3806.0,22432.0,31.0,29.0,7.0,16.0,1078489.0,3856223.0,52.0,...,0.0,0.0,9.0,8.0,5.0,6.0,2.0,1.0,3.0,0.0
max,8786.637695,13677390.0,14655420.0,255.0,254.0,5096.0,5483.0,5268000000.0,128761900.0,10200.0,...,1.0,8.0,44.0,42.0,42.0,50.0,36.0,34.0,38.0,1.0


In order to obtain statistics for nominal features, you should explicitly specify that you want to consider the `object` data type. For each nominal feature in the data, you will get the number of non-missing values, unique values count, most common value and the corresponding count:

In [20]:
data.describe(include=['object'])

Name,srcip,sport,dstip,dsport,proto,state,service,attack_cat
count,700000,700000,700000,700000,700000,700000,700000,22215
unique,40,100272,44,87819,135,16,13,9
top,59.166.0.2,0,149.171.126.4,53,tcp,FIN,-,Generic
freq,67209,6057,67331,107483,494737,487911,430656,7522


# Dealing with missing values

You can notice that a few columns contain missing values. Are they missing at random or is there a pattern? Let's start with the `attack_cat` attribute.

In [41]:
data[data['attack_cat'].isnull()].describe(include=['object'])

Name,srcip,sport,dstip,dsport,proto,state,service,attack_cat
count,677785,677785,677785,677785,677785,677785,677785,0.0
unique,40,98657,44,87444,9,16,9,0.0
top,59.166.0.2,0,149.171.126.4,53,tcp,FIN,-,
freq,67209,5494,67331,105228,485236,478465,418937,


From the table above we can see that `attack_cat`' is missing for everywhere. We ignore it.

In [44]:
#data['attack_cat'].fillna(value='f',inplace=True)
prev_num_cols = len(data.columns)
data.dropna(axis='columns', thresh=len(data.index) * 0.40, inplace=True)
print("Removed " + str(prev_num_cols - len(data.columns)) + " columns with all NaN values.")

Removed 1 columns with all NaN values.


# Dealing with abnormally distibuted columns

1. No Variation

In [45]:
cols_to_drop = []

for col in data:
    if not data[col].nunique() > 1:
        cols_to_drop.append(col)
        
data.drop(columns=cols_to_drop, inplace=True)
print("Removed " + str(len(cols_to_drop)) +
      " columns with no variation in its values.")
print("DataFrame's current shape: " + str(data.shape))

2. Too much variation

In [47]:
cols_to_drop.clear()

for col in data:
    if data[col].nunique() >= (len(data.index) * 0.50):
        cols_to_drop.append(col)

data.drop(columns=cols_to_drop, inplace=True)
print("Removed " + str(len(cols_to_drop)) +
      " columns with over 50% variation in its values")

Removed 8 columns with over 50% variation in its values


# Summary Tables

In [49]:
data['srcip'].value_counts(normalize=True)

59.166.0.2         0.096013
59.166.0.0         0.095896
59.166.0.5         0.095844
59.166.0.4         0.095317
59.166.0.1         0.095124
59.166.0.3         0.094493
59.166.0.6         0.092413
59.166.0.8         0.092343
59.166.0.9         0.091696
59.166.0.7         0.091036
175.45.176.1       0.020464
149.171.126.18     0.008586
175.45.176.3       0.007326
175.45.176.0       0.006831
175.45.176.2       0.004623
10.40.85.1         0.002400
10.40.182.1        0.002386
10.40.85.30        0.001269
10.40.170.2        0.001249
10.40.182.3        0.001249
149.171.126.1      0.000359
149.171.126.5      0.000356
149.171.126.6      0.000344
149.171.126.2      0.000331
149.171.126.3      0.000330
149.171.126.4      0.000321
149.171.126.8      0.000310
149.171.126.9      0.000310
149.171.126.7      0.000296
149.171.126.0      0.000274
192.168.241.243    0.000154
149.171.126.11     0.000023
149.171.126.15     0.000011
149.171.126.16     0.000009
149.171.126.10     0.000004
149.171.126.19     0

In [50]:
pd.crosstab(data['proto'], data['state'],normalize=True)

state,ACC,CLO,CON,ECO,ECR,FIN,INT,MAS,PAR,REQ,RST,TST,TXD,URH,URN,no
proto,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3pc,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a/n,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aes-sp3-d,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
any,0.0,0.0,0.0,0.0,0.0,0.0,0.00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
argus,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wsn,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xnet,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xns-idp,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xtp,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data visualisation
Visualisation is the primary way to get a high-level understanding of the data. We use `matplotlib` as the plotting engine, whereas `seaborn` provides a plethora of convenient shortcuts to most common plotting tasks. The following snippet imports these packages.

In [56]:
#%matplotlib inline
import matplotlib as plt
#import seaborn as sns