In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(current_dir, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

# Input

In [None]:
# Directory holding the UNSW-NB15 CSV files.
root = "/kaggle/input/unsw-nb15/"

# The two data splits, the attack-event listing and the feature description table.
train = pd.read_csv(os.path.join(root, "UNSW_NB15_training-set.csv"))
test = pd.read_csv(os.path.join(root, "UNSW_NB15_testing-set.csv"))
list_events = pd.read_csv(os.path.join(root, "UNSW-NB15_LIST_EVENTS.csv"))
# The features file is read with an explicit cp1252 encoding.
features = pd.read_csv(os.path.join(root, "NUSW-NB15_features.csv"), encoding='cp1252')

According to the [official site](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/), the train and test sets have 175341 and 82332 rows respectively.

In [None]:
print(train.shape, test.shape)
# The training split is expected to be the larger of the two. Compare the two
# sizes directly instead of relying on a hard-coded row-count threshold, so the
# check keeps working if the files are sampled or updated.
if train.shape[0] < test.shape[0]:
    print("Train test sets are reversed. Fixing them.")
    train, test = test, train

In [None]:
# Tag every row with the split it came from (this adds a 'type' column to the
# train and test frames themselves), then stack both splits into one frame
# with a fresh 0..n-1 index and without the 'id' column.
for split_frame, split_name in ((train, 'train'), (test, 'test')):
    split_frame['type'] = split_name
total = pd.concat([train, test], axis=0, ignore_index=True).drop(columns=['id'])
# train and test are kept around: later cells still read them.

# Utils

In [None]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime


def reduce_mem_usage(df, use_float16=False):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed integer columns (dtype name starting with ``int``) are cast to the
    smallest of int8/int16/int32/int64 whose range contains the column's
    minimum and maximum (bounds inclusive). Float columns (dtype name starting
    with ``float``) are cast to float32 when their range fits, otherwise
    float64; float16 is only considered when ``use_float16`` is true.
    Datetime, categorical, object, boolean, unsigned and any other dtypes are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Memory usage before and after, and the relative decrease, are printed.
    Casting floats to a narrower type can lose precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            # skip datetime type or categorical type
            continue

        type_name = str(col_type)
        if type_name.startswith('int'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Inclusive bounds: a column whose extreme value equals the type
            # limit (e.g. 127 for int8) still fits in that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name.startswith('float'):
            c_min = df[col].min()
            c_max = df[col].max()
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            if use_float16 and half.min <= c_min and c_max <= half.max:
                target = np.float16
            elif single.min <= c_min and c_max <= single.max:
                target = np.float32
            else:
                # Also reached when min/max are NaN (empty or all-NaN column),
                # because every comparison with NaN is False.
                target = np.float64
            df[col] = df[col].astype(target)
        # Anything else (object, bool, unsigned ints, ...) is deliberately left
        # as is: casting those to float would waste memory or lose precision.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
def standardize(df):
    """Return ``df`` with its mean subtracted and divided by its standard deviation.

    Uses ``df.mean()`` and ``df.std()`` as computed by pandas.
    """
    centered = df - df.mean()
    spread = df.std()
    return centered / spread
    
def min_max(df):
    """Rescale ``df`` as ``(df - min) / (max - min)`` using pandas ``min``/``max``."""
    lowest = df.min()
    value_range = df.max() - df.min()
    shifted = df - lowest
    return shifted / value_range

def normalize(df):
    """Apply ``sklearn.preprocessing.normalize`` to ``df`` and return a DataFrame.

    The result keeps the column labels and the row index of ``df`` so it can
    be aligned with the original frame, like ``standardize`` and ``min_max``.
    """
    # pd.DataFrame (capital F): ``pd.Dataframe`` does not exist and raised
    # AttributeError on every call.
    return pd.DataFrame(preprocessing.normalize(df), columns=df.columns, index=df.index)

In [None]:
total = reduce_mem_usage(total)

# List of Events

In [None]:
list_events.shape

In [None]:
list_events.head()

In [None]:
list_events['Attack category'].unique()

In [None]:
list_events['Attack subcategory'].unique()

# Features

In [None]:
features.head(features.shape[0])

In [None]:
# The Name column has camel-case values; lower-case them.
features['Name'] = features['Name'].str.lower()
# The following 4 columns are address related and not in the train dataset.
address_columns = ['srcip', 'sport', 'dstip', 'dsport']
features = (
    features[~features['Name'].isin(address_columns)]
    # drop=True renumbers the rows without creating a throw-away 'index' column
    .reset_index(drop=True)
    # errors='ignore' keeps this cell re-runnable once 'No.' is already gone
    .drop(columns=['No.'], errors='ignore')
)

# Data

In [None]:
# Split the training rows by label: 0 -> normal traffic, 1 -> anomaly (attack).
train_labels = train['label']
normal = train.loc[train_labels == 0]
anomaly = train.loc[train_labels == 1]

## Some difference with features file

In [None]:
# Names present in the train frame but not in the features table, then the reverse.
train_column_set = set(train.columns)
feature_name_set = set(features['Name'].values)
print(sorted(train_column_set - feature_name_set))
print(sorted(feature_name_set - train_column_set))

Some of the column names in features file are wrong and we are going to fix them. 

In [None]:
# Mapping from the names used in the features file to the column names used in the data.
fix = {'ct_src_ ltm': 'ct_src_ltm', 'dintpkt': 'dinpkt', 'dmeansz': 'dmean', 'res_bdy_len': 'response_body_len', 'sintpkt': 'sinpkt', 'smeansz': 'smean'}
# Series.replace with a dict swaps exact matches and leaves every other name untouched.
features['Name'] = features['Name'].replace(fix)
# Written without the row index, consistent with the final output cell that
# writes the same file.
features.to_csv('features.csv', index=False)

In [None]:
# Same comparison as before the rename: train-only names, then features-only names.
train_column_set = set(train.columns)
feature_name_set = set(features['Name'].values)
print(sorted(train_column_set - feature_name_set))
print(sorted(feature_name_set - train_column_set))

There are still some differences. `stime` and `ltime` record when the recording started and ended, so they shouldn't be valuable in training; their absence from the train set makes sense. `id` is just the row number, and `rate` might be related to packet sending speed or data rate.

## Checking data types

In [None]:
train.head()

In [None]:
train.dtypes

* categorical: state, service, proto
* target  = attack_cat, label
* integer but categorical = is_sm_ips_ports, ct_state_ttl, is_ftp_login
* integer = spkts, dpkts, sbytes, dbytes, sttl, dttl, sload, dload, sloss, dloss, swin, dwin, stcpb, dtcpb, smean, dmean, trans_depth, response_body_len, ct_srv_src, ct_dst_ltm, ct_src_dport_ltm, ct_dst_sport_ltm, ct_dst_src_ltm, ct_ftp_cmd, ct_flw_http_mthd, ct_src_ltm, ct_srv_dst
* decimal = dur, rate, sinpkt, dinpkt, sjit, djit, tcprtt, synack, ackdat

# Correlation matrix
Why checking correlation is important ? Check these links:
* [Why Feature Correlation Matters …. A Lot!](https://towardsdatascience.com/why-feature-correlation-matters-a-lot-847e8ba439c4) and 
* [Feature selection — Correlation and P-value](https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf)

In [None]:
def show_correlation(data, method='pearson'):
    """Plot a heatmap of the pairwise column correlations and return the matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part in the
        correlation; other columns (e.g. strings) are ignored.
    method : str, default 'pearson'
        Correlation method passed to ``DataFrame.corr``
        ('pearson', 'kendall' or 'spearman').

    Returns
    -------
    pandas.DataFrame
        The correlation matrix that was plotted.
    """
    # Select the numeric columns explicitly so that string columns do not make
    # ``corr`` fail on pandas versions that no longer drop them silently.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    # Honour the requested method (it used to be hard-coded to 'pearson').
    correlation_matrix = numeric_data.corr(method=method)
    plt.figure(figsize=(12, 9))
    # pass annot=True if the figure should show the correlation score too
    sns.heatmap(correlation_matrix, vmax=0.8, square=True)
    plt.show()
    return correlation_matrix

def top_correlations(correlations, limit=0.9):
    """Print each column pair above the diagonal whose correlation is >= ``limit``.

    ``correlations`` is a square correlation matrix (DataFrame). Each match is
    printed as ``"<first column> <second column> <value>"``. Nothing is returned.
    """
    names = correlations.columns
    size = correlations.shape[0]
    for row in range(size):
        # Only look right of the diagonal so every pair is visited once.
        for column in range(row + 1, size):
            value = correlations.iloc[row, column]
            if not (value >= limit):
                continue
            print(f"{names[row]} {names[column]} {value}")
def print_correlations(correlations, col1=None, col2=None):
    """Print the first above-diagonal pair matching ``col1``/``col2``, then stop.

    A pair matches when ``col1`` and ``col2`` equal its two column names in
    either order; an argument left as ``None`` matches any column. The pair is
    printed as ``"<first column> <second column> <value>"``. Nothing is
    returned, and nothing is printed when no pair matches.
    """
    names = correlations.columns
    size = correlations.shape[0]

    def _accepts(wanted, actual):
        # None acts as a wildcard.
        return wanted is None or wanted == actual

    for row in range(size):
        for column in range(row + 1, size):
            first, second = names[row], names[column]
            same_order = _accepts(col1, first) and _accepts(col2, second)
            if same_order or (_accepts(col1, second) and _accepts(col2, first)):
                print(f"{first} {second} {correlations.iloc[row, column]}")
                return
            
def find_corr(df1, df2):
    """Return the correlation between ``df1`` and ``df2``.

    The two inputs are placed side by side as columns, the correlation matrix
    is computed with the pandas default method, and the entry relating the
    first column to the second is returned.
    """
    side_by_side = pd.concat([df1, df2], axis=1)
    matrix = side_by_side.corr()
    return matrix.iloc[0, 1]

def corr(col1, col2='label', df=None):
    """Return the correlation between two columns of a frame.

    Parameters
    ----------
    col1 : str
        Name of the first column.
    col2 : str, default 'label'
        Name of the second column.
    df : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``total`` frame is looked up at call time, so the function keeps
        working even if ``total`` is later rebound to a new object (a default
        of ``df=total`` would stay tied to the object that existed when the
        function was defined).

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 correlation matrix (pandas default method).
    """
    if df is None:
        df = total
    return pd.concat([df[col1], df[col2]], axis=1).corr().iloc[0, 1]

## Pearson

In [None]:
correlation_matrix = show_correlation(total)

In [None]:
top_correlations(correlation_matrix, limit=0.9)

## Spearman

In [None]:
correlation_matrix = show_correlation(train, method='spearman')

In [None]:
top_correlations(correlation_matrix, limit=0.9)

Most correlated features are :
* spkts, sbytes, sloss 
* dpkts, dbytes, dloss
* sinpkt, is_sm_ips_ports
* swin, dwin
* tcprtt, synack
* ct_srv_src, ct_srv_dst, ct_dst_src_ltm, ct_src_dport_ltm, ct_dst_sport_ltm 
* is_ftp_login ct_ftp_cmd

In [None]:
sns.pairplot(total[['spkts', 'sbytes', 'sloss']])

In [None]:
sns.pairplot(total[['dpkts', 'dbytes', 'dloss']])

In [None]:
sns.pairplot(total[['sinpkt', 'is_sm_ips_ports']])

In [None]:
sns.pairplot(total[['swin', 'dwin']])

# plot utils

In [None]:
def dual_plot(col, data1=normal, data2=anomaly, label1='normal', label2='anomaly', method=None):
    """Overlay the distribution of ``col`` for two frames on the current axes.

    Each frame's column is drawn with ``sns.distplot`` (no histogram, with a
    rug) and labelled ``label1`` / ``label2``. When ``method`` is given it is
    applied to the column via ``Series.apply`` before plotting. A legend is
    added at the end.
    """
    for frame, tag in ((data1, label1), (data2, label2)):
        values = frame[col]
        if method is not None:
            values = values.apply(method)
        sns.distplot(values, label=tag, hist=False, rug=True)
    plt.legend()
    
def catplot(data, col):
    """Show count plots of ``col``, one panel per ``type`` value, coloured by ``label``.

    ``data`` must contain the columns ``col``, ``label`` and ``type``.
    The legend entries are labelled 'normal' and 'attack'.
    """
    # sns.catplot returns a FacetGrid (not a single Axes).
    grid = sns.catplot(x=col, hue="label", col="type", data=data, kind="count", height=5, legend=False, aspect=1.4)
    grid.set_titles("{col_name}")
    grid.add_legend(loc='upper right', labels=['normal', 'attack'])
    # plt.show() renders the current figures; it does not take a figure or
    # grid argument, so the grid must not be passed to it.
    plt.show()

# Categorical
These four columns are categorical: 'attack_cat', 'state', 'service', 'proto'. Among them 'attack_cat' isn't a feature.
These features are categorical but in integer form : 'is_sm_ips_ports', 'ct_state_ttl', 'is_ftp_login'.

In [None]:
def create_count_df(col, data=None):
    """Return the value counts of ``data[col]`` with their share of all rows.

    Parameters
    ----------
    col : str
        Column to count.
    data : pandas.DataFrame, optional
        Frame holding ``col``. When omitted, the notebook-level ``total``
        frame is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        Columns ``[col, 'count', 'percent']``, sorted by ``percent`` in
        descending order. ``percent`` is ``count * 100 / len(data)``.
    """
    if data is None:
        data = total
    counts = data[col].value_counts()
    # Build the frame column by column so 'count' and 'percent' keep numeric
    # dtypes (going through a mixed ``.values`` array turns them into object).
    df = pd.DataFrame({col: counts.index, 'count': counts.to_numpy()})
    df['percent'] = df['count'] * 100 / data.shape[0]
    # A stable sort keeps the value_counts order among equal percentages.
    return df.sort_values(by='percent', ascending=False, kind='stable')

## Label
0 for normal and 1 for attack records

In [None]:
create_count_df('label', train)

In [None]:
create_count_df('label', test)

So it seems the dataset is pretty balanced, unlike real world data where attack scenarios are rare. Moreover, here attack connections are more than normal connections.

## State
Indicates the state and its dependent protocol, e.g. ACC, CLO, CON, ECO, ECR, FIN, INT, MAS, PAR, REQ, RST, TST, TXD, URH, URN, and (-) (if the state is not used).

In [None]:
col = 'state'
create_count_df(col, train)

In [None]:
# Relies on `col` set in the previous cell.
# Keep the five states listed below as they are and relabel every other state
# value (described as rare in the train set) as 'others'.
total.loc[~total[col].isin(['FIN', 'INT', 'CON', 'REQ', 'RST']), col] = 'others'
catplot(total, col)
# Alternative view that leaves out four of the kept states:
# catplot(total[~total[col].isin(['INT', 'FIN', 'REQ', 'CON'])], col)

## Service
http, ftp, smtp, ssh, dns, ftp-data ,irc  and (-) if not much used service. More than half of the service data are of - category. 

In [None]:
col = 'service'
create_count_df(col, train)

In [None]:
catplot(total[~total[col].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3'])], col)

In [None]:
total.loc[~total[col].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']), col] = 'others'

## proto
Transaction protocol. Normal connections of train data have only 5 protocols, where anomaly connections have 129. So we'll convert all other protocols into same value.

In [None]:
col = 'proto'
create_count_df(col, normal)

In [None]:
create_count_df(col, anomaly)[:10]

In [None]:
# Relies on `col` set in an earlier cell.
# 'icmp' and 'rtp' are protocol values (not columns) said to occur in the test
# data but not in the train data, so they are merged with 'igmp' into one group.
total.loc[total[col].isin(['igmp', 'icmp', 'rtp']), col] = 'igmp_icmp_rtp'
# Every protocol outside the five kept groups is relabelled 'others'.
total.loc[~total[col].isin(['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp']), col] = 'others'

## is_sm_ips_ports
If source and destination IP addresses equal and port numbers (sport/dport)  equal then, this variable takes value 1 else 0. Seems if it is 1, then the connection is always normal. This feature is highly correlated with sinpkt (0.94131890073567).

In [None]:
catplot(total, 'is_sm_ips_ports')

## is_ftp_login
If the ftp session is accessed by user and password then 1 else 0. In most of the cases session has no user and password. However there are values 2 and 4 which should not be there.

This feature is totally correlated with ct_ftp_cmd, which counts the number of ftp commands. So dropping this column should be ok.

In [None]:
col = 'is_ftp_login'
print(corr('ct_ftp_cmd', col), corr('is_ftp_login', 'label'))
catplot(total, col)
total.drop([col], axis=1, inplace=True)

# Integer Features
## ct_state_ttl
No. for each state according to specific range of values for source/destination time to live (sttl/dttl).

In [None]:
col = 'ct_state_ttl'
catplot(total, col)

## ct_ftp_cmd
No of flows that has a command in ftp session. It has a very low correlation with target. Also is_ftp_login is highly correlated with it (0.9988554882922012).

In [None]:
catplot(total, 'ct_ftp_cmd')
corr('ct_ftp_cmd', 'label')

## ct_flw_http_mthd
No. of flows that has methods such as Get and Post in http service. Seems 0 has more anomaly values, however the correlation is very small with target.

In [None]:
col = 'ct_flw_http_mthd'
catplot(total, col)
corr(col) # -0.012237160723

In [None]:
create_count_df(col, total)

## sbytes & dbytes
* sbytes: Source to destination transaction bytes 
* dbytes: Destination to source transaction bytes

These 2 features are highly correlated with the number of packets sent (spkts & dpkts). Actually, spkts * smean = sbytes. They are also closely related to sloss and dloss. So we can drop these 2 here.

In [None]:
# Multiply in float64: the packet-count and mean-size columns may have been
# downcast to small integer types by reduce_mem_usage, in which case their
# product would wrap around silently and distort the correlation.
src_bytes_estimate = total['spkts'].astype('float64') * total['smean'].astype('float64')
dst_bytes_estimate = total['dpkts'].astype('float64') * total['dmean'].astype('float64')
print(find_corr(src_bytes_estimate, total['sbytes'])) # 0.999999
print(find_corr(dst_bytes_estimate, total['dbytes'])) # 0.99999
print(corr('sbytes', 'sloss'), corr('dbytes', 'dloss')) # 0.995771577240429, 0.9967111338305503
total.drop(['sbytes', 'dbytes'], axis=1, inplace=True)

## smean & dmean 
Mean of the packet size transmitted. However is it just sbytes/spkts ? The correlation says it is. So we already have this 
info from those other features.

In [None]:
dual_plot('smean')

In [None]:
dual_plot('dmean')

In [None]:
# Add a log1p-transformed copy of each mean-packet-size column.
for raw_col in ('smean', 'dmean'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)

# Correlation with the label: raw columns first, then the transformed ones.
# -0.02837244879012871 -0.2951728296856902 -0.05807468815031313 -0.5111549621216057
print(*(corr(name) for name in ('smean', 'dmean', 'smean_log1p', 'dmean_log1p')))
# So we have better correlation with label after applying log1p.
total.drop(columns=['smean', 'dmean'], inplace=True)

## spkts and dpkts
* spkts : Source to destination packet count 
* dpkts: Destination to source packet count

In [None]:
col = 'spkts'
dual_plot(col)

In [None]:
dual_plot(col, method=np.log1p)

In [None]:
# Add a log1p-transformed copy of each packet-count column.
for raw_col in ('spkts', 'dpkts'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)

# Correlation with the label: raw columns first, then the transformed ones.
# -0.043040466783819634 -0.09739388286233619 -0.3468819761209388 -0.45005074723539357
print(*(corr(name) for name in ('spkts', 'dpkts', 'spkts_log1p', 'dpkts_log1p')))
# So we have better correlation with label after applying log1p.
total.drop(columns=['spkts', 'dpkts'], inplace=True)

## sttl & dttl
* sttl: Source to destination time to live value 
* dttl: Destination to source time to live value

For sttl most of the anomalies have live values around 65 and 250. Its correlation with the target value is high too.
However, for dttl both types have nearly same distribution. So the correlation with target is very low.

In [None]:
col = 'sttl'
dual_plot(col) # 0.62408238, after applying log1p 0.61556952425

In [None]:
col = 'dttl'
dual_plot(col) # corr -0.09859087338578788

## sloss & dloss
* sloss: Source packets retransmitted or dropped 
* dloss: Destination packets retransmitted or dropped

sloss is highly correlated with spkts and sbytes (more than .91). Similarly, dloss is highly correlated with dpkts and dbytes.
However, although the number of packets sent is related to the loss of packets, the relation isn't quite as linear as the one between packet number and size. So we keep both for now.

Values are mostly between 0 to 3. Yet some values are more than several thousands.

In [None]:
dual_plot('sloss')

In [None]:
# So log1p makes it easier to differentiate
dual_plot('sloss', method=np.log1p)

In [None]:
# Add a log1p-transformed copy of each packet-loss column.
for raw_col in ('sloss', 'dloss'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)
# Correlation with the label: raw columns first, then the transformed ones.
# 0.001828274080103508 -0.07596097807462938 -0.3454351103223904 -0.3701913238787703
print(*(corr(name) for name in ('sloss', 'dloss', 'sloss_log1p', 'dloss_log1p')))
total.drop(columns=['sloss', 'dloss'], inplace=True)

## swin & dwin
TCP window advertisement value. Except 0 and 255 other values(1-254) occur mostly once only. So we can separate them into 3 groups. And we also see after binning their correlation with target remains same.

In [None]:
total['swin'].value_counts().loc[lambda x: x>1]

In [None]:
total['dwin'].value_counts().loc[lambda x: x>1]

In [None]:
print(corr('swin'), corr('dwin'))

In [None]:
dual_plot('swin')

In [None]:
selected = ['swin', 'dwin']
# Three equal-width bins per column, encoded as ordinal bin numbers.
kbins = preprocessing.KBinsDiscretizer(n_bins=[3, 3], encode='ordinal', strategy='uniform')
# Reuse total's own index for the binned frame: the assignment aligns on index
# labels, so a fresh 0..n-1 index would only line up by coincidence.
total[selected] = pd.DataFrame(kbins.fit_transform(total[selected]), columns=selected, index=total.index)
print(corr('swin'), corr('dwin'))

## stcpb & dtcpb
TCP base sequence number. It has a really big range, 0 to 5e9. However, anomaly connections are mostly around 0. 

In [None]:
col = 'stcpb'
dual_plot(col)

In [None]:
dual_plot(col, method=np.log1p)

In [None]:
# Add a log1p-transformed copy of each TCP base-sequence-number column.
for raw_col in ('stcpb', 'dtcpb'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)
# Correlation with the label: raw columns first, then the transformed ones.
# -0.2665849100492664 -0.2635428109654134 -0.33898970769021913 -0.33835676091281974
print(*(corr(name) for name in ('stcpb', 'dtcpb', 'stcpb_log1p', 'dtcpb_log1p')))
total.drop(columns=['stcpb', 'dtcpb'], inplace=True)

### tcprtt & synack & ackdat
* tcprtt is the TCP connection setup round-trip time, the sum of ’synack’ and ’ackdat’.
* synack: TCP connection setup time, the time between the SYN and the SYN_ACK packets.
* ackdat : TCP connection setup time, the time between the SYN_ACK and the ACK packets.

As tcprtt, is just the sum of other two features, it doesn't add any extra info to our models. So we can just drop it for now.
Applying preprocessing on synack and ackdat didn't improve much. From graph we can see, anomaly connections generally have values around 0.

In [None]:
total.drop(['tcprtt'], axis=1, inplace=True)

In [None]:
dual_plot('synack')

In [None]:
dual_plot('ackdat')

## trans_depth
Represents the pipelined depth into the connection of the http request/response transaction. Occurrences of depths from 5 up to 172 are few.

In [None]:
col = 'trans_depth'
print(corr(col)) # -0.0022256544
create_count_df(col, total)

## response_body_len
Actual uncompressed content size of the data transferred from the server’s http service. 
The values range between 0 to 5.24M.

In [None]:
col = 'response_body_len'
dual_plot(col)

In [None]:
# Add a log1p-transformed copy of the response body length.
raw_col = "response_body_len"
total["response_body_len_log1p"] = total[raw_col].apply(np.log1p)

# slight improve
# -0.018930127454048158 -0.03261972203078345
print(*(corr(name) for name in ('response_body_len', 'response_body_len_log1p')))
total.drop(columns=['response_body_len'], inplace=True)

## ct_srv_src
No. of connections that contain the same service and source address in 100 connections according to the last time. Most of the normal connections are within 10. It is highly correlated to ct_srv_dst.

In [None]:
col = 'ct_srv_src'
print(total[col].value_counts())

In [None]:
print(corr(col)) # 0.24659616767
dual_plot(col)

## ct_srv_dst
No. of connections that contain the same service and destination address in 100 connections according to the last time. It is highly correlated with ct_srv_src too. It has a slightly better correlation with label than ct_srv_src, so the other one can be dropped to check for a possible improvement.

In [None]:
col = 'ct_srv_dst'
print(total[col].value_counts())
# graph is same as ct_srv_src
dual_plot(col)

In [None]:
# 0.2478122357. they are very correlated 0.97946681, need to check whether dropping one benefits
print(corr('ct_srv_dst'), corr('ct_srv_src', 'ct_srv_dst'))

## ct_src_ltm & ct_dst_ltm
No. of connections of the same source/destination address in 100 connections according to the last recorded time.
Values lie between 0 and 51, with very few values after 48. The two features are strongly correlated, but not to the point of dropping one.

In [None]:
col = 'ct_src_ltm'
print(corr(col))
create_count_df(col, total)

In [None]:
print(corr('ct_dst_ltm'))
create_count_df('ct_dst_ltm', total)

In [None]:
corr('ct_src_ltm', 'ct_dst_ltm')

## ct_src_dport_ltm & ct_dst_sport_ltm
* ct_src_dport_ltm : No of connections of the same source address and the destination port in 100 connections according to the last time.
* ct_dst_sport_ltm: No of connections of the same destination address and the source port in 100 connections according to the last time.

In [None]:
for col in ['ct_src_dport_ltm', 'ct_dst_sport_ltm']:
    print(corr(col))
    print(create_count_df(col, total))

In [None]:
corr('ct_src_dport_ltm', 'ct_dst_sport_ltm')

# Decimal Features
## dur 
recorded total duration. Normal connections are mostly within 5. However, this feature has a poor correlation with label.


In [None]:
col = 'dur'
print(corr(col)) # 0.0290961170, correlation gets worse after log1p
dual_plot(col)

## rate
This feature isn't mentioned in the feature list. It has values up to 1M. Anomaly connections are mostly around 0.

In [None]:
col = 'rate'
print(corr(col))
dual_plot(col) # cor 0.3358, after applying log1p it becomes 0.31581108

## sinpkt & dinpkt
* sinpkt: Source interpacket arrival time (mSec)
* dinpkt: Destination interpacket arrival time (mSec)

sinpkt is highly correlated with is_sm_ips_ports (0.9421206). Will dropping one of them benefit ?

In [None]:
col = 'sinpkt'
corr(col, 'is_sm_ips_ports')

In [None]:
print(corr(col)) # corr -0.1554536980863
dual_plot(col) 

In [None]:
dual_plot(col, method=np.log1p)

In [None]:
dual_plot('dinpkt')

In [None]:
# Add a log1p-transformed copy of each inter-packet arrival time column.
for raw_col in ('sinpkt', 'dinpkt'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)

# slight improve in correlation
# -0.1554536980867726 -0.030136042428744566 -0.16119699304378052 -0.07408113676641241
print(*(corr(name) for name in ('sinpkt', 'dinpkt', 'sinpkt_log1p', 'dinpkt_log1p')))
total.drop(columns=['sinpkt', 'dinpkt'], inplace=True)

## sload & dload
* sload: Source bits per second
* dload: Destination bits per second

The values are really big and in bits.

In [None]:
dual_plot('sload')

In [None]:
dual_plot('dload')

In [None]:
# Add a log1p-transformed copy of each load (bits per second) column.
for raw_col in ('sload', 'dload'):
    total[f'{raw_col}_log1p'] = total[raw_col].apply(np.log1p)
# Correlation with the label: raw columns first, then the transformed ones.
# 0.16524867685764016 -0.35216880416636837 0.3397788822586144 -0.5919440288535992
print(*(corr(name) for name in ('sload', 'dload', 'sload_log1p', 'dload_log1p')))
total.drop(columns=['sload', 'dload'], inplace=True)

## sjit & djit
Source and Destination jitter in mSec. Preprocessing didn't improve anything.

In [None]:
dual_plot('sjit')

In [None]:
dual_plot('djit')

# Output

In [None]:
# Persist the cleaned feature table.
features.to_csv('features.csv', index=False)

# Split the combined frame back into its two parts using the 'type' tag,
# dropping the tag column from both results.
is_train_row = total['type'] == 'train'
train = total.loc[is_train_row].drop(columns=['type'])
test = total.loc[~is_train_row].drop(columns=['type'])

# Write both splits without the row index.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)