In [1]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os.path

In [2]:
features_path = "./NUSW-NB15_features.csv"
print("Features file found" if os.path.isfile("./NUSW-NB15_features.csv") else "")

Features file found


In [3]:
#quick peek at the features and cleaning up column names for easier indexing
features_df = pd.read_csv(features_path, encoding="latin-1")
for i in range(len(features_df.columns.values)):
    features_df.columns.values[i] = str(features_df.columns.values[i]).strip().lower()
    
    
print(features_df.columns) #cleaned up column names

#lower case all the types
for i in range(len(features_df)):
    features_df.loc[i, ['type']] = str(features_df['type'][i]).strip().lower()
    features_df.loc[i, ['name']] = str(features_df['name'][i]).strip().lower()

print(features_df[['name', 'type']])

Index(['no.', 'name', 'type', 'description'], dtype='object')
                name       type
0              srcip    nominal
1              sport    integer
2              dstip    nominal
3             dsport    integer
4              proto    nominal
5              state    nominal
6                dur      float
7             sbytes    integer
8             dbytes    integer
9               sttl    integer
10              dttl    integer
11             sloss    integer
12             dloss    integer
13           service    nominal
14             sload      float
15             dload      float
16             spkts    integer
17             dpkts    integer
18              swin    integer
19              dwin    integer
20             stcpb    integer
21             dtcpb    integer
22           smeansz    integer
23           dmeansz    integer
24       trans_depth    integer
25       res_bdy_len    integer
26              sjit      float
27              djit      float
28        

In [4]:
#quick peek at the data
training_set_path = "./a part of training and testing set/UNSW_NB15_training-set.csv"
training_df = pd.read_csv(training_set_path, encoding="latin-1")
print(training_df[:1])
#Of COURSE this file is organized differently than the features file describes.
#Why would I expect differently?

   id       dur proto service state  spkts  dpkts  sbytes  dbytes        rate  \
0   1  0.000011   udp       -   INT      2      0     496       0  90909.0902   

   ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  ct_ftp_cmd  \
0  ...                 1               2             0           0   

   ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_ports  attack_cat  \
0                 0           1           2                0      Normal   

   label  
0      0  

[1 rows x 45 columns]


In [5]:
#so we'll use a file from the "full" dataset instead
nb15_1_path = "./UNSW-NB15_1.csv"
#dtypes = {"sport": "int32", "dsport": "int32"}
#packet_data_df = pd.read_csv(nb15_1_path, encoding="latin-1", names=features_df['name'], header=None, dtype=dtypes)
packet_data_df = pd.read_csv(nb15_1_path, encoding="latin-1", names=features_df['name'], header=None)
print(packet_data_df.columns)
#TODO FIXME fix the data types on the other columns (see warning)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object')


In [6]:
#for each feature of type "nominal" or "integer" count how many classes exist
#print(packet_data_df['label'].unique()) #identify the different values

for label, feature_type in features_df[['name', 'type']].values:
    nunique = packet_data_df[packet_data_df['label'] == 1][label].nunique()
    if nunique < 10:
        value_list = packet_data_df[packet_data_df['label'] == 1][label].unique().tolist()
        print(label + ": " , end='')
        print(value_list, end='')
        print(" type: " + str(feature_type))
    else:
        print(label + ": " + str(nunique) + " type: " + str(feature_type))
        
        
#how many attack packets do we have compared to non-attack packets?
print("Normal packets: ", len(packet_data_df[packet_data_df['label'] == 0].index))
print("Attack packets: ", len(packet_data_df[packet_data_df['label'] == 1].index))

srcip: ['175.45.176.3', '175.45.176.2', '175.45.176.0', '175.45.176.1'] type: nominal
sport: 9983 type: integer
dstip: 10 type: nominal
dsport: 827 type: integer
proto: 129 type: nominal
state: ['INT', 'FIN', 'CON', 'REQ', 'CLO', 'ACC'] type: nominal
dur: 8748 type: float
sbytes: 2604 type: integer
dbytes: 2410 type: integer
sttl: [254, 62, 255, 0, 63] type: integer
dttl: [0, 252, 60, 253] type: integer
sloss: 186 type: integer
dloss: 200 type: integer
service: 13 type: nominal
sload: 9501 type: float
dload: 8654 type: float
spkts: 225 type: integer
dpkts: 245 type: integer
swin: [0, 255] type: integer
dwin: [0, 255] type: integer
stcpb: 8584 type: integer
dtcpb: 8552 type: integer
smeansz: 1072 type: integer
dmeansz: 986 type: integer
trans_depth: [0, 1, 2, 3, 4, 8] type: integer
res_bdy_len: 492 type: integer
sjit: 8678 type: float
djit: 8552 type: float
stime: 6490 type: timestamp
ltime: 6081 type: timestamp
sintpkt: 8719 type: float
dintpkt: 8581 type: float
tcprtt: 8247 type: floa

In [60]:
def to_integer(value):
    try:
        temp = int(value,10)
        return temp
    except ValueError:
        temp = int(value,16)
        return temp

In [127]:
packet_data_df.drop(packet_data_df.loc[packet_data_df['sport']=='-'].index, inplace=True)

In [128]:
sports1 = packet_data_df['sport']

In [133]:
sports1.transform(lambda x: to_integer(str(x)))

0          1390
1         33661
2          1464
3          3593
4         49664
          ...  
699996    12520
699997    18895
699998    30103
699999    30388
700000     6055
Name: sport, Length: 699999, dtype: int64

In [134]:
# what is maximum port value
attack_sports = packet_data_df[packet_data_df['label']==1]['sport']
#sorted_ports = attack_sports.sort_values(ascending=False)

In [135]:
attack_sports.transform(lambda x: to_integer(str(x)))

20        21223
21        23357
22        13284
39        13792
40        26939
          ...  
186498    58463
186499    58460
186562    58967
186658    58485
186787    58935
Name: sport, Length: 22215, dtype: int64

In [136]:
#here we convert the columns to the appropriate type, should just load them with the right
#data type to start
for k in range(0, packet_data_df['sport'].shape[0]):
    if (isinstance(packet_data_df.loc[k, 'sport'], str)):
        packet_data_df.loc[k, 'sport'] = int(packet_data_df.loc[k, 'sport'])
        
#did we convert all the strings?
for k in range(0, packet_data_df['sport'].shape[0]):
    if (isinstance(packet_data_df.loc[k, 'sport'], str)):
        print(packet_data_df.loc[k, 'sport'])

KeyboardInterrupt: 

In [None]:
#How can we encode these various features, many of which are discrete integers?
#One-hot or Binary encoding seems logical, using Binary coding to keep things compact.

#Returns a list where each element are a 1 or 0, determining the binary encoding of value with
#at least bits number of bits. If the value cannot be encoding with the requested number of bits,
#None will be returned.
def binary_encode(value, bits):
    encoding = []
    while value != 0:
        encoding.append(value % 2)
        value //= 2
        
    if bits < len(encoding):
        return None #couldn't represent with requested number of bits
    
    while len(encoding) < bits:
        encoding.append(0)
    
    encoding.reverse()
    return encoding

#Takes binary integer in the form of a list containing 1's and 0's. 
#Returns the base-10 (integer) representation of the binary value.
def binary_decode(value):
    if len(value) == 0:
        return None
    
    out = 0
    for i in range(0, len(value)):
        if value[i] == 1:
            out += 2**(len(value) - (i+1))
            
    return out

def float_to_binary(value):
    out = []
    for i in range(len(value)):
        if value[i] >= 0.5:
            out.append(1)
        else:
            out.append(0)
            
    return out
        
print(binary_encode(7, 4)) #returns [0,1,1,1]
print(binary_encode(255, 2)) #returns None

print(binary_decode([0,1,1,1])) #returns 7
print(binary_decode([1,1,0,1])) #returns 13
print(binary_decode(float_to_binary([0.55, 0.98, 0.34, 0.6]))) #returns 13

print("Inversion test: ")
for i in range(0,16):
    print(binary_decode(binary_encode(i, 4)))

In [20]:
#How can we encode these various features, many of which are discrete integers?
#One-hot or Binary encoding seems logical, using Binary coding to keep things compact.

#Returns a list where each element are a 1 or 0, determining the binary encoding of value with
#at least bits number of bits. If the value cannot be encoding with the requested number of bits,
#None will be returned.
def binary_encode(value, bits):
    encoding = []
    while value != 0:
        encoding.append(value % 2)
        value //= 2
        
    if bits < len(encoding):
        return None #couldn't represent with requested number of bits
    
    while len(encoding) < bits:
        encoding.append(0)
    
    encoding.reverse()
    return encoding

print(binary_encode(7,4))
print(binary_encode(255,2))

[0, 1, 1, 1]
None


In [22]:
ip_as_bits = []
for byte in packet_data_df['srcip'][0].split('.'):
    ip_as_bits += binary_encode(int(byte),8)
    
print(ip_as_bits)

ValueError: invalid literal for int() with base 10: 'ï»¿59'

In [23]:
# lets see what some of other relevant fields look like
timestamp = packet_data_df['stime'][0]
print(timestamp)
nbits=36
print(binary_encode(timestamp,nbits))
for k in packet_data_df['stime']:
    if binary_encode(k, nbits) is None:
        print("Couldnt map all the timestamp")
        break

1421927414
[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0]


In [24]:
# what features do I care about
# all the non-aggregate features that are some combination of the other features
# either directly in an example or temporal combinations since these should ostensibly
# be discovered by the GAN
features_to_use = features_df[:39]
print(features_to_use[['name','type']])

                name       type
0              srcip    nominal
1              sport    integer
2              dstip    nominal
3             dsport    integer
4              proto    nominal
5              state    nominal
6                dur      float
7             sbytes    integer
8             dbytes    integer
9               sttl    integer
10              dttl    integer
11             sloss    integer
12             dloss    integer
13           service    nominal
14             sload      float
15             dload      float
16             spkts    integer
17             dpkts    integer
18              swin    integer
19              dwin    integer
20             stcpb    integer
21             dtcpb    integer
22           smeansz    integer
23           dmeansz    integer
24       trans_depth    integer
25       res_bdy_len    integer
26              sjit      float
27              djit      float
28             stime  timestamp
29             ltime  timestamp
30      

# Feature Encoding

I will encode the integer based features using binary representation,
 using the minimum number of bits to represent the max value plus one bit
 Float based parameters will be scaled in a typical manner
IP addresses in particular are a special case, since each field is represending a collection of 4 bytes. These addresses will be represented as 32 bits, since this is the native representation and seems appropriate for this task.

Other categorical values could be binary or one-hot encoded.

In [26]:
def get_minimum_bits(value):
    min_bits = 1
    while binary_encode(value, min_bits) is None:
        min_bits += 1
        
    return min_bits

def build_input_feature_tensor(packet_data_dict):
    input_features = []
    
    srcip_segments = str(packet_data_dict['srcip']).split('.')
    srcip_bits = []
    for segment in srcip_segments:
        for k in binary_encode(int(segment), 8):
            srcip_bits.append(k)
    
    dstip_segments = str(packet_data_dict['dstip']).split('.')
    dstip_bits = []
    for segment in dstip_segments:
        for k in binary_encode(int(segment), 8):
            dstip_bits.append(k)
            
    sport = binary_encode(int(packet_data_dict['sport']), 16)
    # get_minimum_bits(int(packet_data_dict['sport']))+1
    dport = binary_encode(int(packet_data_dict['dsport']), 16)
    # get_minimum_bits(int(packet_data_dict['dport']))+1
    
    #TODO need to encode the rest of the features buuuuuttttt that can come later.
    
    input_features += srcip_bits + dstip_bits + sport + dport
    return torch.tensor(input_features, dtype=torch.float32)
    #return torch.tensor(input_features, dtype=torch.float64)

In [27]:
X = build_input_feature_tensor(packet_data_df.loc[0,:].to_dict())
print(X)
print(X.shape)
X_seq = torch.tensor(()).new_zeros([5,1,X.shape[0]])
print(X_seq.shape)

X_seq[:,0,:] = X
print(X_seq)

ValueError: invalid literal for int() with base 10: 'ï»¿59'

In [None]:
# I shouldnt actually do the training in this notebook, this is mostly a test to see if I ve prepared  
# the features correctly for input to some RNN network 
# MODELS: Define Generator model and Discriminator model
# one-layer RNN that is the same width as the input feat tensor
        
#Revert a feature tensor to human readable form
#This working correctly is heavily dependent on sizes and locations chosen in 
#build_input_feature_tensor()
def decode_feature_tensor(feature_tensor):
    output_values = {}
    
    srcip_segments = []
    for i in [0,1,2,3]:
        srcip_segments.append(binary_decode(float_to_binary(feature_tensor[i*8:(i*8)+8])))
        
    srcip_string = ".".join([str(k) for k in srcip_segments])
    
    dstip_segments = []
    for i in [4,5,6,7]:
        dstip_segments.append(binary_decode(float_to_binary(feature_tensor[i*8:(i*8)+8])))
        
    dstip_string = ".".join([str(k) for k in dstip_segments])
    
    sport = binary_decode(float_to_binary(feature_tensor[64:64+16]))
    dport = binary_decode(float_to_binary(feature_tensor[64+16:64+16+16]))
    