# IDS - Self-Organizing maps

In [1]:
# This is the visualization and implementation part of the IDS, using SOM 

In [2]:
import numpy as np
from minisom import MiniSom
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', None)

  from ._conv import register_converters as _register_converters


## Minisom tutorials

In [3]:
data = [[ 0.80,  0.55,  0.22,  0.03],
        [ 0.82,  0.50,  0.23,  0.03],
        [ 0.80,  0.54,  0.22,  0.03],
        [ 0.80,  0.53,  0.26,  0.03],
        [ 0.79,  0.56,  0.22,  0.03],
        [ 0.75,  0.60,  0.25,  0.03],
        [ 0.77,  0.59,  0.22,  0.03]]     

In [4]:
som = MiniSom(6, 6, 4, sigma=0.3, learning_rate=0.5) # initialization of 6x6 SOM
print("Training...")
som.train_random(data, 100) # trains the SOM with 100 iterations
print("...ready!")

Training...
...ready!


In [5]:
som.activation_response(np.array(data))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 7., 0., 0.]])

# Analysis of the dataset

In [6]:
df_columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

In [7]:
df = pd.read_csv("./data/kddcup.data_10_percent_corrected.csv")
df.dropna(inplace=True,axis=1)
df.columns = df_columns

In [8]:
len(df.columns)

42

In [9]:
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0
mean,47.9794,3025.616,868.5232,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148245,0.010212,0.000111,3.6e-05,0.011352,0.001083,0.000109,0.001008,0.0,0.0,0.001387,332.286347,292.907133,0.176687,0.176609,0.057434,0.057719,0.791547,0.020982,0.028997,232.47123,188.666034,0.753779,0.030906,0.601936,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.747185,988219.1,33040.03,0.006673,0.134805,0.00551,0.782103,0.01552,0.355343,1.798328,0.010551,0.007793,2.01272,0.096416,0.01102,0.036482,0.0,0.0,0.037211,213.147129,246.322733,0.380717,0.381017,0.231624,0.232147,0.38819,0.082206,0.142398,64.744665,106.040236,0.410781,0.109259,0.481309,0.042133,0.380593,0.38092,0.23059,0.230141
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,510.0,510.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,511.0,511.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,1.0,2.0,993.0,28.0,2.0,8.0,0.0,0.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [10]:
numeric_df = df.select_dtypes(include=['float64', 'int64'])

In [11]:
numeric_df.dtypes

duration                         int64
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate                float64
rerror_rate                    float64
srv_rerror_rate                float64
same_srv_rate            

In [12]:
def df_to_tf_matrix(df):
    result = []
    for x in df.columns:
        result.append(x)
    return df.as_matrix(result).astype(np.float64)

In [13]:
train_data = df_to_tf_matrix(numeric_df)

In [14]:
som = MiniSom(20, 20, len(numeric_df.columns), sigma=0.3, learning_rate=0.5) # initialization of 6x6 SOM
print("Training...")
som.train_random(train_data, 1000) # trains the SOM with 100 iterations
print("...ready!")

Training...
...ready!


In [15]:
target = np.genfromtxt('./data/kddcup.data_10_percent_corrected.csv', delimiter=',', usecols=(41), dtype=str)

In [16]:
target_df = pd.DataFrame(target)
target_df.columns = ['label']
target_df.apply(pd.value_counts)

Unnamed: 0,label
smurf.,280790
neptune.,107201
normal.,97278
back.,2203
satan.,1589
ipsweep.,1247
portsweep.,1040
warezclient.,1020
teardrop.,979
pod.,264


In [17]:
target = df['outcome'].as_matrix()

In [18]:
np.shape(target)

(494020,)

In [19]:
target = np.reshape(target, (len(target), 1))

In [20]:
target[3]

array(['normal.'], dtype=object)

In [21]:
train_data_and_label = np.hstack((train_data, target))

In [22]:
list_train_data = train_data_and_label.tolist()

In [23]:
list_train_data[0][-1]

'normal.'

In [24]:
def map_winner_neurons(som, data, target_index=-1):
    vals = []
    mappings_list = np.zeros(np.shape(som.get_weights())[:2]).tolist()
    for i in range(len(mappings_list)):
        for j in range(len(mappings_list[i])):
            mappings_list[i][j] = {}
    for val in data:
        winner = som.winner(val[:target_index])
        target_dict = mappings_list[winner[0]][winner[1]]
        target = val[target_index]
        if target not in target_dict:
            target_dict[target] = 1
        else:
            target_dict[target] += 1
    return mappings_list

%time winners_per_node = map_winner_neurons(som, list_train_data)

CPU times: user 6min 14s, sys: 122 ms, total: 6min 14s
Wall time: 6min 15s


In [25]:
winners_per_node

[[{'normal.': 9, 'smurf.': 9},
  {},
  {'smurf.': 193093},
  {'smurf.': 5},
  {'smurf.': 46},
  {},
  {'smurf.': 1},
  {'smurf.': 26185},
  {},
  {'smurf.': 548},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {}],
 [{'ipsweep.': 3, 'normal.': 11, 'smurf.': 12294},
  {'normal.': 1, 'smurf.': 5},
  {},
  {},
  {},
  {},
  {'normal.': 29, 'teardrop.': 29},
  {'smurf.': 71},
  {'smurf.': 2},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {}],
 [{'ipsweep.': 2, 'normal.': 3, 'smurf.': 6407},
  {'land.': 1, 'neptune.': 7, 'normal.': 131, 'teardrop.': 16},
  {'neptune.': 5012},
  {'neptune.': 6558},
  {'neptune.': 6437, 'portsweep.': 5, 'satan.': 6},
  {'neptune.': 1, 'satan.': 5, 'teardrop.': 212},
  {'ipsweep.': 77,
   'land.': 1,
   'neptune.': 1406,
   'nmap.': 102,
   'normal.': 230,
   'portsweep.': 862,
   'rootkit.': 1,
   'satan.': 122,
   'teardrop.': 51},
  {'nmap.': 2,
   'normal.': 1763,
   'satan.': 1,
   'teardrop.': 8,
   'warezclient.': 3},
  {'normal

In [27]:
def get_keys_sum(d, keys):
    s = 0
    for k in keys:
        s += d[k]
    return s

def get_malicious_or_not(d):
    mal_keys = list(filter(lambda k: k != 'normal.', d.keys()))
    mal_sum = get_keys_sum(d, mal_keys)
    normal_sum = 0
    if 'normal.' in d.keys():
        normal_sum = d['normal.']
    if mal_sum == 0 or normal_sum / mal_sum > 0.5:
        return 'normal.'
    else:
        return 'malicious.'
    

def decide_if_neurons_malicious(winner_cols):
    res = []
    for i in range(len(winner_cols)):
        res.append([])
        for j in range(len(winner_cols[i])):
            d = winner_cols[i][j]
            if len(d) == 0:
                res[i].append('normal.')
            else:
                key = max(d, key=d.get)
                res[i].append(get_malicious_or_not(d))
    return res
                
            
            
malicious_neurons = decide_if_neurons_malicious(winners_per_node)

In [30]:
malicious_neurons

[['normal.',
  'normal.',
  'malicious.',
  'malicious.',
  'malicious.',
  'normal.',
  'malicious.',
  'malicious.',
  'normal.',
  'malicious.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.'],
 ['malicious.',
  'malicious.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'malicious.',
  'malicious.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.'],
 ['malicious.',
  'normal.',
  'malicious.',
  'malicious.',
  'malicious.',
  'malicious.',
  'malicious.',
  'normal.',
  'normal.',
  'malicious.',
  'malicious.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.',
  'normal.'],
 ['malicious.',
  'normal.',
  'malicious.',
  'malicious.',
  'malicious.',
  'malicious.',
  'malicious.',
  'normal.',
  'malicious.',
  'normal.',
  'malicious

In [31]:
lim = len(train_data_and_label)
hits = 0
tp = 0
tn = 0
fp = 0
fn = 0
for td in train_data_and_label:
    label = 'normal.' if td[-1] == 'normal.' else 'malicious.'
    winner = som.winner(td[:-1])
    pred = malicious_neurons[winner[0]][winner[1]]
    if pred == label:
        if pred == 'normal.':
            tn += 1
        else:
            tp += 1
        hits += 1
    else:
        if pred == 'normal.':
            fn += 1
        else:
            fp += 1
        
    

print(hits)

print("Acc: {}".format(hits / float(lim)))

489693
Acc: 0.9912412452937128


In [32]:
tp

393189

In [36]:
fp / lim

0.0015647139791911259

In [35]:
fn / lim

0.007194040727096069