In [151]:
import numpy as np
import pandas as pd

## 1. Introduce the Data

### Import the dataset

In [152]:
df = pd.read_csv('20061101.txt', sep = "\t", header = None)
column_list = [
    'duration',
    'service',
    'source_bytes',
    'destination_bytes',
    'count',
    'same_srv_rate',
    'serror_rate',
    'srv_serror_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_src_port_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'flag',
    'ids_detection',
    'malware_detection',
    'ashula_detection',
    'label',
    'source_ip_address',
    'source_port_number',
    'destination_ip_address',
    'destination_port_number',
    'start_time',
    'protocol'
]
df.columns = column_list

In [153]:
df.head()

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,...,ids_detection,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol
0,27.561208,smtp,3179,175,0,0.0,0.0,0.0,0,0,...,0,0,0,1,fda2:69aa:1f1a:84b0:130d:2736:3fa0:42da,2161,fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e,25,00:00:09,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:0104:3fff:571a:ff2c:00a5,138,fda2:69aa:1f1a:2108:3f84:570e:ffe4:007b,138,00:00:14,udp
2,86366.249616,other,244776,0,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:540c:7d80:2750:07a6:28a5,32770,fda2:69aa:1f1a:0d61:1001:01e2:02fb:2a22,8649,00:00:15,udp
3,2994.374758,other,15744,18154,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:3aef:7af3:3027:3045:7ff2,1400,fda2:69aa:1f1a:e714:277f:10e1:03f2:425a,80,00:00:16,tcp
4,4.749378,smtp,7895,244,0,0.0,0.0,0.0,0,0,...,0,0,0,1,fda2:69aa:1f1a:381e:25aa:0bff:12e8:0365,1806,fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e,25,00:00:17,tcp


In [154]:
df.shape

(9649, 24)

### Remove the target variable from the dataset
the target variable will be a combination of `label`, `ids_detection`, `malware_detection`, `ashula_detection`

- label = 0 if **no** Intrusion 1 Otherwise **<-- Important**
- ids_detection = 0 if alert **not** triggered 1 otherwise
- malware_detection = 0 if alert **not** triggered 1 otherwise
- ashula_detection = 0 if alert **not** triggered 1 otherwise

In [155]:
df['label'] = [0 if x == 1 else 1 for x in df['label']]
df['ids_detection'] = [0 if x == 0 else 1 for x in df['label']]
df['malware_detection'] = [0 if x == 0 else 1 for x in df['label']]
df['ashula_detection'] = [0 if x == 0 else 1 for x in df['label']]

In [156]:
label_target = df.pop('label').values
ids_detection_target = df.pop('ids_detection').values
malware_detection_target = df.pop('malware_detection').values
ashula_detection_target = df.pop('ashula_detection').values

df.shape

(9649, 20)

In [157]:
#applying OR to get the target variable y
y = label_target | ids_detection_target | malware_detection_target | ashula_detection_target

### Checking String Based Features

In [158]:
#count the unique values in service feature(string based)
service_value_counts = df['service'].value_counts()
print("Number of unique values = ", service_value_counts.shape[0], "\n")
print(service_value_counts)

Number of unique values =  9 

other       6046
http        2186
smtp         677
dns          327
ssh          280
ssl          122
ftp            5
smtp,ssl       3
ftp-data       3
Name: service, dtype: int64


In [159]:
#count the unique values in protocol feature(string based)
protocol_value_counts = df['protocol'].value_counts()
print("Number of unique values = ", protocol_value_counts.shape[0], "\n")
print(protocol_value_counts)

Number of unique values =  3 

tcp     6934
udp     1655
icmp    1060
Name: protocol, dtype: int64


In [160]:
#count the unique values in flag feature(string based)
protocol_value_counts = df['flag'].value_counts()
print("Number of unique values = ", protocol_value_counts.shape[0], "\n")
print(protocol_value_counts)

Number of unique values =  13 

OTH       2914
SF        2747
S0        1620
RSTO      1324
REJ        444
RSTR       201
SH         145
S1          78
RSTOS0      64
SHR         55
RSTRH       38
S3          18
S2           1
Name: flag, dtype: int64


### Unique values for each feature in the dataset

In [161]:
df.T.apply(lambda x: x.nunique(), axis=1)

duration                       6429
service                           9
source_bytes                   1604
destination_bytes              1638
count                            28
same_srv_rate                    29
serror_rate                      10
srv_serror_rate                  76
dst_host_count                  101
dst_host_srv_count              101
dst_host_same_src_port_rate      37
dst_host_serror_rate              4
dst_host_srv_serror_rate          4
flag                             13
source_ip_address               887
source_port_number             3196
destination_ip_address          486
destination_port_number         137
start_time                     6311
protocol                          3
dtype: int64

### Removing unnecessary features
Everything from source_ip_address to start_time is of no use because these things are really random...

In [162]:
df.pop('source_ip_address').values
df.pop('source_port_number').values
df.pop('destination_ip_address').values
df.pop('destination_port_number').values
df.pop('start_time').values
df.head()

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,protocol
0,27.561208,smtp,3179,175,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,udp
2,86366.249616,other,244776,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,udp
3,2994.374758,other,15744,18154,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTOS0,tcp
4,4.749378,smtp,7895,244,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,tcp


### Features to use

In [163]:
list(df)

['duration',
 'service',
 'source_bytes',
 'destination_bytes',
 'count',
 'same_srv_rate',
 'serror_rate',
 'srv_serror_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'flag',
 'protocol']

### Transfrom Catergorical Data to Numerical Data

In [164]:
from sklearn import preprocessing

In [165]:
#todo: make this run in a loop
categorical_data = ['service', 'flag', 'protocol']
unique_flag_data = df['flag'].unique()
unique_service_data = df['service'].unique()
unique_protocol_data = df['protocol'].unique()

#### Encoder for feature : Flag

In [166]:
le_flag = preprocessing.LabelEncoder()
#Fit the label encoder to unique values
le_flag.fit(unique_flag_data)

#Fit the label data to some example data
example_flag_data = list(df.head()['flag'])
#Fit the label encoder and return encoded labels
encoded_flag_data = le_flag.transform(example_flag_data)

#Transform labels back to original encoding
decoded_flag_data = list(le_flag.inverse_transform(encoded_flag_data))

print(example_flag_data)
print(encoded_flag_data)
print(decoded_flag_data)
#Ignore any warnings

['SF', 'S0', 'S0', 'RSTOS0', 'SF']
[10  6  6  3 10]
['SF', 'S0', 'S0', 'RSTOS0', 'SF']


  if diff:


#### Encoder for feature : service and protocol

In [167]:
le_service = preprocessing.LabelEncoder()
le_service.fit(unique_service_data)

le_protocol = preprocessing.LabelEncoder()
le_protocol.fit(unique_protocol_data)

LabelEncoder()

In [168]:
df['flag'] = le_flag.transform(df['flag'])
df['service'] = le_service.transform(df['service'])
df['protocol'] = le_protocol.transform(df['protocol'])
df.head()

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,protocol
0,27.561208,5,3179,175,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,10,1
1,0.0,4,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,6,2
2,86366.249616,4,244776,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,6,2
3,2994.374758,4,15744,18154,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,3,1
4,4.749378,5,7895,244,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,10,1


### PCA

In [169]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
X_pca = pd.DataFrame(pca.fit_transform(df))

In [170]:
X_pca.head()

Unnamed: 0,0,1
0,-6008.096788,1896.655989
1,-6268.392815,-1276.607341
2,323.075561,243925.283539
3,12302.448013,13991.979072
4,-5812.507031,6608.881123


In [171]:
def xnor(ar1, ar2):
    ar3 = np.array([])
    for i in range(0,ar1.shape[0]):
        if(ar1[i] == ar2[i]):
            res = 1
        else: res = 0
        ar3 = np.append(ar3, res)
    return ar3.astype(int)

In [172]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2, random_state = 0).fit(X_pca)
print('correct : ',np.count_nonzero(xnor(kmeans.labels_, y)))
print(kmeans.cluster_centers_)

correct :  1063
[[-1.91734729e+03  3.47752292e+01]
 [ 1.08634642e+06 -1.97032358e+04]]


In [173]:
km = KMeans(n_clusters = 2, random_state = 0).fit(df)
print('correct : ',np.count_nonzero(xnor(kmeans.labels_, y)))
print(km.cluster_centers_)

correct :  1063
[[ 3.97929797e+01  3.84509967e+00  1.42752253e+03  4.31426537e+03
   1.00197259e+00  2.25241902e-01  2.04516196e-02  6.75166113e-02
   1.95101744e+01  2.52756437e+01  2.05276163e-01  3.62572674e-02
   6.10818106e-02  4.60350914e+00  1.06177326e+00]
 [ 1.64987982e+02  3.29411765e+00  1.09157647e+04  1.09271565e+06
   8.23529412e-01  4.70588235e-01  3.46944695e-18  1.38777878e-17
   1.70588235e+01  1.70588235e+01  0.00000000e+00 -6.93889390e-18
  -1.38777878e-17  5.76470588e+00  1.00000000e+00]]


### Using regression

In [174]:
# Use train_test_split in sklearn.cross_validation to split data into train and test sets
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.70, random_state=1)

In [175]:
# Function to build model and find model performance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def find_model_perf(X_train, y_train, X_test, y_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_hat = [x[1] for x in model.predict_proba(X_test)]
    auc = roc_auc_score(y_test, y_hat)
    
    return auc

In [176]:
# Find performance of model using preprocessed data
auc_processed = find_model_perf(X_train, y_train, X_test, y_test)
print(auc_processed)

0.9328300970873785
