# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import \
precision_score, recall_score, \
f1_score, roc_auc_score, \
confusion_matrix

import math
import os
import sys
import functools
import time
from collections import Counter

# Utils

In [2]:
T = time.time
START = -1
def t(*args):
    """
    Tiny stopwatch built on the module-level START timestamp.

    Called with no arguments it records the current time in START.
    Called with at least one argument it prints the first argument
    followed by the seconds elapsed since START was last recorded;
    any further arguments are ignored.

    Usage:
    t()
    long_func(x, y, z)
    t('Finished long_func')
    """
    global START
    if args:
        # Report mode: the label is evaluated first, then the elapsed time.
        print(f'{args[0]} -> {T() - START} secs')
        return
    # Reset mode: remember when the stopwatch was started.
    START = T()

def show_scores(y_true, y_pred):
    """
    Print classification metrics for a pair of label vectors.

    Reports precision, recall, F1 and ROC AUC as percentages, then the
    difference between true-positive rate and false-positive rate, and
    finally the two rows of the confusion matrix.

    The raveled confusion matrix is unpacked into exactly four counts,
    so the inputs must yield a 2x2 matrix.
    """
    # All four scores are computed before anything is printed.
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f'Precision: {precision*100:.3f} %')
    print(f'Recall: {recall*100:.3f} %')
    print(f'F1: {f1*100:.3f} %')
    print(f'AUC: {auc*100:.3f} %')

    matrix = confusion_matrix(y_true, y_pred)
    # ravel() order is documented by scikit-learn's confusion_matrix page:
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    tn, fp, fn, tp = matrix.ravel()
    true_pos_rate = recall  # recall is the true-positive rate
    false_pos_rate = fp / (fp + tn)
    print(f'TPR-FPR: {(true_pos_rate-false_pos_rate)*100:.3f} %')
    print('Confusion Matrix:', '\n', matrix[0], '\n', matrix[1])

def sample_binetflow(TARG_FILE, n):
    """
    Return the first line of a text file plus every n-th line after it.

    Parameters:
    TARG_FILE -- path of the file to read
    n         -- sampling stride; lines whose zero-based index is a
                 multiple of n are kept

    Returns a list of the kept lines with newline characters stripped.
    The first line (index 0) is always kept.
    """
    # The context manager closes the file even if iteration fails part-way.
    with open(TARG_FILE) as handle:
        return [
            raw.strip('\n')
            for idx, raw in enumerate(handle)
            if idx == 0 or idx % n == 0
        ]

# Params + Read file

In [3]:
# Where the capture lives and how sparsely to sample it.
TARG_NAME = 'capture20110815.binetflow'
TARG_DIR = os.path.join(os.path.expanduser("~/Desktop"), 'repo/msci-res-repo/datasets/ctu-13')
TARG_FILE = os.path.join(TARG_DIR, TARG_NAME)
SAMPLE = 100

t()
# skiprows drops a row when the callable returns a truthy value, so only
# row 0 (the header) and every SAMPLE-th row after it are read.
df = pd.read_csv(TARG_FILE, skiprows=lambda row_no: row_no % SAMPLE != 0)
t(f'Read "{TARG_NAME}"')
df

Read "capture20110815.binetflow" -> 0.7100062370300293 secs


Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/15 11:00:06.188739,1467.877441,tcp,66.220.151.72,443,<?>,147.32.84.59,51410,FRPA_RPA,0.0,0.0,96,48844,12390,flow=Background-Established-cmpgw-CVUT
1,2011/08/15 11:00:07.009196,0.000378,udp,147.32.84.68,56131,<->,147.32.80.9,53,CON,0.0,0.0,2,278,78,flow=To-Background-UDP-CVUT-DNS-Server
2,2011/08/15 11:00:07.551286,3.162136,tcp,194.79.52.99,80,<?>,147.32.84.59,49913,FA_RA,0.0,0.0,3,180,60,flow=Background-Established-cmpgw-CVUT
3,2011/08/15 11:00:08.265749,0.000236,udp,147.32.85.34,59873,<->,147.32.80.9,53,CON,0.0,0.0,2,189,68,flow=To-Background-UDP-CVUT-DNS-Server
4,2011/08/15 11:00:08.776219,3540.673584,tcp,95.189.14.209,4239,<?>,147.32.84.229,13363,PA_PA,0.0,0.0,738,125232,98459,flow=Background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11205,2011/08/15 15:11:12.287455,0.000349,udp,147.32.85.60,57551,<->,147.32.80.9,53,CON,0.0,0.0,2,542,75,flow=To-Background-UDP-CVUT-DNS-Server
11206,2011/08/15 15:11:13.517021,0.000309,udp,147.32.85.89,39526,<->,147.32.80.9,53,CON,0.0,0.0,2,204,77,flow=To-Background-UDP-CVUT-DNS-Server
11207,2011/08/15 15:11:15.442482,0.000234,udp,147.32.84.138,44134,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,flow=To-Background-UDP-CVUT-DNS-Server
11208,2011/08/15 15:11:16.610585,0.000223,udp,147.32.85.25,54088,<->,147.32.80.9,53,CON,0.0,0.0,2,207,66,flow=To-Background-UDP-CVUT-DNS-Server


# Convert edge-based data to node-based data with corresponding ground truth

In [4]:
# Partition flows into anomaly/normal.
# Label any IP involved in botnet traffic as an anomaly IP node, normal otherwise.
# Create a new dataframe to store "ground truth" data, node-based instead of edge-based.
# Node features are later derived from each node's interactions in the edge df.

df['Anomaly'] = df.Label.apply(lambda x : 'Botnet' in x)
anomalies = df[df['Anomaly']]
normals = df[~df['Anomaly']]

anomaly_ips = np.unique(np.concatenate((anomalies.SrcAddr, anomalies.DstAddr)))
# Sorted, unique addresses seen in normal flows that never appear in a botnet
# flow. One vectorised set difference replaces a Python-level membership test
# per address, and an empty result keeps the input dtype.
normal_ips = np.setdiff1d(
    np.concatenate((normals.SrcAddr, normals.DstAddr)),
    anomaly_ips,
)

# Anomalous nodes come first (label 1), followed by normal nodes (label 0).
ips = np.concatenate((anomaly_ips, normal_ips))
ip_labels = [1] * len(anomaly_ips) + [0] * len(normal_ips)
ip = pd.DataFrame({'Ip': ips, 'Label': ip_labels})
# Feature columns are appended after the columns that exist at this point.
FEAT_START_COL_IDX = len(ip.columns)
ip

Unnamed: 0,Ip,Label
0,147.32.80.9,1
1,147.32.84.165,1
2,147.32.96.69,1
3,199.185.220.200,1
4,209.181.247.105,1
...,...,...
4623,99.249.4.59,0
4624,99.254.24.249,0
4625,99.41.40.35,0
4626,99.59.106.184,0


# Generate features for each node IP, from subset of edges involving node IP

In [5]:
FEATURES = [
    'NumFlows',
    'NumUniqIps',
    'DurAvg', 'DurStd',
    'PktsAvg', 'PktsStd',
    'BytesAvg', 'BytesStd',
]

# Ideas for the feature set (only the FEATURES listed above are implemented):
# - # unique ips corresponded with?
# - # flows involved
# - uniformity of time sequence?
# - distrib of start times
# - total duration
# - proportion tcp vs udp
# - proportion unusual ports
# - avg metrics, eg avg duration per flow, packets per flow

t()

# Collect one feature row per node and attach the columns in a single step.
# This avoids writing floats into int-initialised placeholder columns row by
# row, and it does not depend on which columns `ip` currently has, so the cell
# can be re-run safely.
feature_rows = []
for count, targ_ip in enumerate(ip['Ip'], start=1):

    # Every flow in which this node is either the source or the destination.
    rows = df.loc[(df.SrcAddr == targ_ip) | (df.DstAddr == targ_ip)]

    dur = rows.Dur
    pkts = rows.TotPkts
    _bytes = rows.TotBytes

    feature_rows.append([
        # NumFlows
        len(rows.index),
        # NumUniqIps, don't count own IP
        len(np.unique(np.concatenate((rows.SrcAddr, rows.DstAddr)))) - 1,
        # Population (ddof=0) standard deviations, so a single flow gives 0.
        dur.mean(), dur.std(ddof=0),
        pkts.mean(), pkts.std(ddof=0),
        _bytes.mean(), _bytes.std(ddof=0),
    ])

    # Report progress only after the counted node has actually been processed.
    if count % 1000 == 0:
        t(f'Generated features for {count} nodes')

# Update node features, matching rows to nodes through the shared index.
feature_frame = pd.DataFrame(feature_rows, columns=FEATURES, index=ip.index)
for f in FEATURES:
    ip[f] = feature_frame[f]

count = len(feature_rows)
t(f'FINISHED - Generated features for {count} nodes')

Generated features for 1000 nodes -> 4.6900341510772705 secs
Generated features for 2000 nodes -> 9.275033712387085 secs
Generated features for 3000 nodes -> 13.92999792098999 secs
Generated features for 4000 nodes -> 18.569058418273926 secs
FINISHED - Generated features for 4628 nodes -> 21.50999903678894 secs


In [6]:
# Display the node table to inspect the generated feature columns.
ip

Unnamed: 0,Ip,Label,NumFlows,NumUniqIps,DurAvg,DurStd,PktsAvg,PktsStd,BytesAvg,BytesStd
0,147.32.80.9,1,5008,131,1.443974,71.520200,2.013778,0.595183,260.369209,99.066760
1,147.32.84.165,1,30,11,92.824127,256.590293,5.833333,16.569215,1505.766667,3426.205585
2,147.32.96.69,1,36,2,21.096620,70.539743,1.694444,0.659101,520.833333,917.563728
3,199.185.220.200,1,1,1,9.012997,0.000000,3.000000,0.000000,186.000000,0.000000
4,209.181.247.105,1,1,1,9.012883,0.000000,3.000000,0.000000,186.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
4623,99.249.4.59,0,1,1,26.799858,0.000000,21.000000,0.000000,1659.000000,0.000000
4624,99.254.24.249,0,1,1,3062.356689,0.000000,4.000000,0.000000,262.000000,0.000000
4625,99.41.40.35,0,1,1,1415.990356,0.000000,9.000000,0.000000,1437.000000,0.000000
4626,99.59.106.184,0,1,1,0.161090,0.000000,2.000000,0.000000,215.000000,0.000000


# Save node order + node features + ground truth labels

In [7]:
# Strip the extension from the capture name. splitext copes with names that
# contain more than one dot (or none), where unpacking split('.') into exactly
# two parts raises ValueError.
BASE = os.path.splitext(TARG_NAME)[0]
FEAT_NAME = f'node-{BASE}-sample_{SAMPLE}-features'
FEAT_FILE = os.path.join(TARG_DIR, FEAT_NAME)
# Write Ip, Label and the feature columns explicitly, without header or index.
# Selecting the columns keeps helper columns that a later cell adds to `ip`
# (such as 'Idx') out of the file if this cell is re-run afterwards.
ip[['Ip', 'Label', *FEATURES]].to_csv(FEAT_FILE, header=False, index=False)

# Generate adjacency matrix

In [12]:
# Lookup table: IP address -> index
ip['Idx'] = ip.index
ip_to_idx = dict(ip.loc[:, ('Ip', 'Idx')].to_dict('split')['data'])
ip_to_idx

{'147.32.80.9': 0,
 '147.32.84.165': 1,
 '147.32.96.69': 2,
 '199.185.220.200': 3,
 '209.181.247.105': 4,
 '216.32.180.22': 5,
 '38.229.70.20': 6,
 '70.89.117.194': 7,
 '77.242.119.40': 8,
 '78.40.125.4': 9,
 '94.245.120.86': 10,
 '1.148.95.229': 11,
 '1.228.0.231': 12,
 '1.252.111.154': 13,
 '1.66.104.27': 14,
 '1.66.104.89': 15,
 '1.72.2.164': 16,
 '10.10.20.248': 17,
 '101.102.50.45': 18,
 '101.11.212.152': 19,
 '101.118.55.74': 20,
 '101.12.50.219': 21,
 '101.140.6.162': 22,
 '105.142.55.2': 23,
 '107.10.18.139': 24,
 '107.8.132.120': 25,
 '108.116.153.231': 26,
 '108.16.206.11': 27,
 '108.17.116.149': 28,
 '108.36.154.213': 29,
 '108.43.150.8': 30,
 '108.6.32.7': 31,
 '108.62.152.141': 32,
 '108.65.140.6': 33,
 '109.104.160.44': 34,
 '109.107.200.178': 35,
 '109.111.7.49': 36,
 '109.115.197.142': 37,
 '109.116.208.151': 38,
 '109.117.145.225': 39,
 '109.121.226.3': 40,
 '109.121.240.33': 41,
 '109.123.210.131': 42,
 '109.123.211.195': 43,
 '109.123.211.197': 44,
 '109.123.211.210'

In [13]:
# Translate both endpoint columns from IP strings to node numbers.
# +1 reqd for MATLAB-indexing.
# Each column is mapped with Series.map instead of DataFrame.applymap, which
# newer pandas releases deprecate. Mapping through ip_to_idx.__getitem__ still
# raises KeyError for an address missing from the lookup table.
edges = df[['SrcAddr', 'DstAddr']].apply(lambda col: col.map(ip_to_idx.__getitem__) + 1)
edges

Unnamed: 0,SrcAddr,DstAddr
0,2543,719
1,721,1
2,1516,719
3,764,1
4,4474,699
...,...,...
11205,776,1
11206,787,1
11207,667,1
11208,759,1


In [14]:
t()
# Reduce the flow endpoints to unique undirected edges: (a, b) and (b, a)
# count as the same edge and only the first orientation seen is kept.
# Edges are gathered in a list so the rows come out in first-seen order,
# rather than in the arbitrary iteration order of a set.
seen_pairs = set()
unique_edges = []
for row_idx, ip1, ip2 in edges.itertuples():
    # Orientation-independent key used only for the duplicate check.
    pair_key = (ip1, ip2) if ip1 <= ip2 else (ip2, ip1)
    if pair_key in seen_pairs:
        continue
    seen_pairs.add(pair_key)
    unique_edges.append((ip1, ip2))
adj = pd.DataFrame(unique_edges)
t('Translated edges')

Translated edges -> 0.017999649047851562 secs


# Save adj matrix in MATLAB-friendly format

In [15]:
# Strip the extension from the capture name. splitext copes with names that
# contain more than one dot (or none), where unpacking split('.') into exactly
# two parts raises ValueError.
BASE = os.path.splitext(TARG_NAME)[0]
ADJ_NAME = f'node-{BASE}-sample_{SAMPLE}-adjmatrix'
ADJ_FILE = os.path.join(TARG_DIR, ADJ_NAME)
# One "src,dst" pair of node numbers per line, with no header or index column.
adj.to_csv(ADJ_FILE, header=False, index=False)

# Generate copiable MATLAB code to:
* load features + labels + adjacency into workspace
* save workspace to .mat file

In [20]:
# Assemble the MATLAB snippet one statement per entry. The empty first and
# last entries give the leading and trailing newline of the printed block.
matlab_lines = [
    '',
    f"features = readtable('node-{BASE}-sample_{SAMPLE}-features');",
    "Label = features(:, 2);",
    "Attributes = features(:, 3:end);",
    f"edges = table2array(readtable('node-{BASE}-sample_{SAMPLE}-adjmatrix', 'Format','%u%u'));",
    "Network = adjacency(graph(edges(:, 1), edges(:, 2)));",
    f"save('node-{BASE}-sample_{SAMPLE}-matlab');",
    '',
]
matlab = '\n'.join(matlab_lines)
print(matlab)


features = readtable('node-capture20110815-sample_100-features');
Label = features(:, 2);
Attributes = features(:, 3:end);
edges = table2array(readtable('node-capture20110815-sample_100-adjmatrix', 'Format','%u%u'));
Network = adjacency(graph(edges(:, 1), edges(:, 2)));
save('node-capture20110815-sample_100-matlab');

