# CYBER

Computer networks generate massive amounts of heterogeneous data as users interact with other users, computers, and services. These interactions can be modeled as large, heterogeneous property graphs, with multidimensional characteristics of the communication embedded on an edge connecting nodes. Current techniques to identify subgraph evolution over time and extract anomalies require lengthy compute times or necessitate significant pre-filtering of the graph. In this tutorial, we showcase an approach to flagging anomalous network communications in a large graph using a combination of structural graph features and graph analytics, running end-to-end in RAPIDS.

# Imports

In [None]:
!pip install ipython-autotime
%load_ext autotime

In [None]:
# Import needed libraries
import cugraph
import cudf
from collections import OrderedDict
import numpy as np
import nvstrings

In [None]:
# %load strong_cc.py
#!/usr/bin/env python

# In[ ]:


# Strongly Connected Components

# Import needed libraries
import cugraph
import cudf
from collections import OrderedDict
import numpy as np


# ## Prep

# In[ ]:


# prep
# - build the vertex dataframe used by the propagation loop: one row per
#   vertex together with its current component label
# - the vertex column is named 'src' to allow for joining against the COO
#   data, but it is really a list of all vertices
# - Input:
#      _coo = the COO dataframe (needs 'src' and 'dst' columns)
#      s    = symmetrized => truthy = yes, falsy = no.  If no, 'src' and
#             'dst' have to be combined to obtain every vertex
# - Output:
#      dataframe with columns 'src' (vertex ID) and 'cc' (label, initialised
#      to the vertex ID)
def prep(_coo, s) :
    if s :
        # the caller states that every vertex already appears as a source
        ids = _coo['src']
    else :
        # combine the source and destination columns.  cudf.concat is used
        # rather than Series.append, which is no longer available in
        # pandas-compatible releases.
        ids = cudf.concat([_coo['src'], _coo['dst']], ignore_index=True)

    _vert = cudf.DataFrame()
    _vert['src'] = ids.unique()

    # starting labels are the vertex IDs
    _vert['cc'] = _vert['src']

    return _vert


# ## Loop

# In[ ]:


# define a kernel to update the row
# this fills two output columns: the label to keep and a flag indicating
# whether or not the value changed
#   src    = vertex IDs (not read; present because it is one of the incols)
#   cc     = current label of each vertex
#   cc2    = candidate label for each vertex, -1 meaning "no candidate"
#   out1   = output: the label to keep
#   s2     = output: 1 if the label was lowered, 0 otherwise
#   kwargs = extra argument passed by the caller; not used
def update_row(src, cc, cc2, out1, s2, kwargs) :
    for i, (_, current, candidate) in enumerate(zip(src, cc, cc2)) :
        if candidate != -1 and candidate < current :
            out1[i] = candidate
            s2[i] = 1
        else :
            # Also covers the "no candidate" (-1) case.  Previously that case
            # never wrote out1[i], so the row's new label was whatever the
            # output buffer happened to contain.
            out1[i] = current
            s2[i] = 0


# In[ ]:


# Push every vertex's current label along its outgoing edges.
#   _coo  = edge list with 'src' and 'dst' columns
#   _vert = vertex table with 'src' (vertex ID) and 'cc' (current label)
# Returns a dataframe with one row per distinct 'dst' whose 'cc' column holds
# the smallest label among the sources pointing at that destination.
def propogate(_coo, _vert) :
    # merge (join) on src so that each edge carries its source's label
    edges = _coo.merge(_vert, on=['src'], how='left')

    # Keep only the columns the aggregation needs.  This replaces a positional
    # drop(['src']): pandas-compatible APIs read a positional label list as
    # row labels and raise KeyError, whereas column selection is unambiguous.
    edges = edges[['dst', 'cc']]

    aggs = OrderedDict()
    aggs['cc'] = 'min'

    return edges.groupby(['dst'], as_index=False).agg(aggs)


# In[ ]:


# Fold the propagated labels back into the vertex table.
#   _vert = vertex table with 'src' (vertex ID) and 'cc' (current label)
#   _tmp  = result of propogate: 'dst' (vertex ID) and 'cc' (pushed label)
# Returns (status, z):
#   status = max of the per-row "changed" flag, i.e. truthy when at least one
#            label was lowered in this pass
#   z      = the vertex table with 'cc' replaced by the updated labels
def update_cc(_vert, _tmp) :
    # rename so the pushed labels join on 'src' and do not clash with 'cc'
    t2 = _tmp.rename(columns={'dst':'src','cc': 'cc2'})

    # now merge the pushed labels into _vert
    _v = _vert.merge(t2, on=['src'], how='left')

    # Vertices that were never a 'dst' get a null cc2 from the left merge.
    # update_row tests for -1 to detect "no candidate", so make that sentinel
    # explicit instead of relying on whatever sits under the null mask.
    # NOTE(review): assumes -1 is never a real vertex ID - confirm.
    _v['cc2'] = _v['cc2'].fillna(-1)

    z = _v.apply_rows(update_row,
                      incols=['src', 'cc', 'cc2'],
                      # np.bool_ rather than the np.bool alias, which NumPy
                      # 1.24 removed
                      outcols=dict(out1=np.int64, s2=np.bool_),
                      kwargs=dict(kwargs=1)
                     )

    status = z['s2'].max()

    # Drop the working columns by selecting the ones to keep; this avoids the
    # legacy in-place drop_column() call.
    keep = [name for name in z.columns if name not in ('cc', 'cc2', 's2')]
    z = z[keep].rename(columns={'out1':'cc'})

    return status, z


# In[ ]:


# Label vertices by repeated minimum-label propagation.
# - every vertex starts with its own ID as its label (prep)
# - each pass pushes labels along edges from 'src' to 'dst' (propogate) and
#   keeps the smaller of the old and the pushed label (update_cc)
# - the loop ends when a pass reports that no label changed
# - Input:  coo_df = COO dataframe with 'src' and 'dst' columns
# - Output: dataframe with 'src' (vertex ID) and 'cc' (final label)
# NOTE(review): labels only travel in the src -> dst direction; confirm this
# matches the intended definition of a "strongly" connected component.
def strong_cc(coo_df):

    vert_gdf = prep(coo_df, False)

    changed = 1
    while changed == 1 :
        pushed = propogate(coo_df, vert_gdf)
        changed, vert_gdf = update_cc(vert_gdf, pushed)
    # end while

    return vert_gdf



# First Step - load and prep the data

In [None]:
#Download the four UNSW-NB15 data files
#!wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_1.csv
#!wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_2.csv
#!wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_3.csv
#!wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_4.csv

In [None]:
# The four UNSW-NB15 CSV parts, in numeric order
datafiles = ['UNSW-NB15_{}.csv'.format(part) for part in range(1, 5)]

In [None]:
# Column schema of the CSV files: column name -> dtype string, in file order.
# The names are reproduced exactly, including the embedded space in
# 'ct_src_ ltm'.
col_dtypes = OrderedDict((
    ('srcip', 'str'),
    ('sport', 'int32'),
    ('dstip', 'str'),
    ('dsport', 'int64'),
    ('proto', 'str'),
    ('state', 'str'),
    ('dur', 'float64'),
    ('sbytes', 'int64'),
    ('dbytes', 'int64'),
    ('sttl', 'int64'),
    ('dttl', 'int64'),
    ('sloss', 'int64'),
    ('dloss', 'int64'),
    ('service', 'str'),
    ('Sload', 'float64'),
    ('Dload', 'float64'),
    ('Spkts', 'int64'),
    ('Dpkts', 'int64'),
    ('swin', 'int64'),
    ('dwin', 'int64'),
    ('stcpb', 'int64'),
    ('dtcpb', 'int64'),
    ('smeansz', 'int64'),
    ('dmeansz', 'int64'),
    ('trans_depth', 'int64'),
    ('res_bdy_len', 'int64'),
    ('Sjit', 'float64'),
    ('Djit', 'float64'),
    ('Stime', 'str'),
    ('Ltime', 'str'),
    ('Sintpkt', 'float64'),
    ('Dintpkt', 'float64'),
    ('tcprtt', 'float64'),
    ('synack', 'float64'),
    ('ackdat', 'float64'),
    ('is_sm_ips_ports', 'int8'),
    ('ct_state_ttl', 'int64'),
    ('ct_flw_http_mthd', 'int64'),
    ('is_ftp_login', 'int8'),
    ('ct_ftp_cmd', 'int64'),
    ('ct_srv_src', 'int64'),
    ('ct_srv_dst', 'int64'),
    ('ct_dst_ltm', 'int64'),
    ('ct_src_ ltm', 'int64'),
    ('ct_src_dport_ltm', 'int64'),
    ('ct_dst_sport_ltm', 'int64'),
    ('ct_dst_src_ltm', 'int64'),
    ('attack_cat', 'str'),
    ('Label', 'int8'),
))

# Column names in file order, taken from the schema so the two cannot drift
cols = list(col_dtypes.keys())

In [None]:
# read one CSV file into a cuDF dataframe.
#   f  = file name
#   c  = column names, in file order
#   dt = ordered mapping of column name -> dtype; only its values are passed on
def read_data(f, c, dt) :
    print("reading " + f)
    column_types = list(dt.values())
    frame = cudf.read_csv(f, names=c, delimiter=',', dtype=column_types)
    return frame

In [None]:
# adds two integer columns, 'src' and 'dst', to the dataframe in place
# (nothing is returned)
def ip_str_to_int(_gdf) :
    # convert each string IP address column into its integer form
    for target, source in (('src', 'srcip'), ('dst', 'dstip')) :
        _gdf[target] = _gdf[source].str.ip2int()


In [None]:
# load each of the four CSV parts into its own dataframe, in list order
gdf1, gdf2, gdf3, gdf4 = [
    read_data(name, cols, col_dtypes) for name in datafiles
]

In [None]:
# merge the data sets together: concatenate the four per-file dataframes
# into the single dataframe used by the rest of the notebook
gdf = cudf.concat([gdf1, gdf2, gdf3, gdf4])

In [None]:
# the per-file dataframes are no longer needed once merged; drop the names
# to reclaim space
del gdf1, gdf2, gdf3, gdf4

In [None]:
# convert the String IP addresses into integer values; this adds the 'src'
# and 'dst' columns to gdf in place
ip_str_to_int( gdf)

In [None]:
# show the null count of the 'attack_cat' column
gdf['attack_cat'].null_count

# Next Step

In [None]:
# peek at the first two rows, converted to pandas for display
gdf.head(2).to_pandas()

In [None]:
# edge list (COO form) holding only the integer endpoints of every record
coo = cudf.DataFrame()
coo['src'], coo['dst'] = gdf['src'], gdf['dst']

In [None]:
# label every vertex of the edge list; v has one row per vertex with its
# final 'cc' label
v = strong_cc(coo)

In [None]:
# number of distinct labels, i.e. how many components were found
len(v['cc'].unique())

In [None]:
import networkx as nx

In [None]:
# copy the edge list into a pandas dataframe for use with NetworkX
pdf = coo.to_pandas()

In [None]:
# empty directed graph that the edges are added to below
G = nx.DiGraph()

In [None]:
# add every (src, dst) pair as a directed edge in one bulk call; this avoids
# the per-row positional .iloc lookups of an index loop over the dataframe
G.add_edges_from(zip(pdf['src'].values, pdf['dst'].values))

In [None]:
# show the source value of the first edge
pdf['src'].iloc[0]