In [None]:
import pandas as pd
import polars as pl
import numpy  as np
import networkx as nx
from math import sin, cos, pi, sqrt, atan2
from os.path import exists
import time
from rtsvg import *
# RACETrack instance used for every rendering / helper call in this notebook.
rt = RACETrack()

# Read the three netflow chunks into a single polars frame and time the whole load.
ts1 = time.time()
_nf_dir_ = '../../data/2013_vast_challenge/mc3_netflow/nf'
df = pl.concat([pl.read_csv(f'{_nf_dir_}/nf-chunk{_chunk_}.csv') for _chunk_ in (1, 2, 3)])
# NOTE(review): presumably converts the 'parsedDate' column to a timestamp dtype -- confirm against rtsvg.
df = rt.columnsAreTimestamps(df, 'parsedDate')
ts2 = time.time()
# Fixed-point format ('f'): without it a precision of 2 means "2 significant digits"
# and anything >= 10 seconds is printed in scientific notation.
print(f'Loading Time ... {ts2 - ts1:0.2f} sec')

# Trim the frame to the fields used below and give the survivors short names.
# Kept (and renamed): parsedDate, ipLayerProtocol, the src/dest ip + port fields,
# durationSeconds, and the src/dest total-byte and packet counts.
df = (
    df.drop([
        'TimeSeconds',
        'dateTimeStr',
        'ipLayerProtocolCode',
        'moreFragments',
        'contFragments',
        'firstSeenSrcPayloadBytes',
        'firstSeenDestPayloadBytes',
        'recordForceOut',
    ])
    .rename({
        'parsedDate':               'ts',
        'ipLayerProtocol':          'pro',
        'firstSeenSrcIp':           'sip',
        'firstSeenDestIp':          'dip',
        'firstSeenSrcPort':         'spt',
        'firstSeenDestPort':        'dpt',
        'durationSeconds':          'dur',
        'firstSeenSrcTotalBytes':   'soct',
        'firstSeenDestTotalBytes':  'doct',
        'firstSeenSrcPacketCount':  'spkt',
        'firstSeenDestPacketCount': 'dpkt',
    })
)

# Distinct values appearing as either source or destination, then a small random preview.
_all_ips_ = set(df['sip']).union(df['dip'])
print('total nodes = ', len(_all_ips_))
df.sample(3)

In [2]:
# Build the sip -> dip graph and choose the starting node positions.
layout_file = '../../data/2013_vast_challenge/mc3_netflow/spring_layout.csv'
relates     = [('sip','dip')]
g           = rt.createNetworkXGraph(df, relates)
if exists(layout_file):
    # A saved layout exists; start empty (the file is loaded into the layout widget in a later cell).
    pos = {}
else:
    # No saved layout yet: fall back to a freshly computed spring layout.
    pos = nx.spring_layout(g)
#rt.link(df, relates, pos)

In [None]:
# One row per distinct (sip, dip) pair.
df_uniqs    = df.unique(subset=['sip','dip'])
total_nodes = len(set(df_uniqs['sip']).union(df_uniqs['dip']))
print(f'{len(df_uniqs)=} | {total_nodes=}')

# Interactive layout over the de-duplicated edges; a saved layout is loaded when the file exists.
_igl_ = rt.interactiveGraphLayout(
    df_uniqs,
    {'relationships':relates, 'pos':pos, 'draw_labels':False, 'bounds_percent':0.02},
    w=1200,
    h=800,
)
if exists(layout_file):
    _igl_.loadLayout(layout_file)
#_igl_

In [4]:
#_igl_.saveLayout(layout_file)
#print(_igl_)

In [5]:
# Collapse the original graph into one based on position of nodes in xy coordinates
g      = rt.createNetworkXGraph(df_uniqs, relates)
#_link_ = rt.link(df_uniqs, relates, pos, link_size=None, w=1200, h=900)
_link_ = rt.link(df_uniqs, relates, pos, w=1200, h=900)
_link_.renderSVG() # force a render so that xT and yT exist
all_nodes    = set(df_uniqs['sip']) | set(df_uniqs['dip'])
all_nodes_ls = list(all_nodes)

# Start the bounds at +/- infinity so the first node always replaces them,
# whatever the magnitude of the transformed coordinates.
x_min, y_min, x_max, y_max = float('inf'), float('inf'), float('-inf'), float('-inf')
node_to_xy = {}  # node   -> (x, y) after the xT / yT transform
xy_to_node = {}  # (x, y) -> list of nodes that land on exactly that coordinate
for node in all_nodes:
    x,y              = _link_.xT(pos[node][0]), _link_.yT(pos[node][1])
    xy               = (x,y)
    x_min, x_max     = min(x_min, x), max(x_max, x)
    y_min, y_max     = min(y_min, y), max(y_max, y)
    node_to_xy[node] = xy
    xy_to_node.setdefault(xy, []).append(node)
#_link_

In [6]:
# Disabled experiment: the code below is only stored as a string and is not
# executed anywhere in the visible notebook. It normalizes the node xy positions
# to the x/y bounds, clusters them with HDBSCAN and plots the labels with rt.xy.
_not_useful_ = '''
import hdbscan
vecs  = []
xy_lu = {'x':[], 'y':[], 'c':[]}
for node in all_nodes_ls:
    xy  = node_to_xy[node]
    v_i = [(xy[0]-x_min)/(x_max-x_min), (xy[1]-y_min)/(y_max-y_min)]
    vecs.append(v_i)
    xy_lu['x'].append(xy[0]), xy_lu['y'].append(xy[1])
clusterer = hdbscan.HDBSCAN()
clusterer.fit(vecs)
for c in clusterer.labels_: xy_lu['c'].append(c)
rt.xy(pl.DataFrame(xy_lu), x_field='x', y_field='y', color_by='c', w=800, h=600)
'''

In [7]:
# Disabled experiment (kept as a string, not executed in the visible notebook):
# histogram of node degrees, plus a count of nodes whose degree is at least
# each threshold in 500..999.
_degree_analysis_ = '''
degrees_lu = {'degree':[]}
for node in all_nodes_ls: degrees_lu['degree'].append(g.degree(node))
rt.histogram(pl.DataFrame(degrees_lu), bin_by='degree')
degree_histogram_lu = {'threshold':[], 'count':[]}
for i in range (500, 1000):
    count = 0
    for node in all_nodes_ls:
        if g.degree(node) >= i: count += 1
    degree_histogram_lu['threshold'].append(i), degree_histogram_lu['count'].append(count)
rt.xy(pl.DataFrame(degree_histogram_lu), x_field='threshold', y_field='count')
'''

In [8]:
# Disabled experiment (kept as a string, not executed in the visible notebook):
# rasterizes the node positions at 600x400 and runs rt.levelSetBalanced from a
# handful of origin indices, one per distinguishable color.
# NOTE(review): node_index is built from all_nodes_ls order, not from
# degree_sorter, so origins 0..N-1 are not necessarily the highest-degree nodes
# as the inline comment says -- confirm before re-enabling.
_not_this_ = '''
# Degree sorting (and node indexing from 0...N-1)
degree_sorter = []
for node in all_nodes_ls: degree_sorter.append((g.degree(node), node))
degree_sorter.sort(reverse=True)
node_index, rev_node_index = {}, {}
for i, node in enumerate(all_nodes_ls): node_index[node], rev_node_index[i] = i, node

# Only create enough nborhoods to cover distinguishable colors
distinguishable_colors = rt.co_mgr.colorgoricalColors()
my_origins             = []
for i in range(len(distinguishable_colors)): my_origins.append(i) # nodes with the highest degree

# Raster creation
_link_ = rt.link(df_uniqs, relates, pos, w=600,h=400)
_link_.renderSVG() # force a render so that xT and yT exist
w,h = _link_.w, _link_.h
my_raster = [[None for x in range(w)] for y in range(h)]
for _node_ in pos.keys():
    x, y = int(_link_.xT(pos[_node_][0])), int(_link_.yT(pos[_node_][1]))
    if my_raster[y][x] is None: my_raster[y][x] = set()
    my_raster[y][x].add(node_index[_node_])

# Perform the balanced level set algorithm
# ... 33.6s @ 1200x900 (amd 7900x w/ 96g memory)
# ...  7.2s @  600x400 (amd 7900x w/ 96g memory)
# ...  5.3s @  500x350 (amd 7900x w/ 96g memory)
# ...  3.2s @  400x300 (amd 7900x w/ 96g memory)
# ...  1.8s @  300x200 (amd 7900x w/ 96g memory)
t0 = time.time()
my_state, my_found_time, my_finds, my_progress_lu = rt.levelSetBalanced(my_raster, my_origins, 0)
t1 = time.time()
print(f'{t1-t0:0.2f}s')
'''

In [9]:
# Disabled debug view (kept as a string, not executed in the visible notebook):
# tiles the level-set state / found-time rendering above a plot of heap size
# per iteration. Relies on names produced by the disabled `_not_this_` snippet.
_debug_step_ = '''
rt.tile([rt.levelSetStateAndFoundTimeSVG(my_state,my_found_time),
         rt.xy(pl.DataFrame(my_progress_lu), x_field='iteration', y_field='heapsize', color_by='origin', dot_size='tiny', w=1024, h=128)], horz=False)
'''

In [10]:
# Disabled follow-up to the `_not_this_` snippet (kept as a string, not executed
# in the visible notebook): turns each entry of my_finds into a named group of
# nodes, assigns the group a color, and renders the groups as convex hulls.
_paired_with_not_this_ = '''
my_convex_hulls = {}
group_i = 0
for x in my_finds:
    my_convex_hulls[f'group {group_i}'] = []
    for node_id in my_finds[x]:
        node = rev_node_index[node_id]
        my_convex_hulls[f'group {group_i}'].append(node)
    rt.co_mgr.str_to_color_lu[f'group {group_i}'] = distinguishable_colors[group_i]
    group_i += 1

_link_ = rt.link(df_uniqs, relates, pos, link_size=None, node_size='small', w=900, h=600, convex_hull_lu=my_convex_hulls)
_link_
'''

In [11]:
# Disabled experiment (kept as a string, not executed in the visible notebook):
# uses the positions of the highest-degree nodes (one per color) as Voronoi
# sites and draws the resulting cells as filled SVG paths.
# NOTE(review): the opening '<svg ... />' tag in the snippet is self-closing even
# though paths and a closing '</svg>' follow -- fix before re-enabling.
_voronoi_version_ = '''
# Degree sorting (and node indexing from 0...N-1)
degree_sorter = []
for node in all_nodes_ls: degree_sorter.append((g.degree(node), node))
degree_sorter.sort(reverse=True)
node_index, rev_node_index = {}, {}
for i, node in enumerate(all_nodes_ls): node_index[node], rev_node_index[i] = i, node

# Only create enough nborhoods to cover distinguishable colors
distinguishable_colors = rt.co_mgr.brewerColors('qualitative', 12)
my_origins             = []
for i in range(len(distinguishable_colors)):
    rt.co_mgr.str_to_color_lu[i] = distinguishable_colors[i]
    my_origins.append(i) # nodes with the highest degree

# Voronoi creation
_link_ = rt.link(df_uniqs, relates, pos, w=600,h=400)
_link_.renderSVG() # force a render so that xT and yT exist
w,h = _link_.w, _link_.h
voronoi_pts = []
for i in range(len(distinguishable_colors)):
    _node_ = degree_sorter[i][1]
    x, y = _link_.xT(pos[_node_][0]), _link_.yT(pos[_node_][1])
    voronoi_pts.append((x,y))
nborhoods = rt.isedgarVoronoi(voronoi_pts, Box=[(0,h),(w,h),(w,0),(0,0)])
_v_svg_ = [f'<svg x="0" y="0" width="{w}" height="{h}"/>']
for _poly_i_ in range(len(nborhoods)):
    _poly_ = nborhoods[_poly_i_]
    d = f'M {_poly_[0][0]} {_poly_[0][1]}  '
    for i in range(1, len(_poly_)): d += f'L {_poly_[i][0]} {_poly_[i][1]} '
    d += 'Z'
    _v_svg_.append(f'<path d="{d}" fill="{rt.co_mgr.str_to_color_lu[_poly_i_]}" />')
_v_svg_.append('</svg>')
rt.tile([''.join(_v_svg_)])
'''

In [None]:
# k-means version
import random
# Nodes in a fixed order, a reverse index into that order, and each node's
# transformed (xT / yT) coordinates in the same order.
_node_ls_   = list(pos.keys())
_node_to_i_ = {_node_: i for i, _node_ in enumerate(_node_ls_)}
_node_sxy_  = [(_link_.xT(pos[_node_][0]), _link_.yT(pos[_node_][1])) for _node_ in _node_ls_]

# Bounding box over EVERY node's coordinates. The random initial centers are
# drawn inside this box, so it has to span the whole layout; a box taken from a
# single point would stack all initial centers on that one spot.
sx_min, sx_max = min(_sxy_[0] for _sxy_ in _node_sxy_), max(_sxy_[0] for _sxy_ in _node_sxy_)
sy_min, sy_max = min(_sxy_[1] for _sxy_ in _node_sxy_), max(_sxy_[1] for _sxy_ in _node_sxy_)

# Distinguishable colors
distinguishable_colors = rt.co_mgr.brewerColors('qualitative', 10)

# Parameters for K-Means (one cluster per available color)
k, iters = len(distinguishable_colors), 100

# Seed the RNG so that Restart & Run All reproduces the same clustering.
# Set kmeans_seed to None to get a different random start on every run.
kmeans_seed = 42
random.seed(kmeans_seed)

# Make random cluster centers, uniformly inside [sx_min, sx_max] x [sy_min, sy_max]
cluster_centers = {}
for i in range(k):
    cluster_centers[i] = (random.random() * (sx_max - sx_min) + sx_min,
                          random.random() * (sy_max - sy_min) + sy_min)

# Iterate K-Means
for _iter_ in range(iters):
    # Assignment step: every node joins the center with the smallest squared distance.
    # min() keeps the first minimum, so ties go to the lowest center index.
    center_assignments = {}
    for _node_, _sxy_ in zip(_node_ls_, _node_sxy_):
        closest_center = min(
            range(k),
            key=lambda c: (_sxy_[0] - cluster_centers[c][0])**2 + (_sxy_[1] - cluster_centers[c][1])**2,
        )
        center_assignments.setdefault(closest_center, []).append(_node_)
    # If there are any centers without nodes, assign a random node to them
    for i in range(k):
        if i in center_assignments:
            continue
        center_assignments[i] = [random.choice(_node_ls_)]
    # Update step: move each center to the mean position of the nodes assigned to it
    for i in range(k):
        _members_ = center_assignments[i]
        sx, sy = 0, 0
        for _node_ in _members_:
            _member_sxy_ = _node_sxy_[_node_to_i_[_node_]]
            sx = sx + _member_sxy_[0]
            sy = sy + _member_sxy_[1]
        cluster_centers[i] = (sx/len(_members_), sy/len(_members_))

# One Voronoi site per cluster center, in cluster-id order.
voronoi_pts = [cluster_centers[i] for i in cluster_centers]

# Voronoi cells clipped to the rendering's w x h box.
# NOTE(review): assumes nborhoods[i] is the cell of voronoi_pts[i] -- confirm against rtsvg.
nborhoods = rt.isedgarVoronoi(voronoi_pts, Box=[(0,_link_.h),(_link_.w,_link_.h),(_link_.w,0),(0,0)])

# The opening <svg> tag must stay open (no trailing '/'): the <g>, the paths and
# the closing </svg> appended below are its children.
voronoi_svg = [f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}"><g opacity="0.4">']
for _poly_i_ in range(len(nborhoods)):
    _poly_ = nborhoods[_poly_i_]
    # Closed path: move to the first vertex, line to each of the others, then close.
    d = f'M {_poly_[0][0]} {_poly_[0][1]}  '
    d += ''.join(f'L {_pt_[0]} {_pt_[1]} ' for _pt_ in _poly_[1:])
    d += 'Z'
    voronoi_svg.append(f'<path d="{d}" fill="{distinguishable_colors[_poly_i_]}" />')
voronoi_svg.append('</g></svg>')

# Mark every cluster center with a dot in its color and, just below it, the
# number of nodes assigned to that center in the final iteration.
k_svg = []
for i in cluster_centers:
    sx, sy = cluster_centers[i][0], cluster_centers[i][1]
    k_svg.append(f'<circle cx="{sx}" cy="{sy}" r="5" fill="{distinguishable_colors[i]}"/>')
    # Every attribute value is quoted; an unterminated y="..." would swallow the attributes after it.
    k_svg.append(f'<text x="{sx}" y="{sy+16}" stroke="#ff0000" text-anchor="middle">{len(center_assignments[i])}</text>')

# Wrapper sized like the link rendering. It has to be an f-string and use the
# real SVG attributes (width / height) for the size to take effect.
# rt.tile([f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">' + '<g opacity="0.8">' + _link_.renderSVG() + '</g>' + ''.join(voronoi_svg) + ''.join(k_svg) + '</svg>'])
rt.tile([f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">' + ''.join(voronoi_svg) + ''.join(k_svg) + '</svg>'])

In [13]:
# Draw each node as a small dot in its cluster's color and record node -> cluster id.
# A node listed under several clusters ends up mapped to the last one iterated.
node_to_cluster_center = {}
nodes_in_cluster_svg   = [f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">']
for i, _members_ in center_assignments.items():
    _fill_ = distinguishable_colors[i]
    for _node_ in _members_:
        sx = _link_.xT(pos[_node_][0])
        sy = _link_.yT(pos[_node_][1])
        nodes_in_cluster_svg.append(f'<circle cx="{sx}" cy="{sy}" r="3" fill="{_fill_}"/>')
        node_to_cluster_center[_node_] = i
nodes_in_cluster_svg.append('</svg>')
# rt.tile([''.join(nodes_in_cluster_svg)]) # this just matches the voronoi diagram from the last step

In [14]:
# Tag every flow with the cluster id of its destination node.
df = df.with_columns(
    pl.col('dip').replace_strict(node_to_cluster_center).alias('dip_cluster')
)
# De-duplicate to one row per (sip, dip, dip_cluster), then count rows per
# (sip, dip_cluster); the count lands in the 'len' column.
df_degree_count = (
    df.unique(subset=['sip','dip','dip_cluster'])
      .group_by(['sip','dip_cluster'])
      .agg(pl.len())
)
#df_connection_count = df.group_by(['sip','dip_cluster']).agg(pl.len())
color_order = rt.colorRenderOrder(df_degree_count, 'dip_cluster', 'len')

In [15]:
def shrinkPoints(center_xy, xys, r=4):
    """Return a copy of `xys` with every point moved `r` units along
    `rt.unitVector((point, center_xy))`.

    center_xy -- (x, y) of the reference point
    xys       -- iterable of (x, y) points
    r         -- distance to move each point (default 4)

    NOTE(review): presumably rt.unitVector returns the unit direction from the
    point toward center_xy, so points move toward the center -- confirm.
    """
    def _moved(pt):
        direction = rt.unitVector((pt, center_xy))
        return (pt[0] + direction[0]*r, pt[1] + direction[1]*r)

    return [_moved(pt) for pt in xys]

# Two renderings of the same Voronoi cells: solid fills, and outlines only.
# The opening <svg> tags must stay open (no trailing '/'): the paths and the
# closing </svg> appended below are their children.
voronoi_fills_svg = [f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">']
voronoi_edges_svg = [f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">']
for _poly_i_ in range(len(nborhoods)):
    center_xy = cluster_centers[_poly_i_]
    # _poly_ = shrinkPoints(center_xy, nborhoods[_poly_i_])
    _poly_ = nborhoods[_poly_i_]
    # Closed path: move to the first vertex, line to each of the others, then close.
    d = f'M {_poly_[0][0]} {_poly_[0][1]}  '
    d += ''.join(f'L {_pt_[0]} {_pt_[1]} ' for _pt_ in _poly_[1:])
    d += 'Z'
    voronoi_edges_svg.append(f'<path d="{d}" fill="none" stroke-width="5" stroke="{distinguishable_colors[_poly_i_]}" />')
    voronoi_fills_svg.append(f'<path d="{d}" fill="{distinguishable_colors[_poly_i_]}" />')
voronoi_edges_svg.append('</svg>')
voronoi_fills_svg.append('</svg>')
#rt.tile([''.join(voronoi_fills_svg)])

In [None]:
# Register every cluster id with its color, under both the int and the str key.
for i in range(len(distinguishable_colors)):
    rt.co_mgr.str_to_color_lu[i]      = distinguishable_colors[i]
    rt.co_mgr.str_to_color_lu[str(i)] = distinguishable_colors[i]

# One glyph per source node, drawn at the node's transformed position.
glyph_svgs = [f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">']
for i in center_assignments:
    _nodes_ = set(center_assignments[i])
    _df_    = df_degree_count.filter(pl.col('sip').is_in(_nodes_))
    # The group key is named _sip_key_ (not `k`) so the K-Means cluster count `k` is not overwritten.
    for _sip_key_, _sip_df_ in _df_.group_by('sip'):
        sx, sy = _link_.xT(pos[_sip_key_[0]][0]), _link_.yT(pos[_sip_key_[0]][1])
        # Rows for destinations inside this node's own cluster vs. in any other cluster.
        # Plain `!=` here: `~col != i` would bitwise-negate the column before comparing.
        _within_  = _sip_df_.filter(pl.col('dip_cluster') == i)
        _without_ = _sip_df_.filter(pl.col('dip_cluster') != i)
        # NOTE(review): _within_ only holds rows with dip_cluster == i, so the set below has
        # at most one element and the ratio is 0 or 1/len(_nodes_). If the intent is
        # "share of own-cluster destinations", _within_['len'].sum() may be what was meant -- confirm.
        glyph_svg = rt.concentricGlyph(_without_, sx, sy, 0.0, len(set(_within_['dip_cluster']))/len(_nodes_), order=color_order, nbor='dip_cluster', count_by='len')
        glyph_svgs.append(glyph_svg)
glyph_svgs.append('</svg>')
rt.tile([f'<svg x="0" y="0" width="{_link_.w}" height="{_link_.h}">' + 
         ''.join(voronoi_fills_svg) + 
         ''.join(glyph_svgs) + 
         '</svg>'])

In [None]:
# Bare expression as the cell's last line: displays the rt.link rendering
# created earlier (1200x900, de-duplicated sip -> dip edges) as the cell output.
_link_