In [None]:
import pandas as pd
import polars as pl
import numpy as np
import heapq
import sys
sys.path.insert(1, '../rtsvg')
from rtsvg import *
# Single RACETrack instance shared by the cells below (chordDiagram, tile, xy, linkNode).
rt = RACETrack()

In [None]:
# Tiny hand-made directed edge list used to compare the chord diagram rendering options.
_edges_ = [('a','b'), ('a','d'), ('b','a'), ('b','c'),
           ('b','d'), ('c','d'), ('e','d'), ('f','d')]
df = pd.DataFrame(_edges_, columns=['fm','to'])

# Arguments common to every rendering in the comparison.
params = {'df':df, 'relationships':[('fm','to')], 'draw_labels':True, 'txt_h':20, 'w':256, 'h':256, 'x_ins':10, 'y_ins':10}

# One entry per tile: defaults, wide links, equal-size nodes, and both options together.
_variants_ = [{},
              {'link_style':'wide'},
              {'equal_size_nodes':True},
              {'link_style':'wide', 'equal_size_nodes':True}]
rt.tile([rt.chordDiagram(**_variant_, **params) for _variant_ in _variants_])

In [None]:
# 2024-03-08 - doesn't work: module 'networkx' has no attribute 'to_scipy_sparse_matrix'
# (the function was removed in networkx 3.0; 'to_scipy_sparse_array' is its replacement).
#
# Following from the documentation located here:
#
# https://markov-clustering.readthedocs.io/en/latest/readme.html
#

#import markov_clustering as mc
#import networkx as nx
#import random
## number of nodes to use
#numnodes = 200
## generate random positions as a dictionary where the key is the node id and the value
## is a tuple containing 2D coordinates
#positions = {i:(random.random() * 2 - 1, random.random() * 2 - 1) for i in range(numnodes)}
## use networkx to generate the graph
#network = nx.random_geometric_graph(numnodes, 0.3, pos=positions)
## then get the adjacency matrix (in sparse form)
#matrix = nx.to_scipy_sparse_matrix(network)

In [None]:
# Toggle between the real netflow capture and a one-row stand-in frame.
load_lots_of_netflow = True
if load_lots_of_netflow:
    _base_ = '../../data/2013_vast_challenge/mc3_netflow/nf/'
    # Read the three chunk files found under _base_ and stack them into one frame.
    df = pl.concat([pl.read_csv(_base_ + _name_)
                    for _name_ in ('nf-chunk1.csv', 'nf-chunk2.csv', 'nf-chunk3.csv')])
    # Shorten the verbose column names to the short field names used by the cells below.
    df = df.rename({'TimeSeconds':'secs',
                    'parsedDate':'timestamp',
                    'dateTimeStr':'timestamp_str',
                    'ipLayerProtocol':'pro_str',
                    'ipLayerProtocolCode':'pro',
                    'firstSeenSrcIp':'sip',
                    'firstSeenDestIp':'dip',
                    'firstSeenSrcPort':'spt',
                    'firstSeenDestPort':'dpt',
                    'moreFragments':'mfrag',
                    'contFragments':'cfrag',
                    'durationSeconds':'dur',
                    'firstSeenSrcPayloadBytes':'soct_pay',
                    'firstSeenDestPayloadBytes':'doct_pay',
                    'firstSeenSrcTotalBytes':'soct',
                    'firstSeenDestTotalBytes':'doct',
                    'firstSeenSrcPacketCount':'spkt',
                    'firstSeenDestPacketCount':'dpkt',
                    'recordForceOut':'out'})
    # Down-sample so the diagrams stay tractable.  The sample size is capped at the
    # frame height (sampling without replacement raises when asked for more rows than
    # exist) and seeded so that a kernel restart reproduces the same rows.
    _sample_n_, _sample_seed_ = 100000, 0
    df = df.sample(min(_sample_n_, df.height), seed=_sample_seed_)
else:
    df = pl.DataFrame({'sip':['1.2.3.4'], 'dip':['5.6.7.8']})

# Same diagram twice; the only difference is whether the hdbscan dendrogram algorithm is requested.
_cd_params_ = {'equal_size_nodes':False, 'draw_labels':False, 'txt_h':16, 'w':200, 'h':200, 'x_ins':2, 'y_ins':2}
cd  = rt.chordDiagram(df, [('sip','dip')], dendrogram_algorithm='hdbscan', **_cd_params_)
cd2 = rt.chordDiagram(df, [('sip','dip')], dendrogram_algorithm=None,      **_cd_params_)
rt.tile([cd,cd2])

In [None]:
# 'dendrogram' entry of each diagram's time_lu lookup, shown side by side
# (presumably the time spent in the dendrogram step -- confirm against rtsvg).
tuple(_diagram_.time_lu['dendrogram'] for _diagram_ in (cd, cd2))

In [None]:
import hdbscan

def __spanMid__(span):
    """Map an arc span (start, end) to a single coordinate: (start + end)/720 - 0.5.

    Presumably spans are angles in degrees, which would put the result in
    [-0.5, 0.5] -- TODO confirm against the chordDiagram implementation.
    """
    return (span[0] + span[1]) / 720.0 - 0.5

# Convert every distinct 'fm|to' entry of cd.node_dir_arc into one 2-D point:
# the first coordinate comes from the arc stored under the node where the pair is
# first encountered, the second from the arc stored under the pair's other endpoint.
# NOTE(review): because of that, the point is (fm side, to side) when the pair is
# first seen at its 'fm' node and (to side, fm side) otherwise -- confirm that this
# orientation is intended.
handled   = set()   # 'fm|to' keys already converted
fmto_list = []      # key of each point, parallel to span_list
span_list = []      # (x, y) points handed to the clusterer
xs_list   = []      # x coordinates, parallel to span_list
ys_list   = []      # y coordinates, parallel to span_list
for node in cd.node_dir_arc:
    for fm in cd.node_dir_arc[node]:
        for to in cd.node_dir_arc[node][fm]:
            key = str(fm) + '|' + str(to)
            if key in handled:
                continue
            handled.add(key)
            # The endpoint of the pair that is not the node currently being walked.
            other    = to if fm == node else fm
            fm_coord = __spanMid__(cd.node_dir_arc[node][fm][to])
            to_coord = __spanMid__(cd.node_dir_arc[other][fm][to])
            fmto_list.append(key)
            span_list.append((fm_coord, to_coord))
            xs_list.append(fm_coord)
            ys_list.append(to_coord)

clusterer = hdbscan.HDBSCAN()
clusterer.fit(span_list)
# HDBSCAN marks noise samples with the label -1; leave it out so the figure
# printed here is the number of actual clusters.
print("n_clusters =", len({int(_label_) for _label_ in clusterer.labels_ if _label_ >= 0}))
rt.xy(pd.DataFrame({'x':xs_list,'y':ys_list,'c':clusterer.labels_}),x_field='x',y_field='y',color_by='c')

In [None]:
import networkx as nx

# Flatten the single linkage tree into parallel fm/to lists and a parent -> children map.
# The node set is called all_nodes rather than `all`: binding `all` at notebook scope
# would hide the builtin all() for every cell executed afterwards in this kernel.
fms, tos, children, all_nodes, parent_to_children = [], [], set(), set(), {}
for edge in clusterer.single_linkage_tree_.to_networkx().edges():
    # Normalise both endpoints to plain ints before using them as keys.
    _fm_, _to_ = int(edge[0]), int(edge[1])
    fms.append(_fm_)
    tos.append(_to_)
    children.add(_to_)
    all_nodes.update((_fm_, _to_))
    parent_to_children.setdefault(_fm_, set()).add(_to_)
dfg  = pd.DataFrame({'fm':fms, 'to':tos})
# The root is a node that never appears as anyone's child.
root = next(iter(all_nodes - children))

def __place__(pos, node, x0, x1, y):
    """Top-down layout: centre `node` in the horizontal band [x0, x1] at height `y`,
    then split the band (minus a 5% margin on each side) evenly among its children,
    which are placed one level lower.

    Writes into `pos` in place and returns nothing.  Children are looked up in the
    notebook-level parent_to_children map.

    Known problem (whole bunch of nope): children do not get enough space, because
    the band is divided again at every level.
    """
    pos[node] = ((x0 + x1) / 2.0, y)
    kids = parent_to_children.get(node)
    if kids is None:
        return
    margin = (x1 - x0) * 0.05
    left   = x0 + margin
    right  = x1 - margin
    slot_w = (right - left) / len(kids)
    for kid in kids:
        __place__(pos, kid, left, left + slot_w, y - 1)
        left += slot_w

# Lay the whole tree out in the band [-10, 10] starting at height 0.
# NOTE: `pos` is rebuilt from scratch by the leaf-ordered layout further down this cell.
pos = {}
__place__(pos, root, -10.0, 10.0, 0)

def __leafWalk__(node):
    """Return the leaves below `node` in depth-first order.

    A node is a leaf when it has no entry in the notebook-level parent_to_children
    map; calling this on a leaf returns [node].  Children are visited in the
    iteration order of their set, which is the order a recursive walk would use.

    The walk uses an explicit stack instead of recursion: the depth of a linkage
    tree can approach the number of samples, which would exceed Python's recursion
    limit on larger inputs.
    """
    leaves  = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current in parent_to_children:
            # Push in reverse so the first child is popped (and therefore expanded) first.
            pending.extend(reversed(list(parent_to_children[current])))
        else:
            leaves.append(current)
    return leaves
# Bottom-up dendrogram layout: leaves sit on the baseline one unit apart, in walk order.
leaves_in_order = __leafWalk__(root)
pos = {leaf: (float(slot), 0) for slot, leaf in enumerate(leaves_in_order)}

# Walk the tree edges once; a parent is positioned as soon as every one of its
# children already has a position.
for _edge_ in clusterer.single_linkage_tree_.to_networkx().edges():
    _fm_, _to_ = int(_edge_[0]), int(_edge_[1])
    if _to_ not in pos:
        continue
    siblings = parent_to_children[_fm_]
    if any(sib not in pos for sib in siblings):
        continue
    # Extents over the edge's child plus all of the parent's children.
    x_min = min([pos[_to_][0]] + [pos[sib][0] for sib in siblings])
    y_max = max([pos[_to_][1]] + [pos[sib][1] for sib in siblings])
    # Parent goes one level above its tallest child, at the left-most child's x ...
    pos[_fm_] = (x_min, y_max + 1.0)
    # ... and every child is raised to the height of the tallest one.
    for sib in siblings:
        pos[sib] = (pos[sib][0], y_max)

print(leaves_in_order)
rt.linkNode(dfg, [('fm','to')], pos, w=900, h=900, draw_labels=True, node_size='small', link_arrow=False)

In [None]:
# Condensed tree as a DataFrame; show the 'child' ids of the rows whose child_size is 1
# (presumably the individual samples rather than sub-clusters -- confirm).
_df_ = clusterer.condensed_tree_.to_pandas()
_df_.loc[_df_['child_size'] == 1, 'child']

In [None]:
def __dist__(a,b):
    """Probe metric: print the two operands it is called with and return a constant 1.0."""
    print(a,b)
    return 1.0

# Four one-element tuples, so the printed operands show how the items reach the metric.
items = [(i,) for i in range(4)]
my_hdbscan = hdbscan.HDBSCAN(metric=__dist__)
my_hdbscan.fit(items)
my_hdbscan.labels_

In [None]:
# Total 'soct' per (sip, dip) pair, partitioned into one small frame per pair.
my_dict = df.group_by(['sip','dip']).agg(pl.col('soct').sum()).partition_by(['sip','dip'], as_dict=True)
# my_dict[('172.30.0.4','10.7.7.10')]['soct'][0] # example value lookup

# Plain (sip, dip) -> total lookup, built once so the metric does not index a
# DataFrame on every call.
_soct_lu_ = {_pair_: _part_['soct'][0] for _pair_, _part_ in my_dict.items()}

# Every address seen as source or destination.  Sorted so that the index <-> address
# mapping is the same on every run (plain set order is not stable across kernels).
items_as_str = sorted(set(df['sip']) | set(df['dip']), key=str)
# The clusterer only sees the index of each address, wrapped in a one-element tuple.
items        = [(i,) for i in range(len(items_as_str))]

# Distance reported for two addresses with no usable traffic between them.
_no_link_dist_ = 10.0

def __dist__(ai,bi):
    """Distance between two items, each given as a one-element sequence holding an index
    into items_as_str.

    Returns 0.0 for an item compared with itself, 1/total for a pair with a positive
    'soct' total (the (a, b) direction is preferred, then (b, a)), and
    _no_link_dist_ when the pair has no entry or its total is not positive -- the
    latter also avoids dividing by zero.
    """
    i, j = int(ai[0]), int(bi[0])
    if i == j:
        return 0.0
    a, b = items_as_str[i], items_as_str[j]
    if   (a,b) in _soct_lu_:
        total = _soct_lu_[(a,b)]
    elif (b,a) in _soct_lu_:
        total = _soct_lu_[(b,a)]
    else:
        return _no_link_dist_
    if total is None or total <= 0:
        return _no_link_dist_
    return 1.0 / total

my_hdbscan = hdbscan.HDBSCAN(metric=__dist__)
my_hdbscan.fit(items)
my_hdbscan.labels_