In [None]:
import cudf
import numpy as np
import pyarrow as pa
from math import ceil
import graphistry as g

In [None]:
df = cudf.read_csv('data/Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv')
pdf = df.to_pandas()

In [3]:
print(len(df))
print(df.columns)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [4]:
# help(g.hypergraph)

Help on function hypergraph in module graphistry.pygraphistry:

hypergraph(raw_events, entity_types=None, opts={}, drop_na=True, drop_edge_attrs=False, verbose=True, direct=False)
    Transform a dataframe into a hypergraph.
    
    :param Dataframe raw_events: Dataframe to transform
    :param List entity_types: Optional list of columns (strings) to turn into nodes, None signifies all
    :param Dict opts: See below
    :param bool drop_edge_attrs: Whether to include each row's attributes on its edges, defaults to False (include)
    :param bool verbose: Whether to print size information
    :param bool direct: Omit hypernode and instead strongly connect nodes in an event
    
    Create a graph out of the dataframe, and return the graph components as dataframes, 
    and the renderable result Plotter. It reveals relationships between the rows and between column values.
    This transform is useful for lists of events, samples, relationships, and other structured high-dimensional dat

In [5]:
# WARNING -- TAKES A LONG TIME AND 55+ GiB OF HOST MEMORY
h = g.hypergraph(pdf, opts={
    'EVENTID': 'Label',
})

# links 26157041
# events 331125
# attrib entities 2433374


In [6]:
print(h['nodes'])

         Dst Port     nodeTitle      type              nodeID  Protocol  \
0               0             0  Dst Port         Dst Port::0       NaN   
1              67            67  Dst Port        Dst Port::67       NaN   
2             137           137  Dst Port       Dst Port::137       NaN   
3            5355          5355  Dst Port      Dst Port::5355       NaN   
4        Dst Port      Dst Port  Dst Port  Dst Port::Dst Port       NaN   
...           ...           ...       ...                 ...       ...   
2764494  Dst Port  Label::Label     Label        Label::Label  Protocol   
2764495  Dst Port  Label::Label     Label        Label::Label  Protocol   
2764496  Dst Port  Label::Label     Label        Label::Label  Protocol   
2764497  Dst Port  Label::Label     Label        Label::Label  Protocol   
2764498  Dst Port  Label::Label     Label        Label::Label  Protocol   

         Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0              NaN            NaN 

In [7]:
print(h['edges'])

          Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp  \
0         52500000             0                 0  01/03/2018 08:17:11   
1         61000000             0                 0  01/03/2018 08:20:07   
2          7999725             0                 0  01/03/2018 08:17:18   
3         61000000             0                 0  01/03/2018 08:22:09   
4         61000000             0                 0  01/03/2018 08:24:11   
...            ...           ...               ...                  ...   
26157036  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
26157037  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
26157038  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
26157039  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
26157040  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   

          Pkt Len Std  Fwd Pkts/b Avg  Fwd Pkt Len Max  Fwd IAT Std  \
0                   0       

In [12]:
print(h['graph'])

{'bindings': {'edges':           Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp  \
  0         52500000             0                 0  01/03/2018 08:17:11   
  1         61000000             0                 0  01/03/2018 08:20:07   
  2          7999725             0                 0  01/03/2018 08:17:18   
  3         61000000             0                 0  01/03/2018 08:22:09   
  4         61000000             0                 0  01/03/2018 08:24:11   
  ...            ...           ...               ...                  ...   
  26157036  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
  26157037  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
  26157038  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
  26157039  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
  26157040  Idle Min  FIN Flag Cnt  Subflow Bwd Pkts            Timestamp   
  
            Pkt Len Std  Fwd Pkts/b Avg  Fwd Pkt Len

In [8]:
nodes_df = cudf.DataFrame.from_pandas(h['nodes'][['nodeID', 'type']])
nodes_df = cudf.DataFrame({
    'name': nodes_df['nodeID'].astype('category'),
    'type': nodes_df['type'].astype('category'),
})

edges_df = cudf.DataFrame.from_pandas(h['edges'][['attribID', 'Label']])
edges_df = cudf.DataFrame({
    'src': edges_df['attribID'].astype('category'),
    'dst': edges_df['Label'].astype('category')
})

In [None]:
# In case we need to pause and write to disk to avoid OOM'ing cuDF
# def write_arrow(df, path):
#     table = df.to_arrow()
#     writer = pa.RecordBatchStreamWriter(path, table.schema)
#     writer.write_table(table)
# #     writer.write_table(pa.Table.from_batches(table.to_batches(max_chunksize=len(table) / 1000)))
#     writer.close()

# write_arrow(nodes_df, 'data/Thursday-01-03-2018_TrafficForML_CICFlowMeter.nodes.arrow')
# write_arrow(edges_df, 'data/Thursday-01-03-2018_TrafficForML_CICFlowMeter.edges.arrow')

In [10]:
def combine_cats(*cols):
    cols = [col.astype('category') for col in cols]
    cats = cudf \
        .concat([col.cat.categories for col in cols]) \
        .to_series().drop_duplicates(ignore_index=True) \
        ._column
    codes_dtype = np.find_common_type([col.cat.codes.dtype for col in cols], [])
    cols = [
        col.cat._set_categories(
            col.cat.categories, cats, is_unique=True
        ) for col in cols
    ]
    return [
        cudf.Series(cudf.core.column.build_categorical_column(
            size=col.size,
            offset=col.offset,
            mask=col.base_mask,
            ordered=col.dtype.ordered,
            categories=col.cat().categories,
            codes=col.cat().codes.astype(codes_dtype)
        )) for col in cols
    ]

node_indices_col = cudf.core.index.RangeIndex(0, len(nodes_df))
(
    src_col,
    dst_col,
    node_name_col
) = combine_cats(edges_df['src'], edges_df['dst'], nodes_df['name'])

src_df = cudf.DataFrame({ 'src': node_name_col, 'id': node_indices_col })
src_df = cudf.DataFrame({ 'src': src_col }) \
    .merge(src_df, on='src', how='left') \
    .rename({'id': 'src', 'src': 'node'})

print(src_df.head())

dst_df = cudf.DataFrame({ 'dst': node_name_col, 'id': node_indices_col })
dst_df = cudf.DataFrame({ 'dst': dst_col }) \
    .merge(dst_df, on='dst', how='left') \
    .rename({'id': 'dst', 'dst': 'node'})

print(dst_df.head())

def type_to_color(types):
    color_indices = cudf.Series(types.cat.codes)
    color_palette = cudf.Series([
        -12451426,-11583787,-12358156,-10375427,
        -7610114,-4194305,-6752794,-5972565,
        -5914010,-4356046,-6140066
    ])
    color_palettes = []
    num_color_ids = color_indices.max() + 1
    for i in range(ceil(num_color_ids / len(color_palette))):
        color_palettes.append(color_palette)
    return cudf.Series(cudf.core.column.build_categorical_column(
        ordered=True,
        codes=color_indices._column,
        categories=cudf.concat(color_palettes)[:num_color_ids],
    ).as_numerical_column(dtype=np.int32))

nodes_df = cudf.DataFrame({
    'id': node_indices_col,
    'name': node_name_col,
    'type': nodes_df['type'],
    'color': type_to_color(nodes_df['type'])
})

edges_df = cudf.DataFrame({
    'src': src_df['src'],
    'dst': dst_df['dst'],
})

print(nodes_df.head())
print(edges_df.head())