In [None]:
#
# Replicates xy creation for polars dataframe
# ... amd 7900x / 32G ram
# ... Python 3.11.6
#
import polars as pl
import numpy as np
import time
from rtsvg import *
rt = RACETrack()

# The three CSV chunks of the netflow capture (46,138,310 records in total).
chunk_paths = ('../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk1.csv',
               '../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk2.csv',
               '../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk3.csv')

# Stage 1 (ts0 -> ts1): read every chunk and stack them into one frame.
ts0 = time.time()
df = pl.concat([pl.read_csv(chunk_path) for chunk_path in chunk_paths])
ts1 = time.time()

# Stage 2 (ts1 -> ts2): hand 'parsedDate' to rt.columnsAreTimestamps -- the
# "column fix" step reported in the print below.
df = rt.columnsAreTimestamps(df, 'parsedDate')
ts2 = time.time()

# Recorded runs:
#   load   | fix time
#   =======|=========
#   13.0s  | 0.5s
#    7.0s  | 0.6s
#    6.8s  | 0.5s
print(f'Load time = {ts1-ts0:0.2f}s | Column fix time = {ts2-ts1:0.2f}s | {len(df)}')

In [None]:
#
# Axis Creation Time
#
# Time rt.xyCreateAxisColumn once per source column. Each call returns the
# frame plus seven axis descriptors and is given the name of the column it
# should produce ('x_norm' / 'y_norm' / 'z_norm').
# NOTE(review): the meaning of the third positional argument (False) is not
# visible from this notebook -- confirm against the rtsvg API.

# x axis <- 'parsedDate'
ts3 = time.time()
(df, x_is_time, x_label_min, x_label_max,
 xT, x_order, x_min, x_max) = rt.xyCreateAxisColumn(df, 'parsedDate', False, 'x_norm')

# y axis <- 'firstSeenSrcIp'
ts4 = time.time()
(df, y_is_time, y_label_min, y_label_max,
 yT, y_order, y_min, y_max) = rt.xyCreateAxisColumn(df, 'firstSeenSrcIp', False, 'y_norm')

# z axis <- 'firstSeenSrcPayloadBytes'
ts5 = time.time()
(df, z_is_time, z_label_min, z_label_max,
 zT, z_order, z_min, z_max) = rt.xyCreateAxisColumn(df, 'firstSeenSrcPayloadBytes', False, 'z_norm')
ts6 = time.time()

# Recorded runs ... categoricals take the longest:
#   x-axis | y-axis | z-axis
#   =======|========|=======
#   0.37s  | 2.49s  | 1.21s
#   0.40s  | 2.55s  | 1.25s
#   0.37s  | 2.51s  | 1.29s
print(f'x_axis = {ts4-ts3:0.2f}s | y_axis = {ts5-ts4:0.2f}s | z_axis = {ts6-ts5:0.2f}s')

In [None]:
# Map the normalized axis columns onto pixel coordinates and time each axis.

# x: scale by 512, offset by 10, then cast the new column to Int32.
ts7 = time.time()
df = (df.with_columns((10 + pl.col('x_norm')*512).alias("x_norm_px"))
        .with_columns([pl.col('x_norm_px').cast(pl.Int32)]))

# y: same scale / offset / cast as x.
ts8 = time.time()
df = (df.with_columns((10 + pl.col('y_norm')*512).alias("y_norm_px"))
        .with_columns([pl.col('y_norm_px').cast(pl.Int32)]))

# z: flipped (522 - value*512) and left without the Int32 cast, so its timing
# shows the cost of the arithmetic alone.
ts9 = time.time()
df = df.with_columns((522 - pl.col('z_norm')*512).alias("z_norm_px"))
ts10 = time.time()

# Recorded run ... pixel alignment (the cast on x / y) takes twice as long:
#   x-axis | y-axis | z-axis
#   =======|========|=======
#   0.25s  | 0.25s  | 0.12s
print(f'x_axis = {ts8-ts7:0.2f}s | y_axis = {ts9-ts8:0.2f}s | z_axis = {ts10-ts9:0.2f}s')

In [None]:
# Benchmark: group the frame by pixel coordinate and iterate over every group.
# The two phases are timed separately (ts11 -> ts12 for the group_by call,
# ts12 -> ts13 for the iteration).
ts11 = time.time()
gb = df.group_by(['x_norm_px','y_norm_px'])
ts12 = time.time()
# The loop body is intentionally empty: only the cost of producing each
# (key, sub-frame) pair is being measured.
for k, k_df in gb:
    pass
ts13 = time.time()
# Recorded runs -- the group_by call itself is near-instant; almost all of the
# time is spent iterating the groups.
# groupby | loop
# ========|=======
# 0.01s   | 5.58s
# 0.00s   | 5.51s
print(f'groupby = {ts12-ts11:0.2f} | loop = {ts13-ts12:0.2f}s')

In [None]:
# Benchmark: split the frame into one sub-frame per pixel with partition_by
# (returned as a dict), then walk the dict to count the pixels.
pixel_cols = ['x_norm_px','y_norm_px']

ts14 = time.time()
pb = df.partition_by(pixel_cols, as_dict=True)
ts15 = time.time()

# One dict entry per distinct (x_norm_px, y_norm_px) pair.
pixels = 0
for px_key in pb:
    pixels += 1
ts16 = time.time()

# Recorded runs:
#   partition_by | loop
#   =============|=======
#    2.55s       | 0.00s
#   12.29s       | 0.00s   <- re-run always takes +10s more... don't know why...
#    2.56s       | 0.00s   (107,563 pixels)
#   12.38s       | 0.00s
print(f'partition_by = {ts15-ts14:0.2f} | loop = {ts16-ts15:0.2f} | px = {pixels}')

In [None]:
# Benchmark: aggregate to one row per pixel (group_by + count) instead of
# materializing a sub-frame per pixel, then walk the rows to count the pixels.
# NOTE(review): recent polars releases deprecate pl.count() in favour of
# pl.len() -- confirm against the installed version before changing it.
pixel_cols = ['x_norm_px','y_norm_px']

ts17 = time.time()
px_only = df.group_by(pixel_cols).agg(pl.count())
ts18 = time.time()

# One aggregated row per distinct (x_norm_px, y_norm_px) pair.
pixels = 0
for row_idx in range(len(px_only)):
    pixels += 1
ts19 = time.time()

# Recorded runs:
#   group_by | loop
#   =========|=======
#   0.19s    | 0.00s   (107,563 pixels)
#   0.18s    | 0.00s
print(f'group_by = {ts18-ts17:0.2f} | loop = {ts19-ts18:0.2f} | px = {pixels}')