In [None]:
import pandas as pd
import polars as pl
import numpy  as np
import random
import time
import sys
# Put the relative '../rtsvg' directory on the module search path (at index 1)
# before importing from rtsvg.
sys.path.insert(1, '../rtsvg')
from rtsvg import *
# Single RACETrack instance shared by the cells below (the name is presumably
# supplied by the star import above -- TODO confirm).
rt = RACETrack()

In [None]:
# Parity check: apply every transform to the same single timestamp in a polars
# frame and a pandas frame, and report any transform whose results differ.
_transforms_ = ['day_of_week','day_of_week_hour','year','year_quarter','quarter','month','year_month',
                'year_month_day','day','day_of_year','day_of_year_hour','hour','minute','second']
_timestamps_ = ['2023-04-03T14:32:01', '1970-02-03 03:12:31', '1983-12-31 02:10', '2004', '2010-09', '2015-11-03']
for _timestamp_ in _timestamps_:
    # One-row frame per backend, with 'ts' passed through rt.columnsAreTimestamps
    df_pl = rt.columnsAreTimestamps(pl.DataFrame({'ts':[_timestamp_]}), 'ts')
    df_pd = rt.columnsAreTimestamps(pd.DataFrame({'ts':[_timestamp_]}), 'ts')
    for _transform_ in _transforms_:
        tfield = rt.createTField('ts', _transform_)
        df_pl,_pl_tfield_ = rt.applyTransform(df_pl, tfield)
        df_pd,_pd_tfield_ = rt.applyTransform(df_pd, tfield)

        # Compare the string form of the first row's transformed value
        _pl_str_, _pd_str_ = str(df_pl[0][_pl_tfield_][0]), str(df_pd.iloc[0][_pd_tfield_])
        error_str = 'ERROR!!!' if _pl_str_ != _pd_str_ else ''
        if error_str:
            print(f'{tfield:32}{_pl_str_:16}{_pd_str_:16}{error_str:16}')

        # Only the lengths of the two natural-order lists are compared
        pl_list, pd_list = rt.transformNaturalOrder(df_pl, tfield), rt.transformNaturalOrder(df_pd, tfield)
        if len(pl_list) != len(pd_list):
            print('Error - len(pl_list) != len(pd_list) for', _transform_)

In [None]:
# Build a list of random ISO-style timestamp strings and report how long it took.
ts0 = time.time()
_sz_ = 10000
# Inclusive (low, high) bounds in draw order: year, month, day, hour, minute, second.
# The day is capped at 28 so every generated date exists in every month.
_bounds_ = ((2000,2025), (1,12), (1,28), (0,23), (0,59), (0,59))
timestamps = []
for i in range(_sz_):
    _yer_, _mon_, _day_, _hor_, _min_, _sec_ = (random.randint(_lo_, _hi_) for _lo_, _hi_ in _bounds_)
    _ts_ = f'{_yer_:04}-{_mon_:02}-{_day_:02}T{_hor_:02}:{_min_:02}:{_sec_:02}'
    timestamps.append(_ts_)
ts1 = time.time()
print(f'randomized dataset creation time = {(ts1-ts0):0.2f} seconds ... length={len(timestamps)}')

In [None]:
#
# Times each transform on the `timestamps` list for pandas and polars.
# Each timing includes dataframe creation and timestamp conversion.
#
# Recorded results from a run with 10M rows:
#
#                       Pandas    Polars
# day_of_week           32.70s     5.43s
# day_of_week_hour      34.27s     4.06s
# year                  17.24s     3.59s
# year_quarter          20.25s     4.42s
# quarter               16.50s     3.82s
# month                 29.47s     2.96s
# year_month            15.06s     3.40s
# year_month_day        14.42s     3.79s
# day                   14.10s     3.26s
# day_of_year           12.67s     3.08s
# day_of_year_hour      15.64s     3.43s
# hour                  12.81s     3.06s
# minute                13.74s     3.13s
# second                12.27s     3.06s

print('                      Pandas    Polars')
for _transform_ in _transforms_:
    tfield = rt.createTField('ts', _transform_)

    # Polars: build frame -> convert 'ts' -> apply transform
    ts0_pl = time.time()
    df_pl,_pl_tfield_ = rt.applyTransform(rt.columnsAreTimestamps(pl.DataFrame({'ts':timestamps}), 'ts'), tfield)
    ts1_pl = time.time()

    # Pandas: same three steps
    ts0_pd = time.time()
    df_pd,_pd_tfield_ = rt.applyTransform(rt.columnsAreTimestamps(pd.DataFrame({'ts':timestamps}), 'ts'), tfield)
    ts1_pd = time.time()

    _pd_secs_, _pl_secs_ = ts1_pd - ts0_pd, ts1_pl - ts0_pl
    print(f'{_transform_:20}  {_pd_secs_:0.2f}s     {_pl_secs_:0.2f}s')


In [None]:
# Columns of pure and mixed python types. Only the pure columns are compared below.
# Careful with the 'as_mix*' columns -- polars drops the numbers (by default).
_dictionary_ = {
    'as_str':   ['a',  'b',  'c',   'd'],
    'as_int':   [1,    8,    3,     -4],
    'as_float': [1.2,  3.1,  10.2,  1e8],
    'as_mix1':  ['a',  1,    2.3,   'b'],
    'as_mix2':  [1,    2,    3,     'a'],
    'as_mix3':  [1,    2.3,  'a',   'b'],
}
df_pl = pl.DataFrame(_dictionary_) # mixed appears to default to string with nulls in place of the numbers...
df_pd = pd.DataFrame(_dictionary_)
for _column_ in ('as_str', 'as_int', 'as_float'):
    # Both helpers must give the same answer for the polars and pandas frames
    for _check_ in (rt.countBySet, rt.fieldIsArithmetic):
        if _check_(df_pl, _column_) != _check_(df_pd, _column_):
            print('Error for ', _column_)

In [None]:
# Compare rt.colorRenderOrder between pandas and polars for every
# (color_by, count_by, count_by_set) combination; print any row that differs.
_dictionary_ = {'color':    ['red', 'red', 'red', 'red', 'red', 'blue', 'blue', 'blue', 'yellow', 'orange', 'orange', 'pink', 'pink'],
                'by_set':   ['a',   'b',   'a',   'b',   'a',   'c',    'd',    'e',    'a',      'a',      'a',      'a',    'b'],
                'by_int':   [10,    20,    1,     30,    10,     1,     1,      2,      40,       15,       1,        3,      12],
                'by_float': [1.2,   1.4,   0.2,   1.9,   8.1,    9.22,  10.3,   14.5,   0.2,      0.93,     3.981,    40,     10]}
df_pl = pl.DataFrame(_dictionary_)
df_pd = pd.DataFrame(_dictionary_)

# Every column name plus None. None is skipped for color_by but used for count_by.
columns_to_include_none = [*df_pd.columns, None]
_columns_ = 0  # number of combinations visited
for _color_by_ in columns_to_include_none:
    if _color_by_ is not None:
        for _count_by_ in columns_to_include_none:
            for _by_set_ in (True, False):
                _columns_ += 1
                pd_order = rt.colorRenderOrder(df_pd, _color_by_, _count_by_, _by_set_)
                pl_order = rt.colorRenderOrder(df_pl, _color_by_, _count_by_, _by_set_)
                if len(pd_order) != len(pl_order):
                    print('Error -- Different Lengths!')
                    continue
                # Row-by-row: the pandas index/value must match the polars 'index'/'count' columns
                for i in range(len(pd_order)):
                    if not (pd_order.index[i] == pl_order['index'][i] and
                            pd_order.iloc[i]  == pl_order['count'][i]):
                        print(_color_by_, _count_by_, i, pd_order.index[i], pl_order['index'][i], pd_order.iloc[i], pl_order['count'][i])

In [None]:
# Draw one colorized bar per combination for each backend: the pandas bar is
# placed at y=_column_h_ and the polars bar at y=8 + 2*_column_h_, in the same
# x column, so the two can be compared by eye.
_column_i_ = 0
_column_w_ = 10
_column_h_ = 100
_svg_parts_ = [f'<svg width="{_column_w_ * _columns_}" height="{2*_column_h_+10}">']
for _color_by_ in columns_to_include_none:
    if _color_by_ is not None:
        for _count_by_ in columns_to_include_none:
            for _by_set_ in (True, False):
                _x_ = _column_i_ * _column_w_
                pd_order = rt.colorRenderOrder(df_pd, _color_by_, _count_by_, _by_set_)
                _svg_parts_.append(rt.colorizeBar(df_pd, pd_order, _color_by_, _count_by_, _by_set_,
                                                  _x_, _column_h_, _column_h_, _column_w_, False))
                pl_order = rt.colorRenderOrder(df_pl, _color_by_, _count_by_, _by_set_)
                _svg_parts_.append(rt.colorizeBar(df_pl, pl_order, _color_by_, _count_by_, _by_set_,
                                                  _x_, 8 + 2*_column_h_, _column_h_, _column_w_, False))
                _column_i_ += 1
_svg_parts_.append('</svg>')
_svg_ = ''.join(_svg_parts_)
rt.displaySVG(_svg_)

In [None]:
# Render box plots for normally distributed samples of increasing size with
# both pandas and polars, side by side, and accumulate the render time per backend.
_sizes_    = (4, 100, 10000, 100000)          # rows per generated dataset
_styles_   = ('boxplot', 'boxplot_w_swarm')   # styles passed to rt.renderBoxPlotColumn
_columns_  = len(_sizes_) * len(_styles_)
_column_i_ = 0
_column_w_ = 40
_column_h_ = 200
time_total_pd, time_total_pl = 0, 0
# Each (size, style) pair takes two columns: pandas first, polars second.
_svg_      = f'<svg width="{2 * _column_w_ * _columns_}" height="{_column_h_+10}">'
for n in _sizes_:
    _avg_ = 1000 + random.random() * 500
    _std_ = random.random() * 200
    _xs_  = np.random.default_rng().normal(_avg_, _std_, n)
    _cos_ = ['red' if x > _avg_ else 'blue' for x in _xs_]
    df_pd = pd.DataFrame({'x':_xs_, 'c':_cos_})
    df_pl = pl.DataFrame({'x':_xs_, 'c':_cos_})

    # Compute the maximum once per dataset. Previously max(_xs_) was re-evaluated
    # inside every yT() call and in each render call, scanning up to 100,000
    # elements in Python each time and inflating both backends' timings.
    _x_max_ = _xs_.max()

    def yT(j):
        """Map data value j to an SVG y coordinate: 0 -> 5 + _column_h_, _x_max_ -> 5."""
        return 5 + _column_h_ - _column_h_ * j/_x_max_

    for _style_ in _styles_:
        # The default data color is switched before each render
        # (presumably so the two backends can be told apart -- TODO confirm).
        # NOTE: the lookup is left at '#000000' when this cell finishes.
        rt.co_mgr.type_color_lu['data']['default'] = '#ffffff'
        ts0_pd = time.time()
        _svg_ += rt.renderBoxPlotColumn(_style_, df_pd, _column_i_ * _column_w_ + _column_w_/2, 
                                        yT, _x_max_, 0.0, _column_w_ - 20, 'x', 'c', 100)
        ts1_pd = time.time()
        rt.co_mgr.type_color_lu['data']['default'] = '#000000'
        ts0_pl = time.time()
        _svg_ += rt.renderBoxPlotColumn(_style_, df_pl, (_column_i_+1) * _column_w_ + _column_w_/2, 
                                        yT, _x_max_, 0.0, _column_w_ - 20, 'x', 'c', 100)
        ts1_pl = time.time()

        time_total_pd += (ts1_pd - ts0_pd)
        time_total_pl += (ts1_pl - ts0_pl)
        _column_i_ += 2
_svg_ += '</svg>'
print('PD Time =', time_total_pd, ' | PL Time =', time_total_pl)
rt.displaySVG(_svg_)

In [None]:
#
# The week-specific group-by misbehaves... so this tests out variations of how to make it work...
#
# Synthetic data: 364 consecutive days starting 2023-01-01, three rows per day
# (red / green / blue), with counts that follow sine/cosine curves.
# (sin, cos and pi are presumably provided by the rtsvg star import -- TODO confirm)
#
_ts_, _td_, d = pd.to_datetime('2023-01-01'), pd.Timedelta(days=1), 0.0
timestamps, colors, counts = [], [], []
for i in range(364):
    for _color_, _count_ in (('red',   2.4 + sin(d)),
                             ('green', 2.8 + cos(d)),
                             ('blue',  3   + cos(d) + 2*sin(d))):
        timestamps.append(_ts_)
        counts.append(_count_)
        colors.append(_color_)
    d    += pi/16
    _ts_ += _td_
df_pl = pl.DataFrame(pd.DataFrame({'timestamp':timestamps, 'color':colors, 'count':counts}))

# Take the true first/last timestamps from the sorted frame. Previously the
# sorted frame was computed but the ends were read from the unsorted one,
# which is only right when the rows already arrive in time order.
_sorted_ = df_pl.sort('timestamp')
_ts_min_, _ts_max_ = _sorted_['timestamp'][0], _sorted_['timestamp'][-1]
print(_ts_min_, ' -> ', _ts_max_, ' [ACTUAL]')
df_ends  = pl.DataFrame({'ts':[_ts_min_, _ts_max_]}).sort('ts')

print()
# Record the first and last weekly group keys produced for the two end points.
_ends_first_, _ends_last_ = None, None
for k, k_df in df_ends.group_by_dynamic('ts', every='1w'):
    # NOTE(review): newer polars releases yield the group key as a 1-tuple;
    # unwrap it so the subtraction below works either way -- confirm against
    # the installed polars version.
    _key_ = k[0] if isinstance(k, tuple) else k
    if _ends_first_ is None:
        _ends_first_ = _key_
    _ends_last_ = _key_

print(_ends_first_, ' -> ', _ends_last_, ' [ENDS, GB CALC]')

# Upsample weekly, offset by the gap between the first weekly group key and the first actual timestamp.
_upsampled_             = df_ends.upsample('ts', every='1w', offset=(_ends_first_ - _ts_min_))
_first_, _last_, _bins_ = _upsampled_['ts'][0], _upsampled_['ts'][-1], len(_upsampled_)
print()
print(_first_, ' -> ', _last_, _bins_)