---

<!-- <a href="https://github.com/rraadd88/roux/blob/master/examples/roux_lib_df.ipynb"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
 -->
 
## Fast processing.

## Tables

### Test data

In [None]:
# Download the NYC parking-violations 2022 parquet file to /tmp, where the read_parquet cells below expect it.
!wget https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -O /tmp/nyc_parking_violations_2022.parquet

#### CPU

In [1]:
import pandas as pd

In [2]:
%%time

df = pd.read_parquet(
    "/tmp/nyc_parking_violations_2022.parquet",
    columns=["Registration State", "Violation Description", "Vehicle Body Type", "Issue Date", "Summons Number"]
)

(df[["Registration State", "Violation Description"]]
 .value_counts()
 .groupby("Registration State")
 .head(1)
 .sort_index()
 .reset_index()
)

CPU times: user 7.54 s, sys: 1.51 s, total: 9.05 s
Wall time: 5 s


Unnamed: 0,Registration State,Violation Description,count
0,99,,17550
1,AB,14-No Standing,22
2,AK,PHTO SCHOOL ZN SPEED VIOLATION,125
3,AL,PHTO SCHOOL ZN SPEED VIOLATION,3668
4,AR,PHTO SCHOOL ZN SPEED VIOLATION,537
...,...,...,...
62,VT,PHTO SCHOOL ZN SPEED VIOLATION,3024
63,WA,21-No Parking (street clean),3732
64,WI,14-No Standing,1639
65,WV,PHTO SCHOOL ZN SPEED VIOLATION,1185


In [None]:
%%time

(df
 .groupby(["Vehicle Body Type"])
 .agg({"Summons Number": "count"})
 .rename(columns={"Summons Number": "Count"})
 .sort_values(["Count"], ascending=False)
)

In [None]:
%%time

weekday_names = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

df["Issue Date"] = df["Issue Date"].astype("datetime64[ms]")
df["issue_weekday"] = df["Issue Date"].dt.weekday.map(weekday_names)

df.groupby(["issue_weekday"])["Summons Number"].count().sort_values()

#### GPU

In [3]:
# Restart the kernel before the GPU section.
# NOTE(review): presumably so that cudf.pandas can be loaded before pandas is imported — confirm.
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [2]:
# Load the cudf.pandas IPython extension ahead of the `import pandas as pd` in the next cell.
%load_ext cudf.pandas

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [3]:
%%time

import pandas as pd

df = pd.read_parquet(
    "/tmp/nyc_parking_violations_2022.parquet",
    columns=["Registration State", "Violation Description", "Vehicle Body Type", "Issue Date", "Summons Number"]
)

(df[["Registration State", "Violation Description"]]
 .value_counts()
 .groupby("Registration State")
 .head(1)
 .sort_index()
 .reset_index()
)

CPU times: user 842 ms, sys: 871 ms, total: 1.71 s
Wall time: 1.75 s


Unnamed: 0,Registration State,Violation Description,count
0,99,,17550
1,AB,14-No Standing,22
2,AK,PHTO SCHOOL ZN SPEED VIOLATION,125
3,AL,PHTO SCHOOL ZN SPEED VIOLATION,3668
4,AR,PHTO SCHOOL ZN SPEED VIOLATION,537
...,...,...,...
62,VT,PHTO SCHOOL ZN SPEED VIOLATION,3024
63,WA,21-No Parking (street clean),3732
64,WI,14-No Standing,1639
65,WV,PHTO SCHOOL ZN SPEED VIOLATION,1185


In [None]:
%%time

(df
 .groupby(["Vehicle Body Type"])
 .agg({"Summons Number": "count"})
 .rename(columns={"Summons Number": "Count"})
 .sort_values(["Count"], ascending=False)
)

In [None]:
%%time

weekday_names = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

df["Issue Date"] = df["Issue Date"].astype("datetime64[ms]")
df["issue_weekday"] = df["Issue Date"].dt.weekday.map(weekday_names)

df.groupby(["issue_weekday"])["Summons Number"].count().sort_values()

In [None]:
# Raises SystemExit with status 0 at this point in the notebook.
# NOTE(review): presumably a deliberate stop so a "Run All" does not continue into the sections below — confirm.
import sys
sys.exit(0)

### Test data (synthetic)

In [None]:
import pandas as pd
import numpy as np
import random
import string
import time

def generate_synthetic_pandas_data(
    num_rows,
    num_cols,
    include_strings=True,
    include_datetimes=True,
    seed=None,
):
    """
    Generates a synthetic pandas DataFrame for performance testing.

    Besides the requested numerical columns, the frame always contains a
    low-cardinality integer column ``category_id`` (values 0-49) and a
    boolean column ``is_active``.

    Args:
        num_rows (int): The number of rows for the DataFrame.
        num_cols (int): The base number of numerical columns. Column ``i`` is
            named ``int_col_<i>`` when ``i`` is even and ``float_col_<i>``
            when ``i`` is odd.
        include_strings (bool): Whether to include ``string_col``, a column of
            random 10-character alphanumeric strings.
        include_datetimes (bool): Whether to include ``datetime_col``, random
            timestamps between 2020-01-01 and 2025-12-31.
        seed (int | None): Seed for the random generator. With the same seed
            and arguments the function returns identical data; ``None``
            (default) gives different data on every call, as before.

    Returns:
        pd.DataFrame: A synthetic pandas DataFrame.
    """
    print(f"Generating a pandas DataFrame with {num_rows} rows...")
    start_time = time.time()

    # A local generator makes the output reproducible when seeded and leaves
    # the global numpy / random state untouched.
    rng = np.random.default_rng(seed)

    data = {}

    # Numerical columns (integers and floats), alternating by position.
    for i in range(num_cols):
        if i % 2 == 0:
            data[f'int_col_{i}'] = rng.integers(0, 100000, size=num_rows)
        else:
            data[f'float_col_{i}'] = rng.random(num_rows) * 1000

    # Categorical/low-cardinality integer column: 50 unique categories.
    data['category_id'] = rng.integers(0, 50, size=num_rows)

    # Boolean column
    data['is_active'] = rng.choice([True, False], size=num_rows)

    if include_strings:
        # String column (e.g., product codes, names): fixed-length random
        # strings, drawn as a (num_rows, string_length) grid of characters
        # and joined row by row.
        string_length = 10
        alphabet = np.array(list(string.ascii_letters + string.digits))
        char_grid = rng.choice(alphabet, size=(num_rows, string_length))
        data['string_col'] = [''.join(row) for row in char_grid]

    if include_datetimes:
        # Datetime column: a uniform random offset in seconds added to the
        # start of the range.
        start_date = pd.to_datetime('2020-01-01')
        end_date = pd.to_datetime('2025-12-31')
        time_diff = (end_date - start_date).total_seconds()
        random_seconds = rng.random(num_rows) * time_diff
        data['datetime_col'] = pd.to_datetime(start_date.timestamp() + random_seconds, unit='s')

    df = pd.DataFrame(data)
    end_time = time.time()
    print(f"Pandas DataFrame generation completed in {end_time - start_time:.2f} seconds.")
    print(f"DataFrame memory usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    # Only the header is printed here; the frame itself is returned for the
    # caller to display.
    print("\nSample DataFrame Head:")
    return df

# NOTE(review): `df` is not assigned anywhere in this cell, so this displays whatever `df`
# an earlier cell left in the kernel (and raises NameError on a fresh kernel).
df.head(1)

In [None]:
# Location of the synthetic table on disk; the next cell writes it there when the file is missing.
p='tests/input/roux_lib_df/large.pqt'

In [None]:
from pathlib import Path

from roux.lib.io import read_table, to_table

if Path(p).exists():
    # Reuse the cached table so `df` is defined on every run, not only on the
    # run that generates the file.
    df = read_table(p)
else:
    df = generate_synthetic_pandas_data(
        num_rows=1000_000,
        num_cols=1,
        include_strings=True,
        include_datetimes=True,
    )
    # Cache the generated frame at `p` for the CPU/GPU sections below.
    to_table(
        df,
        p,
    )
df.head(1)

#### CPU

In [None]:
# Restart the kernel before the CPU timing; the next cell re-creates `p` and `df` from disk.
get_ipython().kernel.do_shutdown(restart=True)

In [None]:
from roux.lib.io import read_table

# Reload the synthetic table from disk after the kernel restart.
p = 'tests/input/roux_lib_df/large.pqt'
df = read_table(p)
df.head(1)

In [None]:
from roux.lib.log import Logger
logging = Logger()

# The value returned by the 'start' call is handed back to the 'end' call via time=.
_time_start = logging.info('start', time=True)

# Keep the rows of every category whose int_col_0 total is positive, then
# count the is_active values among the kept rows.
active_counts = (
    df.groupby('category_id')
    .filter(lambda group: group['int_col_0'].sum() > 0)
    ['is_active']
    .value_counts()
)
print(active_counts.head())

logging.info('end', time=_time_start)

#### GPU

In [None]:
# Restart the kernel before the GPU timing; the next cell installs cudf.pandas and then imports pandas.
get_ipython().kernel.do_shutdown(restart=True)

In [None]:
# Install the cudf.pandas accelerator first, then import pandas.
import cudf.pandas
cudf.pandas.install()

# Print the module object; the next cell looks for 'fast=cudf' in this text
# to decide whether the accelerated pandas is active.
import pandas as pd
print(pd)

In [None]:
p='tests/input/roux_lib_df/large.pqt'
if 'fast=cudf' in str(pd):
    # Accelerated pandas is active: read straight into a cudf DataFrame.
    import cudf
    df=cudf.read_parquet(p, engine="pyarrow")
else:
    # Without the accelerator `df` would be undefined after the kernel restart
    # and the next line would raise NameError; fall back to plain pandas and
    # say so, because the timing below is then a CPU measurement.
    print("cudf.pandas is not active; reading with plain pandas (CPU).")
    df=pd.read_parquet(p)
df.head(1)

In [None]:
from roux.lib.log import Logger
logging = Logger()

# The value returned by the 'start' call is handed back to the 'end' call via time=.
_time_start = logging.info('start', time=True)

# One boolean per category: is the int_col_0 total positive?
# NOTE(review): the CPU cell times .filter(...)['is_active'].value_counts() instead,
# so the two timings do not measure the same operation.
category_flags = df.groupby('category_id').apply(lambda group: group['int_col_0'].sum() > 0)
print(category_flags.head())

logging.info('end', time=_time_start)

## Documentation
[`roux.lib.fast`](https://github.com/rraadd88/roux#module-rouxlibfast)