In [2]:
# !pip install fireducks polars modin[ray]

In [14]:
import numpy as np
import pandas as pd
import fireducks.pandas as fd
import modin.pandas as md
import polars as pl
import time

np.random.seed(42)

num_rows = 1_000_000
data = {
    'A': np.random.rand(num_rows),
    'B': np.random.rand(num_rows),
    'C': np.random.choice([1, 2, 3, np.nan], size=num_rows)
}
data['C'][data['C'] < 0.1] = np.nan

# Save data to a CSV file for benchmarking
df_initial = pd.DataFrame(data)
df_initial.to_csv('benchmark_data.csv', index=False)

def time_function(func):
    start_time = time.time()
    func()
    return time.time() - start_time

def load_data():
    df_pd = pd.read_csv('benchmark_data.csv')
    df_pl = pl.read_csv('benchmark_data.csv')
    df_md = md.read_csv('benchmark_data.csv')
    df_fd = fd.read_csv('benchmark_data.csv')
    return df_pd, df_pl, df_md, df_fd

def drop_na(df):
    if isinstance(df, pd.DataFrame):
        return df.dropna()
    elif isinstance(df, pl.DataFrame):
        return df.drop_nulls()
    elif isinstance(df, md.DataFrame):
        return df.dropna()
    elif isinstance(df, fd.DataFrame):
        return df.dropna()

def filter_data(df):
    if isinstance(df, pd.DataFrame) or isinstance(df, md.DataFrame) or isinstance(df, fd.DataFrame):
        return df[df['C'] > 1]
    elif isinstance(df, pl.DataFrame):
        return df.filter(pl.col('C') > 1)

def sort_data(df):
    if isinstance(df, pd.DataFrame) or isinstance(df, md.DataFrame) or isinstance(df, fd.DataFrame):
        return df.sort_values(by='A')
    elif isinstance(df, pl.DataFrame):
        return df.sort('A')

def benchmark():
    df_pd, df_pl, df_md, df_fd = load_data()

    libraries = {
        "Pandas": df_pd,
        "Polars": df_pl,
        "Modin": df_md,
        "FireDucks": df_fd,
    }

    results = {
        "Library": [],
        "Read Time (s)": [],
        "Drop NaN Time (s)": [],
        "Filter Time (s)": [],
        "Sort Time (s)": [],
    }

    for name, df in libraries.items():
        read_time = time_function(lambda: load_data())
        results["Library"].append(name)
        results["Read Time (s)"].append(read_time)

        dropna_time = time_function(lambda: drop_na(df))
        results["Drop NaN Time (s)"].append(dropna_time)

        filter_time = time_function(lambda: filter_data(df))
        results["Filter Time (s)"].append(filter_time)

        sort_time = time_function(lambda: sort_data(df))
        results["Sort Time (s)"].append(sort_time)

    df_results = pd.DataFrame(results)
    return df_results

if __name__ == "__main__":
    results_df = benchmark()
    print(results_df)


     Library  Read Time (s)  Drop NaN Time (s)  Filter Time (s)  Sort Time (s)
0     Pandas       1.359096           0.020782         0.013297       0.176111
1     Polars       2.882995           0.006110         0.006710       0.231632
2      Modin       2.051264           0.030071         0.011369       0.157387
3  FireDucks       1.382198           0.000579         0.000554       0.000442
