# Testing the apply method

In [1]:
try:
    import pandopt as pdo
except ImportError:
    # pandopt is not importable from the current environment: fall back to a
    # source checkout located two directory levels above the working directory.
    import os
    import sys
    sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
    import pandopt as pdo
import pandas as pd
import numpy as np
import tqdm 
import pandas as pd
import numpy as np
import timeit
import functools
import time
import plotly.express as px
from typing import Callable, Dict
import polars as pl
from IPython.display import display

ModuleNotFoundError: No module named 'pandopt.transfcoders'

In [None]:
# Example functions
def simple_start(z):
    """Straight-line example: ((A + B) / C + B * D) / B, no branching.

    ``z`` is indexed with the keys 'A', 'B', 'C' and 'D'.
    """
    acc = (z['A'] + z['B']) / z['C']
    b_val = z['B']
    acc += b_val * z['D']
    return acc / b_val

def harder_func(z):
    """Branching example built on the ratio (A + B) / C.

    A strictly positive ratio is divided by B and returned immediately;
    otherwise B * D is added to it and the sum is multiplied by B.
    ``z`` is indexed with the keys 'A', 'B', 'C' and 'D'.
    """
    ratio = (z['A'] + z['B']) / z['C']
    b_val = z['B']
    if not ratio > 0:
        ratio += b_val * z['D']
        return ratio * b_val
    return ratio / b_val

def harder2_func(z):
    """Example row function using walrus bindings, an early return and a conditional expression.

    ``z`` is indexed with the keys 'A', 'B', 'C' and 'D'.
    Returns ``x / k`` when ``k > j``; otherwise ``x`` is scaled by ``j`` and
    ``k`` is subtracted when ``k > z['C']`` or added when it is not.
    """
    x = (z['A'] + z['B']) / z['C']
    # k = A - C and j = B / D are bound inside the condition and reused below.
    if (k:=z['A']-z['C']) > (j:=z['B']/z['D']):
        return x / k
    x *= j
    return x - k if k > z['C'] else x + k


def harder3_func(z):
    """Example row function that adds a local lambda to the walrus/branching pattern.

    ``z`` is indexed with the keys 'A', 'B', 'C' and 'D'.
    Returns ``j / k`` when ``k > j``; otherwise ``x`` is scaled by ``j`` and
    ``k`` is subtracted when ``k > z['C']`` or added when it is not.
    """
    # g returns a when |a| > |b|, otherwise -2 * b ** (-a).
    g=lambda a, b: a if abs(a) > abs(b) else - 2 * (b**(-a))
    x = (z['A'] + z['B']) / z['C']
    # k = A - C and j = g(B, D) are bound inside the condition and reused below.
    if (k:=z['A']-z['C']) > (j:=g(z['B'],z['D'])):
        return j / k
    x *= j
    return x - k if k > z['C'] else x + k

def agg_sum(z):
    """Return ``np.sum(z)``; used as the aggregation callable in the benchmarks."""
    return np.sum(z)

## Basic test and work setup

In [None]:
def timeit(method_func, df, method_name, check_metric):
    """Run one benchmark candidate on ``df``, print a report row and return the elapsed time.

    NOTE: this function has the same name as the standard-library ``timeit``
    module, so once it is defined the bare name ``timeit`` refers to this
    function, not to the module.

    Args:
        method_func: Callable taking ``df``, or ``False`` for a method that has
            been discarded (a "Discarded" row is printed and nothing is run).
        df: Object handed to ``method_func``.
        method_name: Label printed in the first column.
        check_metric: Callable applied to the result (converted with
            ``to_numpy()`` when the result offers it); its value is printed so
            the methods can be compared against each other.

    Returns:
        Elapsed seconds as a float, or ``None`` for a discarded method.
    """
    if method_func is False:
        # Inner double quotes keep this f-string valid on Python versions
        # older than 3.12, which reject reusing the outer quote character.
        print(f'{method_name:<15} {"NA":<15} {"NA":<15} {"Discarded":<10}')
        return None

    started = time.perf_counter()
    result = method_func(df)
    # The conversion stays inside the timed span, as in the original measurement.
    result_np = result.to_numpy() if hasattr(result, 'to_numpy') else result
    execution_time = time.perf_counter() - started

    if isinstance(result, pd.Series):
        result_shape_str = f"{result.shape[0]} x 1"
    else:
        result_shape_str = " x ".join(map(str, result.shape)) if hasattr(result, 'shape') else "N/A"

    metric = check_metric(result_np)
    try:
        print(f'{method_name:<15} {result_shape_str:<15} {metric:<15.5f} {execution_time:<10.5f}')
    except (TypeError, ValueError):
        # The metric is not a plain number (e.g. a string or an array):
        # print its string form instead of a fixed-point value.
        print(f'{method_name:<15} {result_shape_str:<15} {metric!s:<15} {execution_time:<10}')
    return execution_time

def format_number(num):
    """
    Convert a numerical value into a human-readable format,
    adding suffixes like K, M, B, etc.

    Values are scaled by 1000 per suffix. 'T' is the largest suffix, so
    anything of magnitude 1000T or more is still expressed in T with one
    decimal (e.g. 1e15 -> "1000.0T") instead of being scaled down again.
    """
    for unit in ('', 'K', 'M', 'B'):
        if abs(num) < 1000:
            return f"{num:3.0f}{unit}"
        num /= 1000.0
    # num is now expressed in units of T; do not divide any further.
    if abs(num) < 1000:
        return f"{num:3.0f}T"
    return f"{num:.1f}T"


def run_compare(methods: Dict[str, Callable], x_seconds: float = 1.5, start_ten_exponent: int = 1, check_metric: Callable = lambda x: (1 + np.abs(np.max(x) - np.min(x))) / np.median(x)):
    """Benchmark several methods on random DataFrames of growing size.

    For each size 10**k, from ``start_ten_exponent`` up to 10**9, a float32
    DataFrame with columns A-D is generated and every still-active method is
    timed with ``timeit``. A method slower than ``x_seconds`` is discarded for
    all larger sizes, and the run stops once no method is left.

    Args:
        methods: Mapping of label -> callable taking a pandas DataFrame.
            The mapping itself is not modified.
        x_seconds: Time budget per run above which a method is discarded.
        start_ten_exponent: Exponent of the first size tested. Nothing is
            discarded at this first size, so one-off warm-up cost (e.g.
            compilation) does not eliminate a method.
        check_metric: Callable reducing a result to a value printed next to the
            timing, to eyeball that the methods agree.

    Returns:
        DataFrame of execution times, one column per method, indexed by row
        count. The same data is plotted on log-log axes and displayed.
    """
    # Private copy: discarding a method must not rewrite the caller's dict.
    active = dict(methods)
    execution_times = {key: {} for key in active}
    ten_exponent = start_ten_exponent
    # Sizes from 10**10 rows on are not tested here; at that scale the run is
    # bound by RAM/disk and would need lazy-loading options instead.
    while ten_exponent < 10:
        df_size = int(10**ten_exponent)

        print(f'\ntesting {format_number(df_size)} rows')
        print(f'{"method_name":<15} {"result shape":<15} {"compare metric":<15} {"execution_time":<10}')

        pandas_df = pd.DataFrame(np.random.randn(df_size, 4), columns=['A', 'B', 'C', 'D']).astype(np.float32)

        iteration_discards = []
        for method_name, method_func in active.items():
            e_time = timeit(method_func, pandas_df, method_name, check_metric)
            # The exponent check keeps every method alive through the first size.
            if e_time and e_time > x_seconds and ten_exponent > start_ten_exponent:
                iteration_discards.append(method_name)
            execution_times[method_name][df_size] = e_time

        for method_name in iteration_discards:
            active[method_name] = False

        # Discarded methods are kept as False entries, so test the values:
        # the mapping never becomes empty by itself.
        if not any(active.values()):
            print("All methods exceeded the time limit. Ending tests.")
            break
        ten_exponent += 1

    results = pd.DataFrame(execution_times)
    results_long = results.reset_index().melt(id_vars=['index'], var_name='Method', value_name='Execution Time')
    results_long.rename(columns={'index': 'Rows'}, inplace=True)
    fig = px.line(results_long, x='Rows', y='Execution Time', color='Method', log_x=True, log_y=True, title='Execution Times by Method')
    fig.show()

    display(results)
    return results

## Pandopt Apply basic numpy sum vs Alternatives

### Sum of columns per row

In [None]:
# Define test methods
def pandas_apply(df):
    """Row-wise sum through ``DataFrame.apply`` with ``agg_sum`` (axis=1)."""
    return df.apply(agg_sum, axis=1)

def pandas_sum(df):
    """Row-wise sum through the built-in ``DataFrame.sum`` (axis=1)."""
    return df.sum(axis=1)

def numpy_sum(df):
    """Row-wise sum with ``np.sum`` over the frame converted by ``to_numpy()`` (axis=1)."""
    return np.sum(df.to_numpy(), axis=1)

def polars_sum(df):
    """Row-wise sum with polars ``sum_horizontal``.

    The pandas -> polars conversion happens inside this function, so it is
    part of whatever a caller times.
    """
    polars_df = pl.DataFrame(df)
    return polars_df.sum_horizontal()

def pandopt_apply(df):
    """Row-wise ``apply(agg_sum, axis=1)`` on the frame wrapped in ``pdo.DataFrame``."""
    return pdo.DataFrame(df).apply(agg_sum, axis=1)  

# Label -> callable registry of the row-wise sum candidates.
methods = {
    'pandas_apply': pandas_apply,
    'pandas_sum': pandas_sum,
    'numpy_sum': numpy_sum,
    'polars_sum': polars_sum,
    'pandopt_apply': pandopt_apply, 
}

# Start at 10**2 rows; methods slower than 1.5 s are dropped for larger sizes.
run_compare(methods, x_seconds = 1.5, start_ten_exponent = 2)

### Sum of rows per column

In [None]:
# Define test methods
def pandas_apply(df):
    """Column-wise sum through ``DataFrame.apply`` with ``agg_sum`` (axis=0)."""
    return df.apply(agg_sum, axis=0)

def pandas_sum(df):
    """Column-wise sum through the built-in ``DataFrame.sum`` (axis=0)."""
    return df.sum(axis=0)

def numpy_sum(df):
    """Column-wise sum with ``np.sum`` over the frame converted by ``to_numpy()`` (axis=0)."""
    return np.sum(df.to_numpy(), axis=0)

def polars_sum(df):
    """Column-wise sum with polars ``DataFrame.sum``.

    The pandas -> polars conversion happens inside this function, so it is
    part of whatever a caller times.
    """
    polars_df = pl.DataFrame(df)
    return polars_df.sum()

def pandopt_apply(df):
    """Column-wise ``apply(agg_sum, axis=0)`` on the frame wrapped in ``pdo.DataFrame``."""
    return pdo.DataFrame(df).apply(agg_sum, axis=0)

# Label -> callable registry of the column-wise sum candidates.
methods = {
    'pandas_apply': pandas_apply,
    'pandas_sum': pandas_sum,
    'numpy_sum': numpy_sum,
    'polars_sum': polars_sum,
    'pandopt_apply': pandopt_apply, 
}

# Uses run_compare's default starting size; methods slower than 1.5 s are dropped.
run_compare(methods, x_seconds = 1.5)

### Total sum

In [None]:
# Define test methods
def pandas_apply(df):
    """Grand total via ``DataFrame.apply``: per-column sums, then their sum.

    ``df.apply(agg_sum)`` yields one sum per column. The second reduction is
    done by calling ``agg_sum`` on that Series directly, because
    ``Series.apply`` would call ``agg_sum`` on each element separately and
    hand back the per-column sums unchanged instead of a single total.
    """
    return agg_sum(df.apply(agg_sum))

def pandas_sum(df):
    """Grand total: ``DataFrame.sum`` per column, then the sum of those sums."""
    return df.sum().sum()

def numpy_sum(df):
    """Grand total with ``np.sum`` over every element (``axis=None``)."""
    return np.sum(df.to_numpy(), axis=None)

def polars_sum(df):
    """Grand total with polars: ``sum_horizontal`` per row, then the sum of those.

    The pandas -> polars conversion happens inside this function, so it is
    part of whatever a caller times.
    """
    polars_df = pl.DataFrame(df)
    return polars_df.sum_horizontal().sum()

def pandopt_apply(df):
    """Call ``apply(agg_sum, axis=None)`` on the frame wrapped in ``pdo.DataFrame``.

    NOTE(review): presumably ``axis=None`` asks pandopt for a reduction over
    the whole frame - confirm against the pandopt API.
    """
    return pdo.DataFrame(df).apply(agg_sum, axis=None)  

# Label -> callable registry of the grand-total candidates.
methods = {
    'pandas_apply': pandas_apply,
    'pandas_sum': pandas_sum,
    'numpy_sum': numpy_sum,
    'polars_sum': polars_sum,
    'pandopt_apply': pandopt_apply, 
}

# The comparison metric is just the string form of each result.
run_compare(methods, x_seconds = 1.5, check_metric = lambda x: str(x))

## Wider test using more general Callables

In [None]:
def agg_sum(z):
    """Return ``np.sum(z)``."""
    return np.sum(z)

def agg_mean(z):
    """Return ``np.mean(z)``."""
    return np.mean(z)

def agg_max(z):
    """Return ``np.max(z)``."""
    return np.max(z)

def agg_min(z):
    """Return ``np.min(z)``."""
    return np.min(z)

def agg_std(z):
    """Return ``np.std(z)``."""
    return np.std(z)


def measure_performance(df, func, window_size=3):
    """Time one row-wise ``df.apply(func, axis=1)`` call.

    Args:
        df: Object exposing a pandas-style ``apply`` method.
        func: Callable applied to each row.
        window_size: Unused; kept so existing callers keep working.

    Returns:
        ``(checksum, elapsed_seconds, None)`` on success, where the checksum is
        ``np.sum`` of the apply result, or ``(None, None, error_message)`` when
        anything raises, so a failing function is recorded rather than fatal.
    """
    try:
        operation = functools.partial(df.apply, func, axis=1)
        # time.perf_counter is the clock behind timeit.default_timer; calling
        # it directly avoids the name ``timeit``, which this file rebinds to a
        # benchmark helper function that has no ``default_timer`` attribute.
        start_time = time.perf_counter()
        result = operation()
        elapsed_time = time.perf_counter() - start_time
        return np.sum(result, axis=0), elapsed_time, None
    except Exception as e:
        return None, None, str(e)

def run_tests(max_ten_exponent, test_funcs, n_iter = 15):
    """Compare pandas and pandopt row-wise ``apply`` over sizes, functions and repetitions.

    For every size 10**1 .. 10**max_ten_exponent, every function in
    ``test_funcs`` and every one of ``n_iter`` repetitions, a fresh random
    float32 DataFrame with columns A-D is generated and ``measure_performance``
    is run on it once as a pandas frame and once wrapped in ``pdo.DataFrame``.

    Args:
        max_ten_exponent: Largest power of ten used as the row count.
        test_funcs: Callables to apply row-wise; ``__name__`` is used as label.
        n_iter: Repetitions per (size, function) pair.

    Returns:
        DataFrame with one row per measurement holding size, function name,
        iteration, both timings, both checksums and both error messages.
    """
    results = {}
    total_tests = max_ten_exponent * len(test_funcs) * n_iter
    # The context manager closes the bar even when a measurement raises.
    with tqdm.tqdm(total=total_tests, desc="Running Tests", ncols=100) as progress_bar:
        for test_num in range(1, max_ten_exponent + 1):
            df_size = int(10**test_num)
            for func in test_funcs:
                for test_iter in range(n_iter):
                    pandas_df = pd.DataFrame(np.random.randn(df_size, 4), columns=['A', 'B', 'C', 'D']).astype(np.float32)
                    pandopt_df = pdo.DataFrame(pandas_df)
                    pandas_checksum, pandas_time, pandas_error = measure_performance(pandas_df, func)
                    pandopt_checksum, pandopt_time, pandopt_error = measure_performance(pandopt_df, func)

                    key = f"Size: 10^{test_num}, Func: {func.__name__}, Test:- {test_iter}"
                    results[key] = {
                        "Size":df_size,
                        "function": func.__name__, 
                        "test_iter": test_iter,
                        "Pandas Time (s)": pandas_time,
                        "Pandopt Time (s)": pandopt_time,
                        "Checksum Pandas": pandas_checksum,
                        "Checksum Pandopt": pandopt_checksum,
                        "Pandas Error": pandas_error,
                        "Pandopt Error": pandopt_error
                    }
                    # One tick per measurement, matching how total_tests is counted.
                    progress_bar.update(1)

    return pd.DataFrame.from_dict(results, orient='index')

# Run the tests
# Functions to benchmark: the four hand-written row functions plus the numpy aggregations.
test_functions = [simple_start, harder_func, harder2_func, harder3_func, agg_sum, agg_mean, agg_max, agg_min, agg_std] 
# Row counts go from 10**1 up to 10**8.
results_df = run_tests(max_ten_exponent=8, test_funcs=test_functions)
results_df

In [None]:
# pandopt time relative to pandas time, minus 1: negative when pandopt is faster.
results_df['time reduction'] = results_df["Pandopt Time (s)"] / results_df["Pandas Time (s)"] - 1
# pandas time relative to pandopt time, minus 1: positive when pandopt is faster.
results_df['performance multiplicator'] = results_df["Pandas Time (s)"] / results_df["Pandopt Time (s)"] - 1
results_df

In [None]:
# Write the benchmark table to benchmark_apply.csv in the working directory.
results_df.to_csv('benchmark_apply.csv')

In [None]:
# Number of entries that are not None in each error column (pandas, pandopt).
results_df['Pandas Error'].apply(lambda x: x is not None).sum(),results_df['Pandopt Error'].apply(lambda x: x is not None).sum()

In [None]:
# Total measured time per library (pandas, pandopt).
results_df['Pandas Time (s)'].sum(),results_df['Pandopt Time (s)'].sum()

In [None]:
# Scatter of pandopt time against pandas time, one point per measurement, drawn with the plotly backend.
results_df.plot(x='Pandas Time (s)', y='Pandopt Time (s)', kind = 'scatter', backend='plotly')