### Reading the data

In [None]:
import pandas as pd

# Load the whole benchmark CSV eagerly into memory as a Pandas DataFrame
df = pd.read_csv(
    filepath_or_buffer=r'.\test\groupby-N_10000000_K_100_file_0.csv',
)

# Show the first five rows as a quick sanity check of the parsed columns
df.head(n=5)

### Create a timing function

In [None]:
import time
import pandas as pd
import dask.dataframe as dd

def perform_test(df):
    """Time a group-by aggregation on a Pandas or Dask DataFrame.

    The benchmarked operation groups ``df`` by ``'id1'`` and sums ``'v1'``.
    For a Dask DataFrame, ``compute()`` is called inside the timed region so
    that the aggregation is actually executed rather than merely planned.

    Args:
        df: A ``pandas.DataFrame`` or ``dask.dataframe.DataFrame`` that has
            the columns ``'id1'`` and ``'v1'``.

    Returns:
        float: Elapsed time of the operation, in seconds.

    Raises:
        TypeError: If ``df`` is neither a Pandas nor a Dask DataFrame.
    """
    if isinstance(df, pd.DataFrame):
        needs_compute = False
    elif isinstance(df, dd.DataFrame):
        needs_compute = True
    else:
        # Fail loudly instead of falling through to an unbound local variable.
        raise TypeError(
            "perform_test expects a pandas or dask DataFrame, "
            f"got {type(df).__name__}"
        )

    # perf_counter is monotonic and high-resolution, so the measurement is
    # not affected by system clock adjustments.
    start_time = time.perf_counter()
    aggregated = df.groupby('id1')['v1'].sum()
    if needs_compute:
        # Dask only builds a task graph above; this line runs it.
        aggregated.compute()
    return time.perf_counter() - start_time


This function checks the type of the input DataFrame (Pandas or Dask) using `isinstance` and performs the operation accordingly. Because Dask evaluates lazily, `compute()` is called on Dask DataFrames to trigger the actual computation inside the timed section.

## Optimizing the performance of a Dask DataFrame

### Step 1: Setting a Dask baseline

In [None]:
import dask.dataframe as dd

# Baseline schema: every column is read as a generic object column, except
# 'v1', which is read as int64 (it is the column the benchmark sums).
dtypes = {
    column: ("int64" if column == "v1" else "object")
    for column in ("id1", "id2", "id3", "id4", "id5", "id6", "v1", "v2", "v3")
}

# Create the Dask DataFrame over the benchmark CSV using the baseline schema
df_dask = dd.read_csv(
    r'.\test\groupby-N_10000000_K_100_file_0.csv',
    dtype=dtypes,
)


### Step 2: Avoiding object columns

Change the data types of columns `id1`, `id2`, and `id3` to `string[pyarrow]`, and of `v3` to `float64`. The remaining columns (`id4`, `id5`, `id6`, `v1`, `v2`) can be read as `int64`.

In [None]:
import dask.dataframe as dd

# Typed schema, grouped by target dtype:
#   - id1..id3 hold text, stored as Arrow-backed strings instead of objects
#   - id4..id6, v1 and v2 are integers
#   - v3 is a float
dtypes = {
    **dict.fromkeys(("id1", "id2", "id3"), "string[pyarrow]"),
    **dict.fromkeys(("id4", "id5", "id6", "v1", "v2"), "int64"),
    "v3": "float64",
}

# Re-create the Dask DataFrame over the same CSV with the typed schema
df_dask = dd.read_csv(
    r'.\test\groupby-N_10000000_K_100_file_0.csv',
    dtype=dtypes,
)


In [None]:
import dask.dataframe as dd

import glob

# Split the data into partitions of roughly 100MB each. Repartitioning only
# changes the in-memory layout of the Dask DataFrame; it does not write any
# files, so the partitions are exported explicitly in the next step.
df_dask = df_dask.repartition(partition_size='100MB')

# Write one CSV file per partition into the test directory. Dask replaces the
# '*' in the name with the partition number. index=False keeps the index out
# of the files so they have the same columns as the original CSV and can be
# re-read with the same `dtypes`.
df_dask.to_csv(r'.\test\groupby-N_10000000_K_100_file_0.part*.csv', index=False)

# Collect the newly created part files from the test directory.
# Sorting makes the partition order deterministic (glob order is arbitrary).
file_paths = sorted(glob.glob(r'.\test\groupby-N_10000000_K_100_file_0.part*'))

# Create a Dask DataFrame from the multiple files
df_dask = dd.read_csv(file_paths, dtype=dtypes)

# Rerun the perform_test function on the updated Dask DataFrame
execution_time = perform_test(df_dask)


### Step 4a: Parquet instead of CSV

In [None]:
# Export the Dask DataFrame as a Parquet dataset using the pyarrow engine.
# compression=None is passed explicitly, so no compression codec is requested.
df_dask.to_parquet(
    'test/groupby.parquet',
    engine='pyarrow',
    compression=None,
)
