In [3]:
import pandas as pd
import dask.dataframe as dd
import time
import numpy as np

# --- Data Generation ---
# Seed the generator so that every "Restart & Run All" benchmarks the same data.
SEED = 42
rng = np.random.default_rng(SEED)

n_rows = 10_000_000
n_cols = 5

# n_cols float columns of uniform samples in [0, 1), named col_0 .. col_{n_cols - 1}.
data = {
    f'col_{i}': rng.random(n_rows) for i in range(n_cols)
}
# One string label per row, drawn uniformly from the four categories.
data['category'] = rng.choice(['A', 'B', 'C', 'D'], n_rows)



In [6]:
# --- Pandas DataFrame ---
# Time the construction of a pandas DataFrame from the dict of generated arrays.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() is wall-clock time and can jump if the system clock changes.
start_pandas = time.perf_counter()
pandas_df = pd.DataFrame(data)
end_pandas = time.perf_counter()
pandas_creation_time = end_pandas - start_pandas

In [7]:
# --- Dask DataFrame ---
# Time wrapping the existing pandas frame as a Dask DataFrame with 4 partitions.
# perf_counter() is used because it is a monotonic, high-resolution clock meant
# for measuring short durations.
start_dask = time.perf_counter()
dask_df = dd.from_pandas(pandas_df, npartitions=4)
end_dask = time.perf_counter()
dask_creation_time = end_dask - start_dask

# Report both creation timings side by side (the pandas timing comes from the
# cell that built pandas_df).
print(f"Pandas DataFrame Creation Time: {pandas_creation_time:.4f} seconds")
print(f"Dask DataFrame Creation Time: {dask_creation_time:.4f} seconds")

Pandas DataFrame Creation Time: 0.3519 seconds
Dask DataFrame Creation Time: 0.5300 seconds


In [9]:
# --- Pandas Aggregation ---
# Time the mean of col_0 per category. pandas evaluates eagerly, so the groupby
# result is already materialised and needs no .compute() step.
# perf_counter() is a monotonic, high-resolution clock meant for short durations.
start_pandas_agg = time.perf_counter()
pandas_grouped = pandas_df.groupby('category')['col_0'].mean()
pandas_result = pandas_grouped
end_pandas_agg = time.perf_counter()
pandas_agg_time = end_pandas_agg - start_pandas_agg

In [10]:
# --- Dask Aggregation ---
# Time the mean of col_0 per category. The groupby expression is lazy; the work
# happens in .compute(), so the timed region must include it.
# perf_counter() is a monotonic, high-resolution clock meant for short durations.
start_dask_agg = time.perf_counter()
dask_grouped = dask_df.groupby('category')['col_0'].mean()
dask_result = dask_grouped.compute()
end_dask_agg = time.perf_counter()
dask_agg_time = end_dask_agg - start_dask_agg

# Report both aggregation timings side by side (the pandas timing comes from the
# pandas aggregation cell).
print(f"Pandas Aggregation Time: {pandas_agg_time:.4f} seconds")
print(f"Dask Aggregation Time: {dask_agg_time:.4f} seconds")



Pandas Aggregation Time: 0.6099 seconds
Dask Aggregation Time: 0.7329 seconds
Pandas Filtering Time: 0.1611 seconds
Dask Filtering Time: 0.6978 seconds
Pandas Apply Time: 2.3105 seconds
Dask Apply Time: 2.8247 seconds
Pandas Sum Time: 0.0119 seconds
Dask Sum Time: 0.5834 seconds


In [13]:
# --- Pandas Filtering ---
# Time selecting the rows where col_1 > 0.5 and counting them.
# perf_counter() is a monotonic, high-resolution clock meant for short durations.
start_pandas_filter = time.perf_counter()
pandas_filtered = pandas_df[pandas_df['col_1'] > 0.5]
pandas_filtered_count = len(pandas_filtered)
end_pandas_filter = time.perf_counter()
pandas_filter_time = end_pandas_filter - start_pandas_filter

# --- Dask Filtering ---
# Same filter on the Dask frame. len() triggers the computation and yields a
# scalar row count, matching the pandas side. DataFrame.count() would instead
# return a per-column Series of non-null counts, which is a different result
# and extra work, making the comparison unfair.
start_dask_filter = time.perf_counter()
dask_filtered = dask_df[dask_df['col_1'] > 0.5]
dask_filtered_count = len(dask_filtered)
end_dask_filter = time.perf_counter()
dask_filter_time = end_dask_filter - start_dask_filter

print(f"Pandas Filtering Time: {pandas_filter_time:.4f} seconds")
print(f"Dask Filtering Time: {dask_filter_time:.4f} seconds")



Pandas Filtering Time: 0.2072 seconds
Dask Filtering Time: 0.6872 seconds


In [12]:
# --- Pandas Apply Function ---
def custom_function(x):
    """Return ``x`` multiplied by 2.

    Used as the per-element callable in the pandas and Dask ``apply`` benchmarks.
    """
    doubled = x * 2
    return doubled

# Time applying custom_function to every element of col_2 with pandas.
# perf_counter() is a monotonic, high-resolution clock meant for short durations.
start_pandas_apply = time.perf_counter()
pandas_applied = pandas_df['col_2'].apply(custom_function)
end_pandas_apply = time.perf_counter()
pandas_apply_time = end_pandas_apply - start_pandas_apply

# --- Dask Apply Function ---
# Same element-wise apply on the Dask column. meta declares the name and dtype
# of the result, and .compute() materialises it inside the timed region.
start_dask_apply = time.perf_counter()
dask_applied = dask_df['col_2'].apply(custom_function, meta=('col_2', 'float64')).compute()
end_dask_apply = time.perf_counter()
dask_apply_time = end_dask_apply - start_dask_apply

print(f"Pandas Apply Time: {pandas_apply_time:.4f} seconds")
print(f"Dask Apply Time: {dask_apply_time:.4f} seconds")



Pandas Apply Time: 2.3684 seconds
Dask Apply Time: 2.9519 seconds


In [11]:
# --- Pandas Sum ---
# Time summing col_3. This operation is very short, so a high-resolution
# monotonic clock (perf_counter) is needed for a meaningful reading.
start_pandas_sum = time.perf_counter()
pandas_sum = pandas_df['col_3'].sum()
end_pandas_sum = time.perf_counter()
pandas_sum_time = end_pandas_sum - start_pandas_sum

# --- Dask Sum ---
# Same reduction on the Dask column; .compute() runs it inside the timed region.
start_dask_sum = time.perf_counter()
dask_sum = dask_df['col_3'].sum().compute()
end_dask_sum = time.perf_counter()
dask_sum_time = end_dask_sum - start_dask_sum

print(f"Pandas Sum Time: {pandas_sum_time:.4f} seconds")
print(f"Dask Sum Time: {dask_sum_time:.4f} seconds")

Pandas Sum Time: 0.0246 seconds
Dask Sum Time: 0.5830 seconds
