# Measure the execution time of the different implementations in `pandas` and `polars`
## A small (non-exhaustive) benchmark

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules
# before each cell is executed, so edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import polars as pl

# Generate a larger data set

In [None]:
# Seed both random number generators used below so that the generated
# substation ids, voltages and currents are the same on every run.
np.random.seed(42)
random.seed(42)

# Number of rows in the CSV file
n = 4_000_000  # Replace with the desired number of rows

# Read the clock exactly once. Calling datetime.now() inside the comprehension
# would re-read it for every row, so consecutive timestamps would drift and not
# be exactly one minute apart (and it costs n needless system calls).
end_time = datetime.now()
timestamps = [end_time - timedelta(minutes=i) for i in range(n)]

# One substation id per row, drawn uniformly from 1..10 (both inclusive).
substation_ids = [random.randint(1, 10) for _ in range(n)]

# Normal distribution with mean 230 and standard deviation 10
voltages = np.random.normal(230, 10, n)
# Normal distribution with mean 5 and standard deviation 2
currents = np.random.normal(5, 2, n)


df = pd.DataFrame(
    {
        "timestamp": timestamps,
        "substation_id": substation_ids,
        "voltage": voltages,
        "current": currents,
    }
)

# Write the DataFrame to a CSV file; the benchmark cells below read it back.
df.to_csv("electricity_usage.csv", index=False)
df.head()

# Pandas version

In [None]:
%%timeit
# Step 1: Read data from csv
df = pl.read_csv("electricity_usage.csv")

# Step 2: Set the correct timestamp format
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))

# Step 3: Calculate power (P=VI) and add it as a new column
df = df.with_columns((pl.col("voltage") * pl.col("current")).alias("power"))

# Step 4: Group by 'substation_id', resample timestamp and calculate daily average power
df = df.sort("timestamp")  # data has to be sorted for group_by_dynamic!
df_grouped = df.group_by_dynamic(
    index_column="timestamp",
    every="1d",
    closed="right",
    by="substation_id",
    include_boundaries=False,
).agg(pl.col("power").mean().alias("daily_avg_power"))

# Step 5: Filter out data where daily average power is less than a certain threshold
threshold = 1000
df_grouped = df_grouped.filter(pl.col("daily_avg_power") > threshold)

# Step 6: Sort result by substation and timestamp
df_grouped = df_grouped.sort(["substation_id", "timestamp"])

# Step 7: Write the transformed data to a new CSV file
df_grouped.write_csv("transformed_electricity_usage.csv")

# Polars version in eager mode

In [None]:
%%timeit
df = pl.read_csv("electricity_usage.csv")

# Convert 'timestamp' from string to datetime
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))

# calculate power outcome
df = df.with_columns((pl.col("voltage") * pl.col("current")).alias("power"))

# Group by 'substation_id' and calculate daily average power
df = df.sort("timestamp")  # data has to be sorted for group_by_dynamic!
df_grouped = df.group_by_dynamic(
    index_column="timestamp",
    every="1d",
    closed="right",
    by="substation_id",
    include_boundaries=False,
).agg(pl.col("power").mean().alias("daily_avg_power"))

# Filter data where daily average power is less than a certain threshold
threshold = 1000
df_grouped = df_grouped.filter(pl.col("daily_avg_power") > threshold)

df_grouped = df_grouped.sort(["substation_id", "timestamp"])

# Write the transformed data to a new CSV file
df_grouped.write_csv("transformed_electricity_usage.csv")

# Polars version in lazy mode (query pipeline)

In [None]:
%%timeit
threshold = 1000

# Define lazy query/pipeline
q = (
    pl.scan_csv("electricity_usage.csv")
    .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))
    .with_columns((pl.col("voltage") * pl.col("current")).alias("power"))
    .sort("timestamp")
    .group_by_dynamic(
        index_column="timestamp",
        every="1d",
        closed="right",
        by="substation_id",
        include_boundaries=False,
    ).agg(pl.col("power").mean().alias("daily_avg_power"))
    .filter(pl.col("daily_avg_power") > threshold)
    .sort(["substation_id", "timestamp"])
)

# possibility to test the pipeline with reduced amount data
# df_test = q.fetch(n_rows=int(100))

# Collect the data
df = q.collect(streaming=True) # set streaming = True if the data might not fit into memory

# Write the transformed data to a new CSV file
df.write_csv("transformed_electricity_usage.csv")

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © [Point 8 GmbH](https://point-8.de)_