In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import polars as pl

# The Data
This synthetic dataset contains voltage and current measurements from multiple substations, with one measurement per minute.

In [None]:
# Seed both random number generators so the substation ids, voltages and
# currents are reproducible between runs.
np.random.seed(42)
random.seed(42)

# Number of rows in the CSV file
n = 4_000_000  # Replace with the desired number of rows

# Read the clock exactly once. Calling datetime.now() for every row lets the
# clock advance while the list is being built, so consecutive rows would not be
# exactly one minute apart.
end_time = datetime.now()

# One measurement per minute, going backwards in time from `end_time`
timestamps = [end_time - timedelta(minutes=i) for i in range(n)]
substation_ids = [random.randint(1, 10) for _ in range(n)]  # ids 1..10 (inclusive)
voltages = np.random.normal(230, 10, n)  # Normal distribution with mean 230 and standard deviation 10
currents = np.random.normal(5, 2, n)  # Normal distribution with mean 5 and standard deviation 2


df = pd.DataFrame({
    'timestamp': timestamps,
    'substation_id': substation_ids,
    'voltage': voltages,
    'current': currents
})

# Write the DataFrame to a CSV file (input for all pipelines below)
df.to_csv("electricity_usage.csv", index=False)
df.head()

# Pandas version

In [None]:
%%timeit
# Minimum daily average power a (substation, day) pair needs to be kept
threshold = 1000

# Step 1: Load the raw measurements
df = pd.read_csv("electricity_usage.csv")

# Step 2: Transformations
# Parse the timestamp strings and use them as index (the column is kept as well)
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
df = df.set_index(df['timestamp'])

# Power per measurement (P = V * I)
df = df.assign(power=df['voltage'] * df['current'])

# Daily mean power per substation, flattened back into regular columns
daily_power = (
    df.groupby('substation_id')
    .resample("D")["power"]
    .mean()
    .reset_index()
)

# Keep only days whose average power exceeds the threshold, ordered by
# substation and day
df_grouped = (
    daily_power[daily_power['power'] > threshold]
    .sort_values(by=["substation_id", "timestamp"])
)

# Step 3: Write the transformed data to a new CSV file
df_grouped.to_csv("transformed_electricity_usage_pandas.csv", index=False)

# One possible solution with polars for **part 1**

In [None]:
%%timeit
# Load the raw measurements; 'timestamp' is read as a string column
df = pl.read_csv("electricity_usage.csv")

# Convert 'timestamp' from string to datetime
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))

# Calculate power (P=VI) and add it as a new column
df = df.with_columns((pl.col("voltage") * pl.col("current")).alias("power"))

# Group by 'substation_id' and calculate daily average power
df = df.sort("timestamp")  # data has to be sorted for group_by_dynamic!
df_grouped = df.group_by_dynamic(
    index_column="timestamp",
    every="1d",
    # Left-closed daily windows [00:00, 24:00): a measurement taken exactly at
    # midnight belongs to the day that starts then. This matches pandas'
    # resample("D") default used in the pandas version of this pipeline.
    closed="left",
    by="substation_id",
    include_boundaries=False,
).agg(pl.col("power").mean().alias("daily_avg_power"))

# Keep only rows whose daily average power is above the threshold
threshold = 1000
df_grouped = df_grouped.filter(pl.col("daily_avg_power") > threshold)

df_grouped = df_grouped.sort(["substation_id", "timestamp"])

# Write the transformed data to a new CSV file
df_grouped.write_csv("transformed_electricity_usage.csv")

### Solution part 1 with `group_by` instead of `group_by_dynamic`
Instead of `group_by_dynamic`, a regular `group_by` can also be used.  
Because a plain `group_by` has no resampling functionality, a separate date column has to be introduced to group on.

In [None]:
# Load the measurements and derive the two helper columns in one step:
#   * "date"  - the timestamp string parsed directly as pl.Date, so that a
#               plain group_by can group on calendar days
#   * "power" - P = V * I per measurement
#
# Alternative for "date": first parse the string into a datetime column and
# then cast that column to a date:
# df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))
# df = df.with_columns(pl.col("timestamp").cast(pl.Date).alias("date"))
df = pl.read_csv("electricity_usage.csv").with_columns(
    pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%d %H:%M:%S%.f").alias("date"),
    (pl.col("voltage") * pl.col("current")).alias("power"),
)

# Daily average power per substation, ordered by substation and day
df_grouped = (
    df.group_by(["date", "substation_id"])
    .agg(pl.col("power").mean().alias("daily_avg_power"))
    .sort(["substation_id", "date"])
)

# Solution Part 2 - Polars version as query with lazy mode

## Hands on Part 2 - Polars version as query with lazy mode
Now imagine that the dataset is larger than memory.  
Write the previous pipeline as a Polars query (use either your own code or the provided solution as a base).

Execute the query in lazy mode and measure the run time with `%%timeit`

In [None]:
%%timeit
# Minimum daily average power a (substation, day) pair needs to be kept
threshold = 1000

# Build the whole pipeline as a lazy query; nothing is read or computed until
# collect() is called.
q = (
    pl.scan_csv("electricity_usage.csv")
    # Convert 'timestamp' from string to datetime
    .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f"))
    # Calculate power (P=VI) and add it as a new column
    .with_columns((pl.col("voltage") * pl.col("current")).alias("power"))
    # data has to be sorted for group_by_dynamic!
    .sort("timestamp")
    .group_by_dynamic(
        index_column="timestamp",
        every="1d",
        # Left-closed daily windows [00:00, 24:00): a measurement taken exactly
        # at midnight belongs to the day that starts then. This matches pandas'
        # resample("D") default used in the pandas version of this pipeline.
        closed="left",
        by="substation_id",
        include_boundaries=False,
    ).agg(pl.col("power").mean().alias("daily_avg_power"))
    # Keep only rows whose daily average power is above the threshold
    .filter(pl.col("daily_avg_power") > threshold)
    .sort(["substation_id", "timestamp"])
)

# possibility to test the pipeline with reduced amount data
# df_test = q.fetch(n_rows=int(100))

# Step 3: Write the transformed data to a new CSV file
df = q.collect(streaming=True) # set streaming = True if the data might not fit into memory
df.write_csv("transformed_electricity_usage.csv")

# in case you want a pandas dataframe for plotting etc.
df_pandas = df.to_pandas()

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © [Point 8 GmbH](https://point-8.de)_