In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before code is executed and edits to local .py
# files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import polars as pl

# The Data
This synthetic dataset contains voltage and current measurements from multiple substations.

In [None]:
# Seed both generators so the sampled substation ids, voltages and currents
# are identical on every run.
np.random.seed(42)
random.seed(42)

# Number of rows in the CSV file
n = 1_000_000  # Replace with the desired number of rows

# Read the clock exactly once. Calling datetime.now() inside the comprehension
# would re-read it on every iteration, so consecutive timestamps would drift
# and not be spaced exactly one minute apart.
# Note: the timestamps are still relative to the wall-clock time of the run,
# so they (unlike the seeded random values) differ between runs.
reference_time = datetime.now()

# One measurement per minute, going backwards from the reference time.
timestamps = [reference_time - timedelta(minutes=i) for i in range(n)]
# Uniformly sampled substation ids between 1 and 10 (both inclusive).
substation_ids = [random.randint(1, 10) for _ in range(n)]
voltages = np.random.normal(230, 10, n)  # Normal distribution with mean 230 and standard deviation 10
currents = np.random.normal(5, 2, n)  # Normal distribution with mean 5 and standard deviation 2


df = pd.DataFrame({
    'timestamp': timestamps,
    'substation_id': substation_ids,
    'voltage': voltages,
    'current': currents
})

# Write the DataFrame to a CSV file
df.to_csv("electricity_usage.csv", index=False)
# Last expression of the cell: render the first rows as a preview.
df.head()

# Pandas version
Execution time: ~780 ms

In [None]:
# Display the generated DataFrame as the cell output (for a frame this large
# the notebook shows only a shortened preview of the rows).
df

In [None]:
# Load the raw measurements from the CSV file.
df = pd.read_csv("electricity_usage.csv")

# Parse the timestamp strings and use them as the index; the column itself is
# kept so it remains available next to the index.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
df = df.set_index('timestamp', drop=False)

# Power is the product of voltage and current (P = V * I).
df = df.assign(power=df['voltage'] * df['current'])

# Minimum daily average power (exclusive) a row needs to be kept.
threshold = 1000

# Daily mean power per substation -> keep only rows above the threshold ->
# order by substation and day.
df_grouped = (
    df.groupby('substation_id')
    .resample("D")["power"]
    .mean()
    .reset_index()
    .loc[lambda daily: daily['power'] > threshold]
    .sort_values(by=["substation_id", "timestamp"])
)

# Persist the transformed result as a new CSV file.
df_grouped.to_csv("transformed_electricity_usage_pandas.csv", index=False)

# Hands on exercise:
Create the "data pipeline" using the polars library.
The pipeline should include the following steps:
- Read the data from the CSV file
- Convert the timestamp strings to a datetime format
- Calculate the power
- Compute the daily average power for each substation
- Remove all entries whose daily average power is not greater than 1000
- Sort the result by substation and date
- Write the result to a CSV file

Compare run times with `%%timeit`  
Hint: Polars has a `group_by_dynamic` method    
Feel free to consult the online Polars documentation.

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © [Point 8 GmbH](https://point-8.de)_