In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import polars as pl
import datetime 
from tqdm import tqdm
from pathlib import Path

In [2]:
train_series = pl.scan_parquet('../train_series.parquet')
train_series

In [3]:
train_series_df = pl.read_parquet('../train_series.parquet')
train_series_df

series_id,step,timestamp,anglez,enmo
str,u32,str,f32,f32
"""038441c925bb""",0,"""2018-08-14T15:…",2.6367,0.0217
"""038441c925bb""",1,"""2018-08-14T15:…",2.6368,0.0215
"""038441c925bb""",2,"""2018-08-14T15:…",2.637,0.0216
"""038441c925bb""",3,"""2018-08-14T15:…",2.6368,0.0213
"""038441c925bb""",4,"""2018-08-14T15:…",2.6368,0.0215
"""038441c925bb""",5,"""2018-08-14T15:…",2.6367,0.0217
"""038441c925bb""",6,"""2018-08-14T15:…",2.6367,0.0217
"""038441c925bb""",7,"""2018-08-14T15:…",2.6367,0.0218
"""038441c925bb""",8,"""2018-08-14T15:…",2.798,0.0223
"""038441c925bb""",9,"""2018-08-14T15:…",3.0847,0.0217


In [5]:
train_events = pl.read_csv('../train_events.csv')
# train_events = train_events.drop_nulls()
train_events

series_id,night,event,step,timestamp
str,i64,str,i64,str
"""038441c925bb""",1,"""onset""",4992,"""2018-08-14T22:…"
"""038441c925bb""",1,"""wakeup""",10932,"""2018-08-15T06:…"
"""038441c925bb""",2,"""onset""",20244,"""2018-08-15T19:…"
"""038441c925bb""",2,"""wakeup""",27492,"""2018-08-16T05:…"
"""038441c925bb""",3,"""onset""",39996,"""2018-08-16T23:…"
"""038441c925bb""",3,"""wakeup""",44400,"""2018-08-17T05:…"
"""038441c925bb""",4,"""onset""",57240,"""2018-08-17T23:…"
"""038441c925bb""",4,"""wakeup""",62856,"""2018-08-18T06:…"
"""038441c925bb""",5,"""onset""",,
"""038441c925bb""",5,"""wakeup""",,


In [6]:
train_events2 = train_events.with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z"),
)

train_events2

series_id,night,event,step,timestamp
str,i64,str,i64,"datetime[μs, UTC]"
"""038441c925bb""",1,"""onset""",4992,2018-08-15 02:26:00 UTC
"""038441c925bb""",1,"""wakeup""",10932,2018-08-15 10:41:00 UTC
"""038441c925bb""",2,"""onset""",20244,2018-08-15 23:37:00 UTC
"""038441c925bb""",2,"""wakeup""",27492,2018-08-16 09:41:00 UTC
"""038441c925bb""",3,"""onset""",39996,2018-08-17 03:03:00 UTC
"""038441c925bb""",3,"""wakeup""",44400,2018-08-17 09:10:00 UTC
"""038441c925bb""",4,"""onset""",57240,2018-08-18 03:00:00 UTC
"""038441c925bb""",4,"""wakeup""",62856,2018-08-18 10:48:00 UTC
"""038441c925bb""",5,"""onset""",,
"""038441c925bb""",5,"""wakeup""",,


In [7]:
# Define the schema and constants as per your script
SERIES_SCHEMA = {
    "series_id": pl.Utf8,
    "step": pl.Int64,
    "anglez": pl.Float32,
    "enmo": pl.Float32,
}

FEATURE_NAMES = [
    "anglez",
    "enmo",
    "step",
    "hour_sin",
    "hour_cos",
    "month_sin",
    "month_cos",
    "minute_sin",
    "minute_cos",
    "anglez_sin",
    "anglez_cos",
]

ANGLEZ_MEAN = -8.810476
ANGLEZ_STD = 35.521877
ENMO_MEAN = 0.041315
ENMO_STD = 0.101829

# Define the feature engineering functions
def to_coord(x: pl.Expr, max_: int, name: str) -> list[pl.Expr]:
    rad = 2 * np.pi * (x % max_) / max_
    x_sin = rad.sin()
    x_cos = rad.cos()
    return [x_sin.alias(f"{name}_sin"), x_cos.alias(f"{name}_cos")]

def deg_to_rad(x: pl.Expr) -> pl.Expr:
    return np.pi / 180 * x

def add_feature(series_df: pl.DataFrame) -> pl.DataFrame:
    series_df = (
        series_df.with_row_count("step")
        .with_columns(
            *to_coord(pl.col("timestamp").dt.hour(), 24, "hour"),
            *to_coord(pl.col("timestamp").dt.month(), 12, "month"),
            *to_coord(pl.col("timestamp").dt.minute(), 60, "minute"),
            pl.col("step") / pl.count("step"),
            pl.col('anglez_rad').sin().alias('anglez_sin'),
            pl.col('anglez_rad').cos().alias('anglez_cos'),
        )
        .select("series_id", *FEATURE_NAMES)
    )
    return series_df

def save_each_series(this_series_df: pl.DataFrame, columns: list[str], output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)
    for col_name in columns:
        x = this_series_df.get_column(col_name).to_numpy(zero_copy_only=True)
        np.save(output_dir / f"{col_name}.npy", x)

In [10]:
# Process the train_series dataset
train_series_df2 = (
    train_series.with_columns(
        pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z"),
        # pl.col("timestamp").str.to_datetime(),
        deg_to_rad(pl.col("anglez")).alias("anglez_rad"),
        (pl.col("anglez") - ANGLEZ_MEAN) / ANGLEZ_STD,
        (pl.col("enmo") - ENMO_MEAN) / ENMO_STD,
    )
    .select(
        [
            pl.col("series_id"),
            pl.col("step"),
            pl.col("timestamp"),
            pl.col("anglez"),
            pl.col("enmo"),
            pl.col("anglez_rad"),
        ]
    )
    .collect(streaming=True)
    .sort(by=["series_id", "timestamp"])
)

In [11]:
for column_name, polars_dtype in SERIES_SCHEMA.items():
    train_series_df2 = train_series_df2.with_columns(
        pl.col(column_name).cast(polars_dtype).alias(column_name)
    )

In [21]:
merged_df = train_series_df2.join(
    train_events2.drop(columns=['night']), 
    on=['series_id', 'step', 'timestamp'], 
    how='outer'
)

In [7]:
train_series_df3 = add_feature(train_series_df2)
train_series_df3.head()

series_id,anglez,enmo,step,hour_sin,hour_cos,month_sin,month_cos,minute_sin,minute_cos,anglez_sin,anglez_cos
str,f32,f32,f64,f64,f64,f64,f64,f64,f64,f32,f32
"""038441c925bb""",0.322257,-0.192627,0.0,-0.965926,0.258819,-0.866025,-0.5,5.6655e-16,-1.0,0.046003,0.998941
"""038441c925bb""",0.32226,-0.194591,7.8158e-09,-0.965926,0.258819,-0.866025,-0.5,5.6655e-16,-1.0,0.046005,0.998941
"""038441c925bb""",0.322266,-0.193609,1.5632e-08,-0.965926,0.258819,-0.866025,-0.5,5.6655e-16,-1.0,0.046008,0.998941
"""038441c925bb""",0.32226,-0.196555,2.3447e-08,-0.965926,0.258819,-0.866025,-0.5,5.6655e-16,-1.0,0.046005,0.998941
"""038441c925bb""",0.32226,-0.194591,3.1263e-08,-0.965926,0.258819,-0.866025,-0.5,5.6655e-16,-1.0,0.046005,0.998941
