# Memory Reduction
https://www.kaggle.com/code/andradaolteanu/zzz-good-night-sleep-with-80-memory-reduction/notebook

In [1]:
import os

import numpy as np
import pandas as pd
import cudf
import matplotlib.pyplot as plt

import gc
import torch

In [2]:
class CFG:
    input_dir = os.path.join("/kaggle", "input")
    output_dir = os.path.join("/kaggle", "working")
    competition_dir = os.path.join(input_dir, "child-mind-institute-detect-sleep-states")
    train_series = os.path.join(competition_dir, "train_series.parquet")
    train_event = os.path.join(competition_dir, "train_events.csv")
    test_series = os.path.join(competition_dir, "test_series.parquet")
    sample_sub = os.path.join(competition_dir, "sample_submission.csv")

# Event memory reduction

In [3]:
train_event = cudf.read_csv(CFG.train_event)

## series_id map

In [4]:
train_id_map = cudf.DataFrame({"series_id": train_event.series_id.unique(),
                               "id_map": train_event.series_id.unique().index})
train_id_map.id_map = train_id_map.id_map.astype(np.uint16)
train_event = train_event.merge(right=train_id_map, on="series_id").drop(columns="series_id")

In [5]:
train_event.night = train_event.night.astype(np.uint16)
# event relabeled
train_event.event = train_event.event.replace({'onset':'1', 'wakeup':'2'}).astype(np.uint8)
# step
train_event.step = train_event.step.astype(np.uint32)
# timestamp
train_event.timestamp = cudf.to_datetime(train_event.timestamp, format='%Y-%m-%d %H:%M:%S')

In [6]:
train_map_path = os.path.join(CFG.input_dir, "comp_train_id_map.parquet")
train_event_path = os.path.join(CFG.input_dir, "comp_train_event.parquet")

train_id_map.to_parquet(train_map_path, index=False)
train_event.to_parquet(train_event_path, index=False)

# Series memory reduction

In [7]:
series = cudf.read_parquet(CFG.train_series)

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /opt/conda/include/rmm/mr/device/cuda_memory_resource.hpp

In [None]:
series = series.merge(right=train_id_map, on="series_id")\
            .drop(columns="series_id")\
            .reset_index(drop=True)

TypeError: Can only merge Series or DataFrame objects, a <class 'cudf.core.dataframe.DataFrame'> was passed

In [None]:
series.step = series.step.astype(np.uint32)

## datetime encodeing

In [None]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Local Time converter
def to_date_time(x):
    import pandas as pd
    return pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S') # utc=True

def to_localize(t):
    import pandas as pd
    return t.tz_localize(None)

series["timestamp"] = series.timestamp.parallel_apply(to_date_time).parallel_apply(to_localize)

In [None]:
series_path = os.path.join(CFG.input_dir, "comp_train_series.parquet")
series.to_parquet(series_path, index=False)