# Libraries

In [34]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings('ignore')

In [35]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.dates as mdates
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

In [36]:
from scipy.stats import entropy
from collections import Counter
import polars as pl

In [37]:
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count

In [38]:
from tqdm.auto import tqdm 

# Reading train_series parquet

In [39]:
%%time
# inspired by https://www.kaggle.com/code/enricomanosperti/detect-sleep-states-first-preprocessing-and-eda
import polars as pl
train_series = (pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: user 1min 38s, sys: 53.9 s, total: 2min 32s
Wall time: 1min 21s


# Reading train_events - use to get the labels

In [40]:
%%time
train_events = (pl.scan_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: user 26.4 ms, sys: 67.7 ms, total: 94.1 ms
Wall time: 97.3 ms


# Reading test_series - use to get predictions and upload solution

In [41]:
%%time
test_series = (pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: user 5.83 ms, sys: 3.88 ms, total: 9.72 ms
Wall time: 21.3 ms


In [42]:
%%time
# inspired by https://www.kaggle.com/code/renatoreggiani/reduce-memory-usage-zzzs-cmi
# with tweaks determined by the selected polars loading strategy
# tweaks inspired by https://github.com/softhints/Pandas-Tutorials/blob/master/column/3.check-dtype-column-columns-pandas-dataframe.ipynb
from pandas.api.types import is_datetime64_ns_dtype
def reduce_mem_usage(df):
    """ iterate through all numeric columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')
    
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object and not is_datetime64_ns_dtype(df[col]):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float16)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    decrease = 100 * (start_mem - end_mem) / start_mem
    print(f'Decreased by {decrease:.2f}%')
    
    return df

train_series = reduce_mem_usage(train_series)

Memory usage of dataframe is 5368.84 MB
Memory usage after optimization is: 3904.61 MB
Decreased by 27.27%
CPU times: user 5.41 s, sys: 1.79 s, total: 7.2 s
Wall time: 6.77 s


In [43]:
%%time
test_series = reduce_mem_usage(test_series)

Memory usage of dataframe is 0.02 MB
Memory usage after optimization is: 0.01 MB
Decreased by 31.61%
CPU times: user 10.3 ms, sys: 69 µs, total: 10.4 ms
Wall time: 9.86 ms


# remove the nights of which we don't have any event reported.
# also print a new train_series parquet and a new train_events parquet that excludes those nights


In [44]:
train_series.groupby(['series_id'])['step'].size()

series_id
038441c925bb    389880
03d92c9f6f8a    724140
0402a003dae9    397260
04f547b8017d    637560
05e1944c3818    400860
                 ...  
fa149c3c4bde    406800
fb223ed2278c    918360
fbf33b1a2c10    421020
fcca183903b7    620640
fe90110788d2    592380
Name: step, Length: 277, dtype: int64

In [45]:
train_events.groupby(['series_id'])['step'].size()

series_id
038441c925bb     46
03d92c9f6f8a     74
0402a003dae9     48
04f547b8017d     74
05e1944c3818     16
               ... 
fa149c3c4bde     48
fb223ed2278c    106
fbf33b1a2c10     48
fcca183903b7     72
fe90110788d2     70
Name: step, Length: 277, dtype: int64

# Function to merge train series with train events to get the label and remove those nights without data

In [46]:
def get_series_data(series_id):
    
    df=train_series[(train_series['series_id']==series_id)][['series_id', 'step', 'timestamp', 'anglez', 'enmo', 'year', 'month',
       'day', 'hour']]
    df['date_time'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    #df = df.drop('timestamp', axis=1)
    
    ev=train_events[(train_events['series_id']==series_id)][['series_id','night','event','step','timestamp']]
    ev = ev[ev['step'].notna()]
    ev['date_time'] = pd.to_datetime(ev['timestamp']).apply(lambda t: t.tz_localize(None))
    ev = ev.drop('timestamp', axis=1)
    
    df2 = pd.merge(df, ev, on='date_time', how='outer')
    
    del df
    del ev
    
    df2['step']=df2['step_x']
    df2['series_id']=df2['series_id_x']
    df2 = df2.drop(['series_id_x','series_id_y','step_x','step_y'], axis=1)
    df2['event_raw'] = df2['event']
    df2['event'] = df2['event'].fillna(0)

    # fill values for night feature
    df2['night'][0] = 1
    df2['night'] = df2['night'].interpolate()
    df2['night'] = round(df2['night'])
    # map numeric values to event feature
    df2.loc[df2['event']=='onset','event'] = -1
    df2.loc[df2['event']=='wakeup','event'] = 1
    
    df2['year_month_day']=df2['year'].astype(str) + df2['month'].astype(str).str.zfill(2)+ df2['day'].astype(str).str.zfill(2)

    df3=df2.groupby('night').year_month_day.nunique().reset_index()
    #df3=df3[(df3['year_month_day']<3)]
   
    df4 = pd.merge(df2, df3, on='night', how='inner')
    
    del df2
    del df3
    
    df4 = df4.drop(['year_month_day_x','year_month_day_y'], axis=1)
    
    df5=df4.groupby(['night'])['event'].nunique().reset_index()
    df5=df5[(df5['event']==3)]
    df6 = pd.merge(df4, df5[['night']], on='night', how='inner')
    
    del df4
    del df5
    
    df6['day'] = df6['day'].apply(str)
    df6['month'] = df6['month'].apply(str)
    df6['year_month_day']=df6['year'].astype(str) + df6['month'].astype(str)+ df6['day'].astype(str)
    
    if len(df6)>0:
        df7=df6.groupby(['year','month','day'])['event'].nunique().reset_index()
        df7['year_month_day']=df7['year'].astype(str) + df7['month'].astype(str)+ df7['day'].astype(str)
        df7=df7[(df7['event']>1)]
        df8 = pd.merge(df6, df7[['year_month_day']], on='year_month_day', how='inner')

        df8['event_2']=df8['event_raw']
        df8['event_2'].fillna('Other', inplace=True)

        df8['event']=df8['event_raw']
        df8['event'].fillna(method='ffill', inplace=True)
        df8['event'].fillna('wakeup', inplace=True)

        df8=df8[['series_id', 'step', 'timestamp', 'anglez', 'enmo','hour','event','event_2','night']]
    else: 
        df8=df6
        
    del df6
    #del df7
    
    return df8

# Generate new train series, and new train events

In [47]:
len(train_series)

127946340

In [48]:
train_series.columns

Index(['series_id', 'step', 'timestamp', 'anglez', 'enmo', 'year', 'month',
       'day', 'hour'],
      dtype='object')

In [49]:
series_id = train_series.series_id.unique()
len(series_id)
#series_id

277

In [50]:
series_id2=['038441c925bb','03d92c9f6f8a']
series_id2

['038441c925bb', '03d92c9f6f8a']

In [51]:
%%time

import joblib

print("Number of jobs: ",int(cpu_count()))

train_series_new = []

#for idx in tqdm(series_id2):
for idx in tqdm(series_id): 

    test = get_series_data(idx)
    if len(test)>0:
        train_series_new.append(test)
        
    gc.collect()
    
#train_series_new

Number of jobs:  4


  0%|          | 0/277 [00:00<?, ?it/s]

CPU times: user 2h 28min 41s, sys: 1min 12s, total: 2h 29min 54s
Wall time: 2h 28min 16s


In [52]:
joblib.dump((train_series_new), 'train_series_10112023.pkl')
len(train_series_new)

269

In [53]:
train_series_new_df = pd.concat(train_series_new, ignore_index=True).reset_index(names='row_id')

train_series_new_df.astype('float32', errors='ignore').to_parquet('train_series_10112023.parquet',
                                                                  use_deprecated_int96_timestamps=True)

len(train_series_new_df)

84936878