# Preprocess for training

In [1]:
import os

import numpy as np
import pandas as pd
from pandarallel import pandarallel
import cudf
import matplotlib.pyplot as plt

import gc
import torch

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [3]:
class CFG:
    is_debug = False
    input_dir = os.path.join("/kaggle", "input")
    output_dir = os.path.join("/kaggle", "working")
    competition_dir = os.path.join(input_dir, "child-mind-institute-detect-sleep-states")
    train_series = os.path.join(competition_dir, "train_series.parquet")
    train_event = os.path.join(competition_dir, "train_events.csv")

    train_series_non_null = os.path.join(input_dir, "processed_train_nonull.parquet")

    test_series = os.path.join(competition_dir, "test_series.parquet")
    sample_sub = os.path.join(competition_dir, "sample_submission.csv")

In [4]:
train_series = pd.read_parquet(CFG.train_series_non_null)
len(train_series)
original_data_num = len(train_series)

if CFG.is_debug:
    train_series = train_series[:50000]

In [5]:
train_series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,event,event_onset,event_wakeup
0,08db4255286f,0,2018-11-05T10:00:00-0400,-30.845301,0.0447,0.0,0,0
1,08db4255286f,1,2018-11-05T10:00:05-0400,-34.181801,0.0443,0.0,0,0
2,08db4255286f,2,2018-11-05T10:00:10-0400,-33.877102,0.0483,0.0,0,0
3,08db4255286f,3,2018-11-05T10:00:15-0400,-34.282101,0.068,0.0,0,0
4,08db4255286f,4,2018-11-05T10:00:20-0400,-34.385799,0.0768,0.0,0,0


In [6]:
train_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14329117 entries, 0 to 14329116
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   series_id     object 
 1   step          uint32 
 2   timestamp     object 
 3   anglez        float32
 4   enmo          float32
 5   event         float64
 6   event_onset   int64  
 7   event_wakeup  int64  
dtypes: float32(2), float64(1), int64(2), object(2), uint32(1)
memory usage: 710.6+ MB


In [7]:
pandarallel.initialize(progress_bar=True)
# Local Time converter
def to_date_time(x):
    return pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S') # utc=True

def to_localize(t):
    return t.tz_localize(None)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
train_series["timestamp"] = train_series.timestamp.parallel_apply(to_date_time).parallel_apply(to_localize)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1194094), Label(value='0 / 1194094…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1194094), Label(value='0 / 1194094…

In [9]:
# dfのtimestampから日付を取得
train_series["date"] = train_series["timestamp"].dt.date

In [10]:
# dfのtimestampから時刻を取得
train_series["time"] = train_series["timestamp"].dt.time

In [11]:
train_series["series_date_key"] = train_series["series_id"].astype(str) + "_" + train_series["date"].astype(str)

In [12]:
# train_series["series_date_key"].value_counts()

08db4255286f_2018-11-06    17280
08db4255286f_2018-11-07    17280
08db4255286f_2018-11-05    10080
08db4255286f_2018-11-08     5360
Name: data_key, dtype: int64

![Alt text](image.png)

![Alt text](image-1.png)

In [13]:
train_series.columns

Index(['series_id', 'step', 'timestamp', 'anglez', 'enmo', 'event',
       'event_onset', 'event_wakeup', 'date', 'time', 'series_date_key'],
      dtype='object')

In [14]:
save_series = train_series[['series_id', 'series_date_key', 'timestamp', 'date', 'time', 'step','anglez', 'enmo', 'event', "event_onset", "event_wakeup"]]

In [15]:
save_series.to_parquet(os.path.join(CFG.input_dir, "processed_train_withkey_nonull.parquet"))

In [16]:
len(save_series)

14329117

In [17]:
save_series

Unnamed: 0,series_id,series_date_key,timestamp,date,time,step,anglez,enmo,event,event_onset,event_wakeup
0,08db4255286f,08db4255286f_2018-11-05,2018-11-05 10:00:00,2018-11-05,10:00:00,0,-30.845301,0.0447,0.0,0,0
1,08db4255286f,08db4255286f_2018-11-05,2018-11-05 10:00:05,2018-11-05,10:00:05,1,-34.181801,0.0443,0.0,0,0
2,08db4255286f,08db4255286f_2018-11-05,2018-11-05 10:00:10,2018-11-05,10:00:10,2,-33.877102,0.0483,0.0,0,0
3,08db4255286f,08db4255286f_2018-11-05,2018-11-05 10:00:15,2018-11-05,10:00:15,3,-34.282101,0.0680,0.0,0,0
4,08db4255286f,08db4255286f_2018-11-05,2018-11-05 10:00:20,2018-11-05,10:00:20,4,-34.385799,0.0768,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
14329112,d5e47b94477e,d5e47b94477e_2017-12-02,2017-12-02 07:47:40,2017-12-02,07:47:40,390092,23.911301,0.0000,1.0,0,0
14329113,d5e47b94477e,d5e47b94477e_2017-12-02,2017-12-02 07:47:45,2017-12-02,07:47:45,390093,23.911301,0.0000,1.0,0,0
14329114,d5e47b94477e,d5e47b94477e_2017-12-02,2017-12-02 07:47:50,2017-12-02,07:47:50,390094,23.880699,0.0000,1.0,0,0
14329115,d5e47b94477e,d5e47b94477e_2017-12-02,2017-12-02 07:47:55,2017-12-02,07:47:55,390095,23.940100,0.0000,1.0,0,0


In [18]:
datakey_unique = pd.DataFrame({
    "series_date_key": save_series["series_date_key"].unique()
})
display(datakey_unique)

Unnamed: 0,series_date_key
0,08db4255286f_2018-11-05
1,08db4255286f_2018-11-06
2,08db4255286f_2018-11-07
3,08db4255286f_2018-11-08
4,08db4255286f_2018-11-09
...,...
866,d5e47b94477e_2017-11-28
867,d5e47b94477e_2017-11-29
868,d5e47b94477e_2017-11-30
869,d5e47b94477e_2017-12-01


In [19]:
datakey_unique["series_id"], datakey_unique["date"] = datakey_unique["series_date_key"].str.split("_", 1).str

  datakey_unique["series_id"], datakey_unique["date"] = datakey_unique["series_date_key"].str.split("_", 1).str
  datakey_unique["series_id"], datakey_unique["date"] = datakey_unique["series_date_key"].str.split("_", 1).str


In [20]:
datakey_unique

Unnamed: 0,series_date_key,series_id,date
0,08db4255286f_2018-11-05,08db4255286f,2018-11-05
1,08db4255286f_2018-11-06,08db4255286f,2018-11-06
2,08db4255286f_2018-11-07,08db4255286f,2018-11-07
3,08db4255286f_2018-11-08,08db4255286f,2018-11-08
4,08db4255286f_2018-11-09,08db4255286f,2018-11-09
...,...,...,...
866,d5e47b94477e_2017-11-28,d5e47b94477e,2017-11-28
867,d5e47b94477e_2017-11-29,d5e47b94477e,2017-11-29
868,d5e47b94477e_2017-11-30,d5e47b94477e,2017-11-30
869,d5e47b94477e_2017-12-01,d5e47b94477e,2017-12-01


In [21]:
len(datakey_unique)

871

In [22]:
pd.DataFrame(datakey_unique).to_csv(os.path.join(CFG.input_dir, "datakey_unique_nonull.csv"), index=False)