# Installing and Importing and Initializing Libraries.

In [None]:
!pip install pandarallel

In [2]:
import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)
#from pandarallel import pandarallel
import plotly.express as px
pd.set_option('display.max_row', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from tqdm import tqdm
tqdm.pandas()
#pandarallel.initialize(progress_bar=True)

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


# Reading the Datasets.

In [3]:
df_series=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')

In [5]:
df_events=pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
events=df_events[['series_id', 'step','event']]

# Merging the Datasets to train the model

In [8]:
series_df=pd.merge(df_series,events,on=["step","series_id"],how='left')
series_df['sleep']=np.nan
series_df.loc[series_df["event"]=="onset", "sleep"] = 1
series_df.loc[series_df["event"]=="wakeup", "sleep"] = 0
series_df['sleep'].fillna(method='ffill', inplace=True)
series_df['sleep'].fillna(value=0, inplace=True)
#df_series=[]

In [9]:
series_df

Unnamed: 0,series_id,step,timestamp,anglez,enmo,event,sleep
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.636700,0.0217,,0.0
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.636800,0.0215,,0.0
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637000,0.0216,,0.0
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.636800,0.0213,,0.0
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.636800,0.0215,,0.0
...,...,...,...,...,...,...,...
127946335,fe90110788d2,592375,2017-09-08T00:14:35-0400,-27.277500,0.0204,,0.0
127946336,fe90110788d2,592376,2017-09-08T00:14:40-0400,-27.032499,0.0233,,0.0
127946337,fe90110788d2,592377,2017-09-08T00:14:45-0400,-26.841200,0.0202,,0.0
127946338,fe90110788d2,592378,2017-09-08T00:14:50-0400,-26.723900,0.0199,,0.0


# Windowing the training data
### As the target is to detect an event, 
### So, extracting timesteps 30 min before and after the events. 

In [20]:
def window(df, win_size):
    ind=df.index[df['event'].isna()==False]
    c=0
    df['window']=np.nan
    for i in tqdm(ind):
        a=i-win_size
        b=i+win_size
        df['window'].loc[a:i]=int(c)
        c=c+1
        df['window'].loc[i:b]=int(c)
        c=c+1
    df['window'].dropna(inplace=True)
    return df[df['window'].isna()==False]
df_series=window(series_df,360)
#df_series=df_series[df_series['window'].isna()==False]

100%|██████████| 9585/9585 [00:10<00:00, 955.63it/s] 


In [24]:
df_series

Unnamed: 0,series_id,step,timestamp,anglez,enmo,event,sleep,window,cluster
4632,038441c925bb,4632,2018-08-14T21:56:00-0400,21.841999,0.0002,,0.0,0.0,0
4633,038441c925bb,4633,2018-08-14T21:56:05-0400,28.535900,0.0019,,0.0,0.0,0
4634,038441c925bb,4634,2018-08-14T21:56:10-0400,34.446499,0.0011,,0.0,0.0,0
4635,038441c925bb,4635,2018-08-14T21:56:15-0400,37.184200,0.0003,,0.0,0.0,0
4636,038441c925bb,4636,2018-08-14T21:56:20-0400,35.196499,0.0015,,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...
127935920,fe90110788d2,581960,2017-09-07T09:46:40-0400,-55.010601,0.0834,,0.0,19169.0,1
127935921,fe90110788d2,581961,2017-09-07T09:46:45-0400,-22.163601,0.1773,,0.0,19169.0,3
127935922,fe90110788d2,581962,2017-09-07T09:46:50-0400,7.853500,0.1868,,0.0,19169.0,3
127935923,fe90110788d2,581963,2017-09-07T09:46:55-0400,34.740799,0.0798,,0.0,19169.0,0


# Clustering the Enmo and Anglez

In [29]:
def clustering(df):
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler    
    X=df[['anglez','enmo']]
#Scalling the data
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(X)
#perform clustering
    model=KMeans(n_clusters=4 )
    model.fit(X_scaled)
    return model.labels_
df_series['cluster']=(clustering(df_series)+1)/4

# Adding Rolling Standard Deviations

- As per observation enmo and anglez varies frequently and more than normal for few timesteps before and after the event.
- In order to consider the variations into the modelling, using the method.

In [25]:
def rollingstd(df):
    df['sd_anglez_1']=pd.to_numeric(df['anglez'].rolling(window=12).std())
    df['sd_enmo_1']=pd.to_numeric(df['enmo'].rolling(window=12).std())
    df['sd_anglez_1'].fillna('0.0',inplace=True)
    df['sd_enmo_1'].fillna('0.0',inplace=True)
    df['sleep'] = df['sleep'].replace({0:False, 0.2:True})
    return(df)
series_df=rollingstd(df_series)

In [30]:
series_df

Unnamed: 0,series_id,step,timestamp,anglez,enmo,event,sleep,window,cluster,sd_anglez_1,sd_enmo_1
4632,038441c925bb,4632,2018-08-14T21:56:00-0400,21.841999,0.0002,,False,0.0,0.25,0.0,0.0
4633,038441c925bb,4633,2018-08-14T21:56:05-0400,28.535900,0.0019,,False,0.0,0.25,0.0,0.0
4634,038441c925bb,4634,2018-08-14T21:56:10-0400,34.446499,0.0011,,False,0.0,0.25,0.0,0.0
4635,038441c925bb,4635,2018-08-14T21:56:15-0400,37.184200,0.0003,,False,0.0,0.25,0.0,0.0
4636,038441c925bb,4636,2018-08-14T21:56:20-0400,35.196499,0.0015,,False,0.0,0.25,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
127935920,fe90110788d2,581960,2017-09-07T09:46:40-0400,-55.010601,0.0834,,False,19169.0,1.00,0.681463,0.023997
127935921,fe90110788d2,581961,2017-09-07T09:46:45-0400,-22.163601,0.1773,,False,19169.0,0.50,8.895862,0.05441
127935922,fe90110788d2,581962,2017-09-07T09:46:50-0400,7.853500,0.1868,,False,19169.0,0.50,18.938735,0.071546
127935923,fe90110788d2,581963,2017-09-07T09:46:55-0400,34.740799,0.0798,,False,19169.0,0.25,29.633575,0.071438


In [33]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
X=series_df[['sd_anglez_1','sd_enmo_1','anglez','enmo','cluster']]
y=series_df[['sleep']]
scaler=preprocessing.StandardScaler().fit(X[['anglez','sd_anglez_1']])
X[['anglez','sd_anglez_1']] = scaler.transform(X[['anglez','sd_anglez_1']])
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2, random_state=42)

In [34]:
model_nn=tf.keras.Sequential([
  tf.keras.layers.Dense(19170,input_shape=[len(X_train.keys())]),
  tf.keras.layers.Dense(1900,activation=tf.nn.softmax, use_bias=False),
  tf.keras.layers.Dense(190,activation=tf.nn.softmax, use_bias=False),
  tf.keras.layers.Dense(19,activation=tf.nn.relu, use_bias=False),
  tf.keras.layers.Dense(2,activation=tf.nn.relu, use_bias=False),
  tf.keras.layers.Dense(1, activation=tf.nn.softmax)
])
#model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001) , loss='mse', metrics=['mae','mse'])
model_nn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001) , loss='mse', metrics=['mae','mse','accuracy'])

model_nn.summary()

Unnamed: 0,sd_anglez_1,sd_enmo_1,anglez,enmo,cluster
4632,-0.700387,0.0,0.725219,0.0002,0.25
4633,-0.700387,0.0,0.879089,0.0019,0.25
4634,-0.700387,0.0,1.014954,0.0011,0.25
4635,-0.700387,0.0,1.077885,0.0003,0.25
4636,-0.700387,0.0,1.032194,0.0015,0.25
...,...,...,...,...,...
127935920,-0.644502,0.023997,-1.041364,0.0834,1.00
127935921,0.029140,0.05441,-0.286322,0.1773,0.50
127935922,0.852731,0.071546,0.403670,0.1868,0.50
127935923,1.729788,0.071438,1.021719,0.0798,0.25
