# Installing and Importing and Initializing Libraries.

In [None]:
!pip install pandarallel

In [1]:
import plotly.express as px
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [2]:
import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)
#from pandarallel import pandarallel

pd.set_option('display.max_row', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#pandarallel.initialize(progress_bar=True)

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
df=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')

# Windowing the training data
### As the target is to detect an event, 
### So, extracting timesteps 30 min before and after the events. 

In [3]:
def window(df, win_size):
    ind=df.index[df['event'].isna()==False]
    c=0
    df['window']=np.nan
    for i in tqdm(ind):
        a=i-win_size
        b=i+win_size
        df['window'].loc[a:i]=int(c)
        c=c+1
        df['window'].loc[i:b]=int(c)
        c=c+1
    df['window'].dropna(inplace=True)
    return df[df['window'].isna()==False]

#df_series=df_series[df_series['window'].isna()==False]

# Removing the steps where change in enmo and anglez is insignificant (device is supposed not to be worn)

In [36]:
def inactive_periods(df):
    print("shape before application: ",df.shape)
    df['diff_anglez']=df['anglez'].diff()
    df=df[(df['enmo']!=0.0) | (df['diff_anglez']!=0.0)]
    print("shape after application: ",df.shape)
    df.drop('diff_anglez', inplace=True, axis=1)
    print("shape after completion: ",df.shape)
    print("removed ")
    return df


# Clustering the Enmo and Anglez

In [6]:
def clustering(df):
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler    
    X=df[['anglez','enmo']]
#Scalling the data
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(X)
#perform clustering
    model=KMeans(n_clusters=4,algorithm="elkan" )
    model.fit(X_scaled)
    return model.labels_


# Adding Rolling Standard Deviations

- As per observation enmo and anglez varies frequently and more than normal for few timesteps before and after the event.
- In order to consider the variations into the modelling, using the method.

In [26]:
def rollingstd(df):
# Creating columns with nans
    df['sd_enmo_1']=np.nan    # 1 min rolling std: enmo
    df['sd_anglez_1']=np.nan  # 1 min rolling std: anglez
    df['m_enmo_2']=np.nan     # 2 min rolling mean: enmo
    df['m_anglez_2']=np.nan   # 2 min rolling std: anglez 

#Series wise rolling std and mean
    
    for series_id in tqdm(df['series_id'].unique()):
# STD of enmo
        df['sd_enmo_1'][df['series_id']==series_id]=df['enmo'][df['series_id']==series_id].rolling(window=12).std()
        df['sd_enmo_1'][(df['series_id']==series_id) & (df['sd_enmo_1'].isna()==True)]=df['enmo'][df['series_id']==series_id].rolling(window=2).std()
        
# STD of Anglez
        df['sd_anglez_1'][df['series_id']==series_id]=df['anglez'][df['series_id']==series_id].rolling(window=12).std()
        df['sd_anglez_1'][(df['series_id']==series_id) & (df['sd_anglez_1'].isna()==True)]=df['anglez'][df['series_id']==series_id].rolling(window=2).std()
# Mean of enmo
        df['m_enmo_2'][df['series_id']==series_id]=df['enmo'][df['series_id']==series_id].rolling(window=24).mean()
        df['m_enmo_2'][(df['series_id']==series_id) & (df['sd_enmo_1'].isna()==True)]=df['enmo'][df['series_id']==series_id].rolling(window=2).mean()
        
# Mean of Anglez
        df['m_anglez_2'][df['series_id']==series_id]=df['anglez'][df['series_id']==series_id].rolling(window=24).mean()
        df['m_anglez_2'][(df['series_id']==series_id) & (df['sd_anglez_1'].isna()==True)]=df['anglez'][df['series_id']==series_id].rolling(window=2).mean()
        
# filling rest of nans
    print('Nans in sd_emno_1: ',series_df['sd_enmo_1'].isnull().sum())
    df['sd_enmo_1'].fillna(0.0, inplace=True)
    df['sd_anglez_1'].fillna(0.0, inplace=True)
    df['m_enmo_2'].fillna(0.0, inplace=True)
    df['m_anglez_2'].fillna(0.0, inplace=True)
    print('Nans after removal: ',series_df['sd_enmo_1'].isnull().sum())

    return(df)


# Scaling the data

In [8]:
def scale(X):
    from sklearn import preprocessing
    scaler=preprocessing.StandardScaler().fit(X)
    return (scaler.transform(X))


# Creating Training Data

In [9]:
# Importing the datasets
print('Importing Training Datasets')
df_series=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')
df_events=pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
print('Dataset Imported...')
print('______________________________________')

# Merging the datasets
print('Merging the training datasets...')
events=df_events[['series_id', 'step','event']]
series_df=pd.merge(df_series,events,on=["step","series_id"],how='left')
df_series=[]
series_df['sleep']=np.nan
series_df.loc[series_df["event"]=="onset", "sleep"] = 1
series_df.loc[series_df["event"]=="wakeup", "sleep"] = 0
series_df['sleep'].fillna(method='ffill', inplace=True)
series_df['sleep'].fillna(value=0, inplace=True)
print('Datasets Merged...')
print('______________________________________')

# Removing the periods of inactivity
print('Removing the periods of Inactivity...')
series_df=inactive_periods(series_df)
print('______________________________________')

# Forming Windows
win_size=720  #600mins
print('Creating Windows each size: ',win_size)
series_df=window(series_df,win_size)
print('Windows formed...')
print('______________________________________')

# Adding the columns of Standard Deviation (1 min)
print('Adding columns to account for deviation in enmo and anglez 1 min rolling...')
series_df=rollingstd(series_df)
series_df['sd_anglez_1']=pd.to_numeric(series_df['sd_anglez_1'])
series_df['sd_enmo_1']=pd.to_numeric(series_df['sd_enmo_1'])
series_df['m_anglez_2']=pd.to_numeric(series_df['m_anglez_2'])
series_df['m_enmo_2']=pd.to_numeric(series_df['m_enmo_2'])
print('Std columns added...')
print('______________________________________')

# Clustering the Data
print('Clustering the data based on enmo and anglez...')
series_df['cluster']=(clustering(series_df)+1)/4
print('Added clusters...')





Importing Training Datasets
Dataset Imported...
______________________________________
Merging the training datasets...
Datasets Merged...
______________________________________
Removing the periods of Inactivity...
shape before application:  (127946340, 7)
shape after application:  (111766109, 8)
shape after completion:  (111766109, 7)
removed 
______________________________________
Creating Windows each size:  720


100%|██████████| 6928/6928 [00:04<00:00, 1464.93it/s]


Windows formed...
______________________________________
Adding columns to account for deviation in enmo and anglez 1 min rolling...


100%|██████████| 269/269 [38:39<00:00,  8.62s/it]


Nans in sd_emno_1:  269
Nans after removal:  0
Std columns added...
______________________________________
Clustering the data based on enmo and anglez...
Added clusters...


In [None]:
series_df.dtypes

In [11]:
figure= px.imshow(series_df[['sd_anglez_1','sd_enmo_1','m_anglez_2','m_enmo_2','anglez','enmo','cluster','sleep']].corr(),text_auto=True, width=1200, height=1200)
figure.show()

# Creating Train Test Data from the Training Data

In [10]:
from sklearn.model_selection import train_test_split
X=series_df[['sd_anglez_1','sd_enmo_1','anglez','m_anglez_2','m_enmo_2','enmo','cluster']]
y=series_df[['sleep']]
X_scaled=scale(X)
X_train, X_test, y_train, y_test =train_test_split(X_scaled,y,test_size=0.2, random_state=42)

In [14]:
y_train[['sleep']].value_counts()

sleep
0.0      3843254
1.0      3012876
Name: count, dtype: int64

# Function to evaluate the training (scikit-Learn)

In [11]:
def evaluate(y_test,ypred):
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    print("Accuracy: ",accuracy_score(y_test,y_pred)) 
    print("Precision Score : ", precision_score(y_test,y_pred)) #precision measures the proportion of true positive predictions among all positive instances. how many of survived predicted actually survived, doesn't verifies 0's 70 survived as preicted whereas actually 92 survived so 70/92 will be the precision.  if we predicted 70 survived, so presion will tell how many of those 70 predicted survived matches the actual row by row data. It checkes all positives and verifies if the answer is true for each row?
    print("Recall Score: ", recall_score(y_test,y_pred, average='macro')) #Recall measures the proportion of true positive predictions among all actual positive instalnces. If we predicted 100 survived correctly whereas actually 100 survived out of which 67 predicted correctly so recall will be 0.67
    print("F1 Score: ",f1_score(y_test,y_pred)) #mean of recall and precision
    cm = confusion_matrix(y_test, y_pred)
    figure= px.imshow(cm,text_auto=True, width=1200, height=1200)
    figure.show()


In [20]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', 
              max_iter =100, 
              alpha=10, 
              hidden_layer_sizes=5, 
              random_state=5,
              activation='relu',
              batch_size=360, 
              learning_rate='adaptive', 
              verbose=1,
              early_stopping=1, 
              n_iter_no_change=10)

print ('Training the model')
clf.fit(X_train,y_train)
print(clf.score(X_train,y_train))
print ('Saving the model')
from joblib import dump, load
dump(clf, 'mlp_model.joblib')
y_pred=clf.predict(X_test)
evaluate(y_test,y_pred)

Training the model
Iteration 1, loss = 0.51608730
Validation score: 0.814309
Iteration 2, loss = 0.50522723
Validation score: 0.814780
Iteration 3, loss = 0.50522715
Validation score: 0.817323
Iteration 4, loss = 0.50522257
Validation score: 0.814906
Iteration 5, loss = 0.50522679
Validation score: 0.815136
Iteration 6, loss = 0.50522056
Validation score: 0.814742
Iteration 7, loss = 0.50521858
Validation score: 0.814088
Iteration 8, loss = 0.50522639
Validation score: 0.813784
Iteration 9, loss = 0.50522339
Validation score: 0.811958
Iteration 10, loss = 0.50522283
Validation score: 0.813705
Iteration 11, loss = 0.50522408
Validation score: 0.816604
Iteration 12, loss = 0.50521738
Validation score: 0.815572
Iteration 13, loss = 0.50521730
Validation score: 0.815085
Iteration 14, loss = 0.50522087
Validation score: 0.813333
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
0.8172689257642431
Saving the model
Accuracy:  0.8173308215186056
Preci

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf =RandomForestClassifier(n_jobs=-1,verbose=1) 
print ('Training the model')
rf.fit(X_train,y_train)
print ('Saving the model')
from joblib import dump, load
dump(rf, 'rf_model.joblib')
y_pred=rf.predict(X_test)
evaluate(y_test,y_pred)

Training the model


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 26.5min finished


Saving the model


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   42.2s finished


Accuracy:  0.9164543506455243
Precision Score :  0.9327879518994555
Recall Score:  0.9117764242893862
F1 Score:  0.9018673976357718


In [None]:
X_train.shape[1]
input_size=X_train.shape[1]*win_size


In [None]:
model_nn=tf.keras.Sequential([
  tf.keras.layers.Dense(360,input_shape=[X_train.shape[1]]),
  tf.keras.layers.Dense(360,activation=tf.nn.leaky_relu, use_bias=True),
  tf.keras.layers.Dense(180,activation=tf.nn.relu),
  tf.keras.layers.Dense(90,activation=tf.nn.relu, use_bias=True),
  tf.keras.layers.Dense(20,activation=tf.nn.leaky_relu),
  tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
#model_nn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001) , 
#                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
#                metrics=['accuracy'])



In [None]:
epochs = 100
batch_size = win_size

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        "best_model.x", save_best_only=True, monitor="val_loss"
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=epochs, min_lr=0.0001
    ),
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=epochs, verbose=1),
]
model_nn.compile(
    optimizer="adam",
    #loss="sparse_categorical_crossentropy",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['binary_accuracy'],
)
model_nn.summary()

In [None]:
y_train.dtypes

In [None]:
history = model_nn.fit(
    X_train,
    y_train,
    batch_size=360,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=0.2,
    verbose=1,
)

In [None]:
model = tf.keras.models.load_model("best_model.x")


In [None]:
metric = "binary_accuracy"
plt.figure()
plt.plot(history.history[metric])
plt.plot(history.history["val_" + metric])
plt.title("model " + metric)
plt.ylabel(metric, fontsize="large")
plt.xlabel("epoch", fontsize="large")
plt.legend(["train", "val"], loc="best")
plt.show()
plt.close()

In [None]:
model = tf.keras.models.load_model("best_model.x")

test_loss, test_acc = model.evaluate(X_test, y_test)

print("Test accuracy", test_acc)
print("Test loss", test_loss)

In [52]:
test.isnull().sum()

series_id      0
step           0
timestamp      0
anglez         0
enmo           0
sd_enmo_1      0
sd_anglez_1    0
m_enmo_2       0
m_anglez_2     0
cluster        0
dtype: int64

In [50]:
test=inactive_periods(pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet", engine='auto'))
#test_df=test
test_df=rollingstd(test)
test_df['cluster']=(clustering(test_df)+1)/4
test_df['sd_anglez_1']=pd.to_numeric(test_df['sd_anglez_1'])
test_df['sd_enmo_1']=pd.to_numeric(test_df['sd_anglez_1'])
X=test_df[['sd_anglez_1','sd_enmo_1','anglez','m_anglez_2','m_enmo_2','enmo','cluster']]
y=rf.predict(X)


shape before application:  (450, 5)
shape after application:  (322, 6)
shape after completion:  (322, 5)
removed 


100%|██████████| 3/3 [00:00<00:00, 29.69it/s]

Nans in sd_emno_1:  0
Nans after removal:  0



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [92]:
result_df=test_df[['series_id','step','timestamp']]

In [93]:
result_df[['sleep']]

Unnamed: 0,series_id,step,timestamp
0,038441c925bb,0,2018-08-14T15:30:00-0400
1,038441c925bb,1,2018-08-14T15:30:05-0400
2,038441c925bb,2,2018-08-14T15:30:10-0400
3,038441c925bb,3,2018-08-14T15:30:15-0400
4,038441c925bb,4,2018-08-14T15:30:20-0400
5,038441c925bb,5,2018-08-14T15:30:25-0400
6,038441c925bb,6,2018-08-14T15:30:30-0400
7,038441c925bb,7,2018-08-14T15:30:35-0400
8,038441c925bb,8,2018-08-14T15:30:40-0400
9,038441c925bb,9,2018-08-14T15:30:45-0400


In [100]:
print(result_df.shape)
print(y.shape)
result_df[['sleep']]=np.nan
result_df[['sleep']]=result_df[['sleep']].apply(lambda x: y)

(322, 4)
(322,)


In [95]:
result_df[['timestamp']]=result_df[['timestamp']].apply(lambda x: pd.to_datetime(x,utc=True))

In [108]:
result_df['series_id'].unique()

array(['038441c925bb', '03d92c9f6f8a', '0402a003dae9'], dtype=object)

In [116]:
print(result_df['sleep'][result_df['series_id']=='0402a003dae9'].sum())
print(result_df['sleep'][result_df['series_id']=='0402a003dae9'].value_counts())


58.0
sleep
0.0    92
1.0    58
Name: count, dtype: int64


In [87]:
result_df[['steps']].groupby(pd.Grouper(result_df[["timestamp"]], freq="30min"))

KeyError: "None of [Index(['steps'], dtype='object')] are in the [columns]"