# CMI-SleepState-Detection
## Child Mind Institute - Detect Sleep States
### Detect sleep onset and wake from wrist-worn accelerometer data
_______________________________________________________________________
# [Kaggle Competition](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/overview)
________________________________________________________________________
# Author Details:
### Name: Najeeb Haider Zaidi
### Email: zaidi.nh@gmail.com
### Profiles: [Github](https://github.com/snajeebz)  [LinkedIn](https://www.linkedin.com/in/najeebz) [Kaggle](https://www.kaggle.com/najeebz)
### License: Private, unlicensed. All files in this repository, under any branch, are prohibited from being used commercially, personally, communally or privately unless permitted by the author in writing.
### Copyrights 2023-2024 (c) are reserved only by the author: Najeeb Haider Zaidi
________________________________________________________________________
# Attributions:
## The dataset has been provided by the Child Mind Institute in the [Kaggle Competition](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/overview), in which the author is participating and is authorized to use the dataset solely for competition purposes.
________________________________________________________________________

### [Open in Kaggle](https://www.kaggle.com/najeebz/submission-preparation-notebook/)
________________________________________________________________________

# Installing and Importing and Initializing Libraries.

In [None]:
!pip install pandarallel

In [None]:
import plotly.express as px
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [None]:
import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)
#from pandarallel import pandarallel

pd.set_option('display.max_row', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#pandarallel.initialize(progress_bar=True)

In [None]:
result_df

In [None]:
# NOTE(review): the training-data cell below reads this same parquet file
# again into df_series, and `df` is rebound in the submission cells, so this
# load appears to be redundant - confirm before removing.
df=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')

# Windowing the training data
### Since the goal is to detect events,
### extract the timesteps 30 min before and after each event.

In [None]:
def window(df, win_size):
    """Keep only the rows that lie within ``win_size`` labels of an event.

    Every row whose ``event`` is not null gets two windows: the ``win_size``
    index labels leading up to it, and the ``win_size`` labels starting at
    the event. Windows are numbered consecutively in a new ``window`` column,
    which is added to ``df`` in place.

    Args:
        df: DataFrame with an ``event`` column. Index labels must support
            arithmetic and label slicing (e.g. a sorted integer index).
        win_size: Half-width of the window around each event, in index labels.

    Returns:
        The rows of ``df`` that fall inside at least one window.

    NOTE(review): windows are built from index labels only, so a window close
    to the start or end of a series can include rows of a neighbouring series.
    """
    event_labels = df.index[df['event'].notna()]
    df['window'] = np.nan
    window_id = 0
    for label in tqdm(event_labels):
        # Write through a single .loc call. The chained form
        # df['window'].loc[...] = ... assigns to an intermediate object and
        # is not guaranteed to reach df (it never does under copy-on-write).
        df.loc[label - win_size:label, 'window'] = window_id
        # The event row itself ends up in the "after" window.
        df.loc[label:label + win_size, 'window'] = window_id + 1
        window_id += 2
    return df[df['window'].notna()]

#df_series=df_series[df_series['window'].isna()==False]

# Removing the steps where the change in enmo and anglez is insignificant (the device is presumably not being worn)

In [None]:
def inactive_periods(df):
    """Drop the rows that show no movement at all.

    A row is removed when ``enmo`` is exactly 0 and ``anglez`` is identical to
    the previous row. When a ``series_id`` column is present the previous row
    is taken from the same series, so the first row of every series is always
    kept. The input frame is not modified.

    Args:
        df: DataFrame with ``anglez`` and ``enmo`` columns and, optionally,
            ``series_id``.

    Returns:
        The filtered rows of ``df`` (same columns as the input).
    """
    print("shape before application: ", df.shape)
    if 'series_id' in df.columns:
        # Difference within each series so that the last sample of one
        # recording is never compared with the first sample of the next.
        diff_anglez = df.groupby('series_id', sort=False)['anglez'].diff()
    else:
        diff_anglez = df['anglez'].diff()
    # NaN != 0.0 is True, so rows without a predecessor are kept.
    keep = (df['enmo'] != 0.0) | (diff_anglez != 0.0)
    filtered = df[keep]
    print("shape after application: ", filtered.shape)
    print("removed ", len(df) - len(filtered), " rows")
    return filtered


# Clustering the Enmo and Anglez

In [None]:
def clustering(df, n_clusters=4, random_state=None):
    """Cluster the rows of ``df`` on standardized ``anglez`` and ``enmo``.

    Args:
        df: DataFrame with ``anglez`` and ``enmo`` columns.
        n_clusters: Number of K-means clusters (default 4, as before).
        random_state: Seed forwarded to ``KMeans``; pass an int to make the
            labels reproducible. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        Array with one cluster label per row of ``df``.

    NOTE(review): K-means label numbers are arbitrary, so labels from
    separate calls (e.g. one on training data and one on test data) do not
    necessarily refer to the same cluster.
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    # Standardize first so both features contribute on the same scale.
    features = StandardScaler().fit_transform(df[['anglez', 'enmo']])
    model = KMeans(n_clusters=n_clusters, algorithm="elkan",
                   random_state=random_state)
    model.fit(features)
    return model.labels_


# Adding Rolling Standard Deviations

- As observed, enmo and anglez vary more often and more strongly than usual for a few timesteps before and after an event.
- Rolling statistics are added so that the model can take this variation into account.

In [None]:
 def rollingstd(series_df):
# Creating columns with nans
    series_df['sd_enmo_1']=np.nan    # 1 min rolling std: enmo
    series_df['sd_anglez_1']=np.nan  # 1 min rolling std: anglez
    series_df['m_enmo_2']=np.nan     # 2 min rolling mean: enmo
    series_df['m_anglez_2']=np.nan   # 2 min rolling std: anglez 
    print('anglez rolling std 12')
    series_df['sd_anglez_1'] = (series_df.groupby('series_id')['anglez']
                      .rolling(12)
                      .std()
                      .reset_index(level=0, drop=True))
    print('anglez rolling std 2')
    series_df['sd_anglez_1'][series_df['sd_anglez_1'].isna()==True] = (series_df.groupby('series_id')['anglez']
                      .rolling(2)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling std 12')
    series_df['sd_enmo_1'] = (series_df.groupby('series_id')['enmo']
                      .rolling(12)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling mean 24')
    series_df['m_enmo_2'] = (series_df.groupby('series_id')['enmo']
                      .rolling(24)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('anglez rolling mean 24')
    series_df['m_anglez_2'] = (series_df.groupby('series_id')['anglez']
                      .rolling(24)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('enmo rolling std 2')
    print('Nans in sd_emno_1: ',series_df['sd_enmo_1'].isnull().sum())
    series_df['sd_enmo_1'][series_df['sd_enmo_1'].isna()==True] = (series_df.groupby('series_id')['enmo']
                      .rolling(2)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling mean 2')
    series_df['m_enmo_2'][series_df['m_enmo_2'].isna()==True] = (series_df.groupby('series_id')['enmo']
                      .rolling(2)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('anglez rolling mean 2')
    series_df['m_anglez_2'][series_df['m_anglez_2'].isna()==True] = (series_df.groupby('series_id')['anglez']
                      .rolling(2)
                      .mean()
                      .reset_index(level=0, drop=True))
#Series wise rolling std and mean
# filling rest of nans
    print('Nans in sd_emno_1: ',series_df['sd_enmo_1'].isnull().sum())
    series_df['sd_enmo_1'].fillna(0.0, inplace=True)
    series_df['sd_anglez_1'].fillna(0.0, inplace=True)
    series_df['m_enmo_2'].fillna(0.0, inplace=True)
    series_df['m_anglez_2'].fillna(0.0, inplace=True)
    print('Nans after removal: ',series_df['sd_enmo_1'].isnull().sum())

    return(series_df)


# Scaling the data

In [None]:
def scale(X):
    """Standardize the columns of ``X`` with a scaler fitted on ``X`` itself.

    A new StandardScaler is fitted on every call and then discarded, so the
    statistics come only from the data passed in.
    """
    from sklearn.preprocessing import StandardScaler
    return StandardScaler().fit_transform(X)


# Creating Training Data

In [None]:
# Importing the datasets
print('Importing Training Datasets')
df_series=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')
df_events=pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
print('Dataset Imported...')
print('______________________________________')

# Merging the datasets: attach the event label (if any) to every step.
print('Merging the training datasets...')
events=df_events[['series_id', 'step','event']]
series_df=pd.merge(df_series,events,on=["step","series_id"],how='left')
# Rebind the raw frames so their memory can be reclaimed. The original code
# assigned to a misspelled name (df_event) and left df_events alive.
df_series=[]
df_events=[]
# Binary target: 1 from an onset until the next wakeup, 0 otherwise.
series_df['sleep']=np.nan
series_df.loc[series_df["event"]=="onset", "sleep"] = 1
series_df.loc[series_df["event"]=="wakeup", "sleep"] = 0
# Forward-fill inside each series only, so the state at the end of one
# recording does not leak into the start of the next; steps before the first
# event of a series count as awake.
series_df['sleep']=series_df.groupby('series_id')['sleep'].ffill().fillna(0)
print('Datasets Merged...')
print('______________________________________')

# Removing the periods of inactivity
print('Removing the periods of Inactivity...')
series_df=inactive_periods(series_df)
print('______________________________________')

# Forming Windows
win_size=720  #60mins (per the original comment; implies 5 s per step)
print('Creating Windows each size: ',win_size)
series_df=window(series_df,win_size)
print('Windows formed...')
print('______________________________________')

# Adding the rolling standard deviation and rolling mean columns
print('Adding columns to account for deviation in enmo and anglez 1 min rolling...')
series_df=rollingstd(series_df)
series_df['sd_anglez_1']=pd.to_numeric(series_df['sd_anglez_1'])
series_df['sd_enmo_1']=pd.to_numeric(series_df['sd_enmo_1'])
series_df['m_anglez_2']=pd.to_numeric(series_df['m_anglez_2'])
series_df['m_enmo_2']=pd.to_numeric(series_df['m_enmo_2'])
print('Std columns added...')
print('______________________________________')

# Clustering the Data; labels 0..3 are mapped to 0.25..1.0
print('Clustering the data based on enmo and anglez...')
series_df['cluster']=(clustering(series_df)+1)/4
print('Added clusters...')





In [None]:
series_df.dtypes

In [None]:
figure= px.imshow(series_df[['sd_anglez_1','sd_enmo_1','m_anglez_2','m_enmo_2','anglez','enmo','cluster','sleep']].corr(),text_auto=True, width=1200, height=1200)
figure.show()

# Creating Train Test Data from the Training Data

In [None]:
from sklearn.model_selection import train_test_split
# Feature columns fed to the models.
X=series_df[['sd_anglez_1','sd_enmo_1','anglez','m_anglez_2','m_enmo_2','enmo','cluster']]
# NOTE(review): despite their names, y_test / X_test hold the complete,
# unsplit data because the split below is commented out; X_train / y_train,
# which later cells use, are therefore never defined.
y_test=series_df[['sleep']]
X_test=scale(X)
#X_train, X_test, y_train, y_test =train_test_split(X_scaled,y,test_size=0.2, random_state=42)
del X


In [None]:
y_test[['sleep']].value_counts()

# Function to evaluate the training (scikit-Learn)

In [None]:
def evaluate(y_test,ypred):
    """Print binary classification metrics and plot the confusion matrix.

    Args:
        y_test: True labels.
        ypred: Predicted labels, in the same order as ``y_test``.
    """
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    # All metrics use the ypred argument. The original body read the global
    # y_pred instead, so the argument passed by the caller was ignored.
    print("Accuracy: ",accuracy_score(y_test,ypred))
    # Precision: share of predicted positives that are actually positive.
    print("Precision Score : ", precision_score(y_test,ypred))
    # Recall: share of actual positives that were predicted positive. Uses
    # the same (binary) averaging as precision and F1 so the three agree.
    print("Recall Score: ", recall_score(y_test,ypred))
    # F1: harmonic mean of precision and recall.
    print("F1 Score: ",f1_score(y_test,ypred))
    cm = confusion_matrix(y_test, ypred)
    figure= px.imshow(cm,text_auto=True, width=1200, height=1200)
    figure.show()


In [None]:
from sklearn.neural_network import MLPClassifier
# Small MLP with one hidden layer of 5 units and strong L2 regularization.
clf = MLPClassifier(solver='adam', 
              max_iter =100, 
              alpha=10, 
              hidden_layer_sizes=5, 
              random_state=5,
              activation='relu',
              batch_size=360, 
              learning_rate='adaptive', 
              verbose=1,
              # Must be a real bool: recent scikit-learn releases validate
              # parameter types and reject the integer 1 here.
              early_stopping=True, 
              n_iter_no_change=10)

# NOTE(review): X_train / y_train are only produced by the train_test_split
# call that is commented out in the feature cell above; restore that split
# (or substitute other data) before running this cell.
print ('Training the model')
clf.fit(X_train,y_train)
print(clf.score(X_train,y_train))
print ('Saving the model')
from joblib import dump, load
dump(clf, 'mlp_model.joblib')
y_pred=clf.predict(X_test)
evaluate(y_test,y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf =RandomForestClassifier(n_jobs=-1,verbose=1) 
print ('Training the model')
# NOTE(review): the forest is fitted on X_test / y_test and then scored on
# the same arrays below, so the reported metrics are training-set metrics.
rf.fit(X_test,y_test)
print ('Saving the model')
from joblib import dump, load
dump(rf, 'rf_model.joblib')
y_pred=rf.predict(X_test)
evaluate(y_test,y_pred)
# Rebinds X_test to an empty list; cells that run after this one no longer
# see the feature matrix.
X_test=[]

In [None]:
from joblib import dump, load
# Reload the random forest saved by the previous cell.
model=load('rf_model.joblib')
# NOTE(review): the previous cell ends with X_test=[], so when the cells run
# in order this predicts on an empty list - confirm the intended input.
y_pred=model.predict(X_test)
#evaluate(y_test,y_pred)
X_test=[]

In [None]:
# Copy the selected columns so the assignments below write to an independent
# frame rather than to a slice of series_df.
result_df=series_df[['series_id', 'step','timestamp']].copy()
result_df['sleep']=y_pred
# Parse the whole column in one vectorized call (same result as applying
# pd.to_datetime column by column).
result_df['timestamp']=pd.to_datetime(result_df['timestamp'],utc=True)
df=result_df.copy()
df.index=df['timestamp']
# Mean prediction per series and 30 minute bucket.
mean = df.groupby([df['series_id'], df.index.floor('30min')])['sleep'].mean()
mean=mean.reset_index()
# Shift every bucket label back by 30 minutes.
# NOTE(review): floor() already labels a bucket by its start, so this moves
# the label to the start of the previous bucket - confirm this is intended.
mean['timestamp']=mean['timestamp']- pd.to_timedelta('30min')
# Attach the bucket means to the rows whose timestamp equals a bucket label;
# that leaves at most one row per series and 30 minutes.
summary=pd.merge(result_df,mean,on=["timestamp","series_id"],how='left')
summary=summary[summary['sleep_y'].notna()].copy()
# Bucket mean of exactly 1 -> 'onset', exactly 0 -> 'wakeup', anything else
# -> no event. map() builds a string column directly instead of writing
# strings into a float NaN column.
summary['event']=summary['sleep_y'].map({1.0: 'onset', 0.0: 'wakeup'})
summary=summary[summary['event'].notna()]
# The bucket mean is reused as the score, so every 'wakeup' row has score 0.
submission=summary[['series_id','step','event','sleep_y']].rename(columns={'sleep_y': 'score'})
submission
#submission.to_csv('submission.csv')  # Saving the csv file

In [None]:
result_df

In [None]:
df=result_df.copy()
df.index=df['timestamp']
# Mean prediction per series and 30 minute bucket.
mean = df.groupby([df['series_id'], df.index.floor('30min')])['sleep'].mean()
mean=mean.reset_index()
mean['timestamp']=mean['timestamp']- pd.to_timedelta('30m')
# A single reset_index() on the grouped result yields only series_id,
# timestamp and sleep, so there is normally no 'index' column and the
# original drop raised KeyError; drop it only if it happens to exist.
mean=mean.drop(columns='index', errors='ignore')


In [None]:
df

In [None]:
# Attach the 30 minute bucket means to the rows whose timestamp equals a
# bucket label; that leaves at most one row per series and 30 minutes.
summary=pd.merge(result_df,mean,on=["timestamp","series_id"],how='left')
summary=summary[summary['sleep_y'].notna()].copy()
# Bucket mean of exactly 1 -> 'onset', exactly 0 -> 'wakeup', anything else
# -> no event. map() builds a string column directly instead of writing
# strings into a float NaN column.
summary['event']=summary['sleep_y'].map({1.0: 'onset', 0.0: 'wakeup'})
summary=summary[summary['event'].notna()]
submission=summary[['series_id','step','event','sleep_y']].rename(columns={'sleep_y': 'score'})
# Write a clean 0..n-1 index as the row_id column instead of the leftover
# merge index under an unnamed header.
# NOTE(review): row_id,series_id,step,event,score is the layout of the
# competition's sample submission - confirm against the competition page.
submission=submission.reset_index(drop=True)
submission.to_csv('submission.csv', index_label='row_id')

In [None]:
# NOTE(review): this cell repeats steps that earlier cells already applied to
# `mean`. Run after them, reset_index() adds the 'index' column that is
# dropped below and the timestamps are shifted back by a further 30 minutes -
# confirm whether this cell should still be executed.
mean=mean.reset_index()
mean['timestamp']=mean['timestamp']- pd.to_timedelta('30m')
mean=mean.drop(columns='index')
summary=pd.merge(result_df,mean,on=["timestamp","series_id"],how='left')
summary=summary[summary['sleep_y'].isna()==False]


In [None]:
mean

In [None]:
# NOTE(review): the cells from here down to the event filter repeat, one
# statement per cell, the merge / filter / event-labelling steps of the
# combined submission cell earlier in the notebook.
summary=pd.merge(result_df,mean,on=["timestamp","series_id"],how='left')

In [None]:
# Keep only the rows that matched a 30 minute bucket.
summary=summary[summary['sleep_y'].isna()==False]
summary

In [None]:
summary['event']=np.nan

In [None]:
# Bucket mean of exactly 1 -> 'onset', exactly 0 -> 'wakeup'.
summary.loc[summary["sleep_y"]==1, "event"] = 'onset'
summary.loc[summary["sleep_y"]==0, "event"] = 'wakeup'

In [None]:
# Drop the rows that received no event label.
summary=summary[summary['event'].isna()==False]

In [None]:
summary

In [None]:
# Rename sleep_y to score, as the combined submission cell does, so the file
# carries the same column name whichever cell wrote it.
submission=summary[['series_id','step','event','sleep_y']].rename(columns={'sleep_y': 'score'})
# Write a clean 0..n-1 index as the row_id column instead of the leftover
# merge index under an unnamed header.
# NOTE(review): row_id,series_id,step,event,score is the layout of the
# competition's sample submission - confirm against the competition page.
submission=submission.reset_index(drop=True)
submission.to_csv('submission.csv', index_label='row_id')

In [None]:
# Fully connected binary classifier: a linear input layer, four hidden layers
# and a single sigmoid output unit. The input width is taken from X_train.
model_nn = tf.keras.Sequential()
model_nn.add(tf.keras.layers.Dense(360, input_shape=[X_train.shape[1]]))
model_nn.add(tf.keras.layers.Dense(360, activation=tf.nn.leaky_relu, use_bias=True))
model_nn.add(tf.keras.layers.Dense(180, activation=tf.nn.relu))
model_nn.add(tf.keras.layers.Dense(90, activation=tf.nn.relu, use_bias=True))
model_nn.add(tf.keras.layers.Dense(20, activation=tf.nn.leaky_relu))
model_nn.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
#model_nn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001) , 
#                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
#                metrics=['accuracy'])



In [None]:
epochs = 100
# NOTE(review): the fit cell passes batch_size=360 literally, so this
# variable is not what controls the batch size.
batch_size = win_size

callbacks = [
    # Keep the weights of the epoch with the lowest validation loss.
    tf.keras.callbacks.ModelCheckpoint(
        "best_model.x", save_best_only=True, monitor="val_loss"
    ),
    # NOTE(review): patience equals the total number of epochs in both
    # callbacks below, so neither can trigger before training ends.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=epochs, min_lr=0.0001
    ),
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=epochs, verbose=1),
]
model_nn.compile(
    optimizer="adam",
    #loss="sparse_categorical_crossentropy",
    # The last layer of model_nn applies a sigmoid, so the model outputs
    # probabilities, not logits; from_logits must therefore be False.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['binary_accuracy'],
)
model_nn.summary()

In [None]:
y_train.dtypes

In [None]:
# Train model_nn; the last 20% of the training data is held out for the
# val_loss that the callbacks monitor.
# NOTE(review): X_train / y_train come from the train_test_split call that is
# commented out earlier in the notebook.
history = model_nn.fit(
    X_train,
    y_train,
    batch_size=360,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=0.2,
    verbose=1,
)

In [None]:
model = tf.keras.models.load_model("best_model.x")


In [None]:
metric = "binary_accuracy"
plt.figure()
plt.plot(history.history[metric])
plt.plot(history.history["val_" + metric])
plt.title("model " + metric)
plt.ylabel(metric, fontsize="large")
plt.xlabel("epoch", fontsize="large")
plt.legend(["train", "val"], loc="best")
plt.show()
plt.close()

In [None]:
model = tf.keras.models.load_model("best_model.x")

test_loss, test_acc = model.evaluate(X_test, y_test)

print("Test accuracy", test_acc)
print("Test loss", test_loss)

In [None]:
test.isnull().sum()

In [None]:
from joblib import dump, load
model=load('rf_model.joblib')

In [None]:
test=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet", engine='auto')
#test_df=test
# rollingstd adds its columns in place, so test_df and test are the same frame.
test_df=rollingstd(test)
# NOTE(review): clustering() and scale() are fitted again on the test data
# here, so cluster numbers and scaling may not match what the model saw
# during training - confirm this is acceptable.
test_df['cluster']=(clustering(test_df)+1)/4
test_df['sd_anglez_1']=pd.to_numeric(test_df['sd_anglez_1'])
# Convert each column from itself. The original assigned sd_anglez_1 to
# sd_enmo_1, silently replacing the enmo feature with a copy of the anglez one.
test_df['sd_enmo_1']=pd.to_numeric(test_df['sd_enmo_1'])
test_df['m_anglez_2']=pd.to_numeric(test_df['m_anglez_2'])
test_df['m_enmo_2']=pd.to_numeric(test_df['m_enmo_2'])
X=test_df[['sd_anglez_1','sd_enmo_1','anglez','m_anglez_2','m_enmo_2','enmo','cluster']]
y=model.predict(scale(X))


In [None]:
test['series_id'].unique()

In [None]:
test[test['series_id']=='0402a003dae9']

In [None]:
result_df=test_df[['series_id','step','timestamp']]

In [None]:
ls

In [None]:
result_df[['sleep']]

In [None]:
print(result_df.shape)
print(y.shape)
# Work on an independent copy (result_df was created as a column selection
# of test_df) and store the predictions with a plain column assignment
# instead of the NaN-fill + apply(lambda x: y) detour.
result_df=result_df.copy()
result_df['sleep']=y

In [None]:
# Parse the timestamp strings into timezone-aware (UTC) datetimes.
result_df['timestamp'] = pd.to_datetime(result_df['timestamp'], utc=True)

In [None]:
result_df['series_id'].unique()

In [None]:
# Step-to-step change of the predicted state, computed inside each series:
# +1 where sleep starts, -1 where it ends, NaN on the first row of a series.
# One grouped diff replaces the per-series loop, which recomputed the diff of
# the whole frame on every iteration, wrote through a chained assignment and
# compared the first row of a series with the last row of the previous one.
result_df['diff']=result_df.groupby('series_id')['sleep'].diff()
    

    


In [None]:
result_df['diff'].value_counts()

In [None]:
# A rising edge (+1) in the predicted state is an onset, a falling edge (-1)
# a wakeup; all other rows stay NaN. map() builds the string column directly
# instead of writing strings into a float NaN column.
result_df['event']=result_df['diff'].map({1: 'onset', -1: 'wakeup'})

In [None]:
result_df['event'].value_counts()

In [None]:
# Label the first and the last step of every series from the predicted state
# at that step.
# transform('max') returns the per-series maximum aligned with result_df's
# rows; comparing against the bare groupby().max() result fails because that
# Series is indexed by series_id.
is_first=result_df["step"]==0
is_last=result_df["step"]==result_df.groupby('series_id')['step'].transform('max')
# NOTE(review): a series that ends asleep gets an 'onset' on its last step -
# confirm this is the intended labelling.
result_df.loc[(is_first | is_last) & (result_df["sleep"]==1), "event"] = 'onset'
result_df.loc[(is_first | is_last) & (result_df["sleep"]==0), "event"] = 'wakeup'

In [None]:
# Label the last step of every series from the predicted state at that step.
# The original loop had an unbalanced parenthesis and matched the maximum
# step of one series against the rows of all series; the grouped transform
# compares each row with the maximum step of its own series only.
last_step=result_df.groupby('series_id')['step'].transform('max')
at_series_end=result_df["step"]==last_step
result_df.loc[at_series_end & (result_df["sleep"]==1), "event"] = 'onset'
result_df.loc[at_series_end & (result_df["sleep"]==0), "event"] = 'wakeup'

In [None]:
result_df

In [None]:
# Keep only the rows that carry an event. reset_index() is assigned back so
# the original row labels of result_df are kept in an 'index' column, which
# the score cell reads; previously its result was only displayed.
summary=result_df[result_df['event'].notna()].reset_index()
summary


# deleting the events with timeperiods smaller than 30 mins.

In [None]:
import datetime as dt
# Minutes elapsed since the previous event of the same series (NaN for the
# first event of a series).
# NOTE(review): the heading above announces the removal of periods shorter
# than 30 minutes, but this cell only computes the gap; nothing is deleted.
summary['timediff']=summary.groupby(summary['series_id'])['timestamp'].diff().dt.total_seconds().div(60)
summary                                                

# Calculating the Score

In [None]:
# Score every event with the mean predicted state between it and the next
# event of the same series; for 'wakeup' events the score is inverted.
# The original loop assigned into `.index[i]` (an Index is immutable) and
# compared index labels rather than column values, so it could not run.
summary['score']=np.nan
# Row labels of the events in result_df: the 'index' column if summary went
# through reset_index(), otherwise summary's own index.
if 'index' in summary.columns:
    event_rows=summary['index'].to_numpy()
else:
    event_rows=summary.index.to_numpy()
event_series=summary['series_id'].to_numpy()
event_kinds=summary['event'].to_numpy()
scores=np.full(len(summary), np.nan)
for i in tqdm(range(len(summary)-1)):
    # NOTE(review): the last event of a series has no following event in the
    # same series and keeps a NaN score - confirm the desired handling.
    if event_series[i]!=event_series[i+1]:
        continue
    # Label slice; the upper bound is inclusive, hence the -1.
    score=result_df['sleep'].loc[event_rows[i]:event_rows[i+1]-1].mean()
    if event_kinds[i]=='wakeup':
        score=1-score  #reversing the score for the wakeup events.
    scores[i]=score
summary['score']=scores
print('completed')
            

In [None]:
# NOTE(review): this overwrites the whole score column with NaN, discarding
# whatever the score cell above computed - confirm it should still be run.
summary['score']=np.nan
summary

In [None]:
len(summary)

In [None]:
summary.reset_index()

In [None]:
result_df['score']=np.nan
# NOTE(review): the next line is not valid Python (unbalanced brackets, a
# bare `iloc`, `results_df` instead of `result_df`, and `False.mean()`); the
# intended computation cannot be determined from this notebook.
result_df['score']=result_df['sleep'].loc[:iloc[results_df['event'].isna()==False.mean()