# Installing, Importing, and Initializing Libraries

In [None]:
# Install pandarallel into the notebook environment.
# NOTE(review): the pandarallel import and initialize call are commented out
# in the visible cells — confirm this install is still needed.
!pip install pandarallel

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# pandarallel is currently disabled (see the commented-out initialize call at
# the end of this cell).
#from pandarallel import pandarallel
import plotly.express as px
import tensorflow as tf
import matplotlib.pyplot as plt
# Show up to 500 rows before pandas truncates displayed output.
pd.set_option('display.max_row', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Never truncate the contents of wide columns when displaying frames.
pd.set_option('display.max_colwidth', None)
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from tqdm import tqdm
# Register tqdm's progress_* helpers on pandas objects.
tqdm.pandas()
#pandarallel.initialize(progress_bar=True)

# Windowing the training data
### The goal is to detect sleep events,
### so we extract the timesteps within 30 minutes before and after each event.

In [None]:
def window(df, win_size, show_progress=True):
    """Label the rows around every event and return only the labelled rows.

    For each row whose ``event`` value is not missing, the rows whose index
    labels lie in ``[i - win_size, i]`` receive one window id and the rows in
    ``[i, i + win_size]`` receive the next one, so the event row itself ends
    up in the "after" window. Ids are handed out in index order starting at
    0; where the windows of neighbouring events overlap, the later id wins.

    Args:
        df: Frame with an ``event`` column. Slicing is label based, so the
            index is expected to hold sorted integer positions
            (NOTE(review): confirm against the caller). A ``window`` column
            is added to ``df`` in place.
        win_size: Number of index labels to include on each side of an event.
        show_progress: When true (the default) the loop over events is
            wrapped in the module-level ``tqdm`` progress bar; pass ``False``
            to iterate silently.

    Returns:
        The rows of ``df`` that fall inside at least one window.
    """
    event_labels = df.index[df['event'].notna()]
    df['window'] = np.nan
    iterator = tqdm(event_labels) if show_progress else event_labels
    window_id = 0
    for label in iterator:
        # Write through df.loc[rows, column]: the former
        # df['window'].loc[...] = ... form is chained assignment, which pandas
        # does not guarantee to write back to df (and never does under
        # copy-on-write).
        df.loc[label - win_size:label, 'window'] = window_id
        df.loc[label:label + win_size, 'window'] = window_id + 1
        window_id += 2
    return df[df['window'].notna()]

#df_series=df_series[df_series['window'].isna()==False]

# Removing the steps where enmo or anglez does not change from the previous step (the device is presumed not to be worn)

In [None]:
def inactive_periods(df_series):
    """Drop the steps on which ``enmo`` or ``anglez`` repeats the previous value.

    A row is kept only when both its ``enmo`` and its ``anglez`` differ from
    the row directly above it. The first row has no predecessor (its
    differences are NaN, which compare unequal to 0) and is therefore always
    kept. Differences are taken over the frame as a whole, not per series.

    Args:
        df_series: Frame with numeric ``enmo`` and ``anglez`` columns. It is
            left unmodified.

    Returns:
        A new frame holding the retained rows with their original index
        labels and columns.
    """
    print("shape before application: ", df_series.shape)
    # Keep the differences in local Series rather than as temporary columns,
    # so the caller's frame is not mutated and nothing has to be dropped
    # in place from a filtered slice afterwards.
    enmo_changed = df_series['enmo'].diff() != 0.0
    anglez_changed = df_series['anglez'].diff() != 0.0
    active = df_series[enmo_changed & anglez_changed].copy()
    print("shape after application: ", active.shape)
    print(f"removed {len(df_series) - len(active)} rows")
    return active


# Clustering the Enmo and Anglez

In [None]:
def clustering(df):
    """Assign every row of ``df`` to one of four K-Means clusters.

    The ``anglez``, ``enmo``, ``sd_anglez_1`` and ``sd_enmo_1`` columns are
    standardised with ``StandardScaler`` and a 4-cluster K-Means model
    (Elkan algorithm, verbose output) is fitted to the result.

    Returns the fitted model's ``labels_`` array (one label per row).

    NOTE(review): no ``random_state`` is passed to ``KMeans`` here, so which
    id a given cluster receives is not fixed by this function.
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    feature_names = ['anglez', 'enmo', 'sd_anglez_1', 'sd_enmo_1']
    # Standardise first so that no single feature dominates the distances.
    standardised = StandardScaler().fit_transform(df[feature_names])
    kmeans = KMeans(n_clusters=4, verbose=1, algorithm="elkan").fit(standardised)
    return kmeans.labels_


# Adding Rolling Standard Deviations

- By observation, enmo and anglez vary more often and more strongly than usual for a few timesteps before and after an event.
- Rolling standard deviations are added as features so that the model can take this variation into account.

In [None]:
def rollingstd(df, window_size=12):
    """Add rolling standard deviations of ``anglez`` and ``enmo`` to ``df``.

    Two columns are written: ``sd_anglez_1`` and ``sd_enmo_1``. Each holds the
    rolling standard deviation of its source column over ``window_size``
    consecutive rows. Positions without a full window (the first
    ``window_size - 1`` rows) and any other missing results are set to 0.0.

    Args:
        df: Frame with numeric ``anglez`` and ``enmo`` columns. It is
            modified in place.
        window_size: Number of rows per rolling window (default 12).

    Returns:
        The same ``df`` object, for call chaining.
    """
    for source, target in (('anglez', 'sd_anglez_1'), ('enmo', 'sd_enmo_1')):
        # Assign the filled Series back instead of calling
        # df[target].fillna(..., inplace=True): that chained form acts on an
        # intermediate object and is not guaranteed to update df.
        df[target] = df[source].rolling(window=window_size).std().fillna(0.0)
    return df


In [None]:
def scale(X):
    """Min-max scale ``X`` using a scaler fitted on ``X`` itself.

    A fresh ``MinMaxScaler`` is fitted on every call and discarded afterwards;
    only the transformed values are returned.
    """
    from sklearn.preprocessing import MinMaxScaler

    return MinMaxScaler().fit_transform(X)


In [None]:
# Importing the datasets
df_series=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')
df_events=pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
print('Dataset Imported...')
# Merging the datasets: attach the event label (if any) to every step of
# every series; steps without an event get NaN.
events=df_events[['series_id', 'step','event']]
series_df=pd.merge(df_series,events,on=["step","series_id"],how='left')
# Build the per-step target: 1 from an "onset" until the next "wakeup",
# 0 otherwise.
series_df['sleep']=np.nan
series_df.loc[series_df["event"]=="onset", "sleep"] = 1
series_df.loc[series_df["event"]=="wakeup", "sleep"] = 0
# Forward-fill within each series so that the state at the end of one series
# cannot leak into the beginning of the next, then treat steps before the
# first event as awake. The result is assigned back instead of using a
# chained in-place fillna, which is not guaranteed to update the frame.
series_df['sleep']=series_df.groupby('series_id', sort=False)['sleep'].ffill().fillna(0)
print('Datasets Merged...')

# Removing the periods of inactivity
print('Periods of Inactivity...')
df_series=inactive_periods(series_df)
# Adding the columns of Standard Deviation (1 min)
series_df=rollingstd(df_series)
series_df['sd_anglez_1']=pd.to_numeric(series_df['sd_anglez_1'])
# Convert sd_enmo_1 from itself; it used to be overwritten with sd_anglez_1.
series_df['sd_enmo_1']=pd.to_numeric(series_df['sd_enmo_1'])
print('Std columns added...')

# Forming Windows
win_size=360
# Keep only the windowed rows; copy so that adding columns below does not
# write into a slice of the unwindowed frame.
series_df=window(series_df,win_size).copy()
print('Windows formed...')

# Clustering the Data
# Cluster the same frame that receives the labels, so the label array and the
# frame always have the same length.
series_df['cluster']=(clustering(series_df)+1)/4
# Later cells refer to the prepared frame under both names.
df_series=series_df
print('Added clusters...')




In [None]:
# Show the dtype of every column in df_series.
df_series.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs and the single-column target frame.
feature_columns = ['sd_anglez_1', 'sd_enmo_1', 'anglez', 'enmo', 'cluster']
X = series_df[feature_columns]
y = series_df[['sleep']]
# NOTE(review): the scaler inside scale() is fitted on all rows before the
# split, so the held-out rows influence the scaling of the training rows.
X_scaled = scale(X)
# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Count how many training rows carry each value of the sleep label.
y_train[['sleep']].value_counts()

In [None]:
def evaluate(y_test,ypred):
    """Print binary-classification metrics and show the confusion matrix.

    Args:
        y_test: True labels.
        ypred: Predicted labels, aligned with ``y_test``.

    Prints accuracy, precision, recall and F1 (the last three for the
    positive class, scikit-learn's default) and renders the confusion matrix
    as a plotly heatmap. Returns nothing.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score

    # Score the argument that was passed in. The body used to read a global
    # named y_pred, so the ypred parameter was silently ignored.
    print("Accuracy: ", accuracy_score(y_test, ypred))
    # Precision: share of the predicted positives that are truly positive.
    print("Precision Score : ", precision_score(y_test, ypred))
    # Recall: share of the actual positives that were predicted positive.
    # Uses the same positive-class averaging as precision and F1 so the three
    # numbers are comparable (it used to be macro-averaged).
    print("Recall Score: ", recall_score(y_test, ypred))
    # F1: harmonic mean of precision and recall.
    print("F1 Score: ", f1_score(y_test, ypred))
    cm = confusion_matrix(y_test, ypred)
    figure = px.imshow(cm, text_auto=True, width=1200, height=1200)
    figure.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline using all available cores.
rf =RandomForestClassifier(n_jobs=-1,verbose=1)
print ('Training the model')
# y_train is a single-column frame; scikit-learn expects a 1-D target.
rf.fit(X_train,y_train.to_numpy().ravel())
# Persisting the model is disabled; uncomment the two lines below to save it.
#from joblib import dump, load
#dump(rf, 'rf_model.joblib')
# The message used to say "Saving the model" although nothing is saved here.
print ('Evaluating the model')
y_pred=rf.predict(X_test)
evaluate(y_test,y_pred)

In [None]:
# Number of feature columns in the training matrix.
X_train.shape[1]
# Feature count multiplied by the window size.
# NOTE(review): input_size is not referenced by any other visible cell.
input_size=X_train.shape[1]*win_size


In [None]:
# Fully connected binary classifier: a linear 360-unit input layer, four
# activated hidden layers and one sigmoid output unit. The model is compiled
# in a separate cell before training.
model_nn = tf.keras.Sequential()
model_nn.add(tf.keras.layers.Dense(360, input_shape=[X_train.shape[1]]))
for units, activation in (
    (360, tf.nn.leaky_relu),
    (180, tf.nn.relu),
    (90, tf.nn.relu),
    (20, tf.nn.leaky_relu),
    (1, tf.nn.sigmoid),
):
    model_nn.add(tf.keras.layers.Dense(units, activation=activation))



In [None]:
epochs = 100
batch_size = win_size
# Both patience values must be well below `epochs`; when they were equal to
# it, neither callback could ever trigger within a run.
lr_patience = 5
early_stop_patience = 10

callbacks = [
    # Keep only the weights with the lowest validation loss seen so far.
    tf.keras.callbacks.ModelCheckpoint(
        "best_model.x", save_best_only=True, monitor="val_loss"
    ),
    # Halve the learning rate when validation loss stops improving.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=lr_patience, min_lr=0.0001
    ),
    # Stop training when validation loss has stalled for longer still.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=early_stop_patience, verbose=1
    ),
]
model_nn.compile(
    optimizer="adam",
    # The output layer already applies a sigmoid, so the loss receives
    # probabilities rather than logits.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['binary_accuracy'],
)
model_nn.summary()

In [None]:
# Show the dtype of the training target.
y_train.dtypes

In [None]:
# Train the network, holding out the last 20% of the training rows for
# validation; the callbacks checkpoint the best epoch.
history = model_nn.fit(
    X_train,
    y_train,
    # Use the batch size configured alongside `epochs` instead of repeating
    # the literal 360.
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Reload the checkpoint written by ModelCheckpoint (the epoch with the lowest
# validation loss).
model = tf.keras.models.load_model("best_model.x")


In [None]:
# Plot training and validation accuracy per epoch.
metric = "binary_accuracy"
fig, ax = plt.subplots()
for curve in (metric, "val_" + metric):
    ax.plot(history.history[curve])
ax.set_title("model " + metric)
ax.set_ylabel(metric, fontsize="large")
ax.set_xlabel("epoch", fontsize="large")
ax.legend(["train", "val"], loc="best")
plt.show()
plt.close()

In [None]:
# Reload the best checkpoint and score it on the held-out split.
model = tf.keras.models.load_model("best_model.x")

# evaluate() returns the loss followed by the compiled metric
# (binary accuracy).
test_loss, test_acc = model.evaluate(X_test, y_test)

print("Test accuracy", test_acc)
print("Test loss", test_loss)

In [None]:
# Load the competition's test series.
test=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet", engine='auto')

In [None]:
# Second name for the same frame (no copy is made).
test_df=test

In [None]:

feature_columns=['sd_anglez_1','sd_enmo_1','anglez','enmo','cluster']
test=inactive_periods(test)
test_df=test
# The rolling-std columns must exist before clustering, which reads them.
test_df=rollingstd(test_df)
test_df['sd_anglez_1']=pd.to_numeric(test_df['sd_anglez_1'])
# Convert sd_enmo_1 from itself; it used to be overwritten with sd_anglez_1.
test_df['sd_enmo_1']=pd.to_numeric(test_df['sd_enmo_1'])
# NOTE(review): K-Means is refitted on the test rows here, so a cluster id is
# not guaranteed to mean the same thing as it did during training.
test_df['cluster']=(clustering(test_df)+1)/4
# The network was trained on min-max scaled features, so scale the test
# features with the minima and maxima of the training features.
from sklearn.preprocessing import MinMaxScaler
feature_scaler=MinMaxScaler().fit(series_df[feature_columns])
X=feature_scaler.transform(test_df[feature_columns])
y=np.absolute(model.predict(X))


In [None]:
# Display the raw predictions.
y

In [None]:
# The model emits one sigmoid probability per row, and int() only accepts a
# single value, so flatten the predictions and threshold them element-wise
# into 0/1 labels.
test['sleep']=(np.ravel(y)>=0.5).astype(int)