In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from itertools import groupby

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Seed NumPy's global generator so stochastic steps repeat on a fresh run.
np.random.seed(seed=0)

# Render floats with two decimals in every DataFrame display.
pd.options.display.precision = 2
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Input parquet files (labelled training set and raw test series).
# Note: the test series file is read again in the Predict section.
TRAIN_PARQUET = "/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet"
TEST_PARQUET = "/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet"

train = pd.read_parquet(TRAIN_PARQUET)
test = pd.read_parquet(TEST_PARQUET)

# Feature Engineering

In [None]:
# Preview the raw training rows before any feature engineering.
train.head()

In [None]:
def add_features(df, with_y=True, periods=2):
    """Aggregate raw sensor rows into one feature row per series and minute.

    Rows are grouped by ``series_id`` and by the UTC hour and minute of
    ``timestamp``. Each group yields the min/max/mean/std of ``anglez`` and
    ``enmo`` plus the first ``step`` (and the first ``awake`` label when
    ``with_y`` is true). Two lagged differences of the per-minute means,
    ``anglez_diff`` and ``enmo_diff``, are added on top.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``series_id``, ``timestamp``, ``step``, ``anglez`` and
        ``enmo``; also ``awake`` when ``with_y`` is true. The frame is read
        only - no columns are added to it or converted in place.
    with_y : bool, default True
        Whether to carry the ``awake`` label through the aggregation.
    periods : int, default 2
        Lag, in aggregated rows, used for the ``*_diff`` features.

    Returns
    -------
    pandas.DataFrame
        Columns ``series_id``, ``hour``, ``date_minute``, the eight
        ``anglez_*``/``enmo_*`` statistics, ``step``, optionally ``awake``,
        then ``anglez_diff`` and ``enmo_diff``. Rows are ordered by
        ``series_id`` and ``step`` with a fresh 0..n-1 index.
    """
    # Build the grouping keys as standalone Series so the caller's frame is
    # left untouched and the function can be re-run on the same input.
    timestamp = pd.to_datetime(df['timestamp'], utc=True)
    hour = timestamp.dt.hour.rename('hour')
    date_minute = timestamp.dt.strftime('%Y-%m-%d %H:%M').rename('date_minute')

    stats = ['min', 'max', 'mean', 'std']
    agg_dict = {'anglez': stats, 'enmo': stats, 'step': 'first'}
    rname_cols = {'step_first': 'step'}
    if with_y:
        agg_dict['awake'] = 'first'
        rname_cols['awake_first'] = 'awake'

    df_grp = df.groupby([df['series_id'], hour, date_minute]).agg(agg_dict)
    # Flatten the (column, statistic) MultiIndex into names like "anglez_min".
    df_grp.columns = df_grp.columns.get_level_values(0) + '_' + df_grp.columns.get_level_values(1)
    df_grp = df_grp.reset_index().rename(columns=rname_cols)

    # groupby sorts by its keys, i.e. by hour-of-day *before* the date, which
    # interleaves different days of a multi-day series. Restore chronological
    # order so the lagged differences (and any later rolling/run-length logic)
    # see consecutive minutes.
    df_grp = df_grp.sort_values(['series_id', 'step'], ignore_index=True)

    for source, target in (('anglez_mean', 'anglez_diff'), ('enmo_mean', 'enmo_diff')):
        delta = df_grp.groupby('series_id')[source].diff(periods=periods)
        # The first `periods` rows of each series have no lagged value:
        # back-fill them from within the same series only, and use 0 for a
        # series too short to have any difference at all (keeps the feature
        # matrix free of NaN).
        df_grp[target] = delta.groupby(df_grp['series_id']).bfill().fillna(0.0)
    return df_grp

# Build the aggregated training features; labels are kept because with_y defaults to True.
train_all = add_features(train)

In [None]:
# Preview the engineered training features.
train_all.head()

# Train

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

columns = ['hour','anglez_min','anglez_max','anglez_mean','anglez_std','enmo_min','enmo_max','enmo_mean','enmo_std','anglez_diff', 'enmo_diff']

# Hold out whole series instead of random rows. Neighbouring minutes of one
# recording are highly correlated, so a row-level split leaks validation
# information into training and inflates the reported accuracy.
if train_all["series_id"].nunique() > 1:
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
    train_idx, val_idx = next(splitter.split(train_all, groups=train_all["series_id"]))
    X_train, X_val = train_all.iloc[train_idx][columns], train_all.iloc[val_idx][columns]
    y_train, y_val = train_all.iloc[train_idx]["awake"], train_all.iloc[val_idx]["awake"]
else:
    # A single series cannot be split by group; fall back to the row-level split.
    X_train, X_val, y_train, y_val = train_test_split(train_all[columns], train_all["awake"], random_state = 0)

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest so that re-running this cell on its own gives
# the same model; n_jobs=-1 trains the trees on all available cores without
# changing the result.
classifier = RandomForestClassifier(n_estimators=100,
                                    max_depth=10,
                                    min_samples_leaf=20,
                                    random_state=0,
                                    n_jobs=-1)
classifier.fit(X_train, y_train)

In [None]:
# One row of class probabilities per validation sample; column 1 is used as the "score" further down.
predict_val = classifier.predict_proba(X_val) # Return the probabilities for each predicted outcome.

In [None]:
def accuracy(out, yb):
    """Return the fraction of rows whose most probable class equals the label.

    Parameters
    ----------
    out : array-like of shape (n_samples, n_classes)
        Per-class scores or probabilities; the predicted class of a row is
        the index of its largest entry.
    yb : array-like of shape (n_samples,)
        True labels, compared positionally with the predicted column index
        (assumes labels are encoded as those indices, e.g. 0/1 - confirm
        against the classifier's class order).

    Returns
    -------
    float
        Share of matching rows, between 0 and 1.
    """
    # Score the predictions that were passed in rather than a notebook global,
    # and compare as plain arrays so a pandas index on `yb` cannot interfere.
    predicted = np.asarray(out).argmax(axis=1)
    return float((predicted == np.asarray(yb)).mean())

In [None]:
# Report the accuracy of the validation predictions against the held-out labels.
print(f'accuracy: {accuracy(predict_val, y_val)}')

In [None]:
# Validation features with the true label and the probability from column 1
# of predict_val side by side, all on a fresh 0..n-1 index.
predict_all = X_val.reset_index(drop=True).assign(
    awake=y_val.reset_index(drop=True),
    score=predict_val[:, 1],
)

predict_all.head(20)

# Predict

In [None]:
# Load the raw test series and aggregate it with the same feature pipeline;
# there is no label column here, hence with_y=False.
test_series = pd.read_parquet(
    '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet'
)
test = add_features(test_series, with_y=False)

In [None]:
# Preview the engineered test features.
test.head()

In [None]:
# Class probabilities for every aggregated test row, using the same feature columns as in training.
predict_test = classifier.predict_proba(test[columns]) 

In [None]:
# Attach column 1 of the predicted probabilities to the test features as "score".
test_with_pred = test.reset_index(drop=True).assign(score=predict_test[:, 1])

test_with_pred.head(20)

In [None]:
# Add a "not_awake" column as the complement of the "score" column:
test_with_pred["not_awake"] = 1 - test_with_pred["score"]

# Smoothing of the predictions:
# NOTE(review): the window is counted in rows of test_with_pred. If those rows are
# one per minute (as the date_minute grouping suggests), 500 rows is roughly
# 8 hours - confirm this length is intended.
smoothing_length = 2 * 250  # Define the length for smoothing

# Smooth every series on its own so the centered window never averages the end
# of one recording with the start of the next. min_periods=1 lets the window
# shrink at the edges (and for series shorter than the window) instead of
# producing NaN that would then need to be filled.
test_with_pred["smooth"] = (
    test_with_pred.groupby("series_id")["not_awake"]
    .transform(lambda s: s.rolling(smoothing_length, center=True, min_periods=1).mean())
)

# Re-binarize the "smooth" column:
test_with_pred["smooth"] = test_with_pred["smooth"].round()

In [None]:
# Define a function to extract events (onset and wakeup) from smoothed data:
def get_event(df):
    """Label the first and last row of every non-zero run of ``df.smooth``.

    Consecutive rows are grouped into runs that share the same ``series_id``
    and the same state, where a row counts as "set" when its ``smooth`` value
    is neither 0 nor missing. For every set run longer than one row, the first
    row is labelled ``'onset'`` and the last ``'wakeup'``; every other row,
    including single-row set runs, gets ``0``.

    Parameters
    ----------
    df : object exposing ``series_id`` and ``smooth``
        Two equally long iterables (for example DataFrame columns), read in
        their existing order.

    Returns
    -------
    list
        One entry per input row: ``'onset'``, ``'wakeup'`` or ``0``.
    """
    def run_key(pair):
        series_id, smooth = pair
        # Coerce to a real bool: NumPy scalars yield numpy.bool_, for which an
        # identity check such as `is False` never succeeds. Testing for missing
        # values first also keeps pd.NA from reaching the comparison.
        is_set = (not pd.isnull(smooth)) and bool(smooth != 0)
        return series_id, is_set

    events = []
    for (_, is_set), run in groupby(zip(df.series_id, df.smooth), key=run_key):
        run_length = sum(1 for _ in run)
        if is_set and run_length > 1:
            events.extend(['onset'] + [0] * (run_length - 2) + ['wakeup'])
        else:
            events.extend([0] * run_length)
    return events

# Apply the event extraction function to create the "event" column in test_with_pred
# (one entry per row: 'onset', 'wakeup' or 0):
test_with_pred["event"] = get_event(test_with_pred)

In [None]:
# Preview the predictions together with the smoothed signal and the detected events.
test_with_pred.head()

In [None]:
# Keep only the rows that carry an event (anything other than 0), restrict
# them to the submission columns, and number them from 0 in a leading
# "row_id" column.
has_event = test_with_pred["event"] != 0
sample_submission = (
    test_with_pred.loc[has_event, ["series_id", "step", "event", "score"]]
    .copy()
    .reset_index(drop=True)
    .reset_index(names="row_id")
)

In [None]:
# Preview the submission rows.
sample_submission.head()

In [None]:
# Save the submission DataFrame to submission.csv; the pandas index is omitted
# because "row_id" is already a regular column:
sample_submission.to_csv('submission.csv', index=False)