#### Libraries

In [1]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.dates as mdates
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

In [3]:
from scipy.stats import entropy
from collections import Counter
import polars as pl

In [4]:
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Load models

In [6]:
import pickle

# Directory that holds every pickled ensemble member loaded below.
MODEL_DIR = '/kaggle/input/seb-training-classic-ml-model-v4-output'
# Alternative XGBoost artifact that is currently not used:
#   /kaggle/input/xgboost-scaled/xgboost_scaled_features.pkl


def _load_pickled_model(filename, model_dir=MODEL_DIR):
    """
    Unpickle and return the object stored in `model_dir/filename`.

    NOTE(security): `pickle.load` can execute arbitrary code embedded in the
    file, so only artifacts produced by a trusted training run belong here.
    The file handle is closed by the `with` block even if unpickling fails.
    """
    with open(f'{model_dir}/{filename}', 'rb') as f:
        return pickle.load(f)


model_xgb = _load_pickled_model('ensembler_model_xgb_SCM_V4.pkl')
model_lgb = _load_pickled_model('ensembler_lgb_SCM_V4.pkl')
model_gradboostclass = _load_pickled_model('ensembler_boosting_SCM_V4.pkl')
model_randomforest = _load_pickled_model('ensembler_randomforest_SCM_V4.pkl')
model_catboost = _load_pickled_model('ensembler_model_catboost_SCM_V4.pkl')
model_extratrees = _load_pickled_model('ensembler_model_extratrees_SCM_V4.pkl')

# Only the last expression of a cell is rendered, so a single trailing
# expression replaces the per-model bare names that displayed nothing.
model_extratrees

# Useful functions

In [7]:
class EnsembleAvgProba():
    """
    Equal-weight ensemble over a list of classifier objects.

    Each member must provide `fit(X, y)`, `predict(X)` and `predict_proba(X)`;
    the ensemble combines the members' outputs with an unweighted mean.
    """

    def __init__(self, classifiers):
        # Kept by reference: the list is neither copied nor validated.
        self.classifiers = classifiers

    def fit(self, X, y):
        """Fit every member on the same (X, y); returns None."""
        for member in self.classifiers:
            member.fit(X, y)
            # Collect garbage after each member's fit.
            gc.collect()

    def predict_proba(self, X):
        """Return the element-wise mean of the members' `predict_proba(X)`."""
        member_probas = [member.predict_proba(X) for member in self.classifiers]
        return np.mean(np.stack(member_probas), axis=0)

    def predict(self, X):
        """
        Return the rounded element-wise mean of the members' `predict(X)`.

        The result is the averaged labels after `ndarray.round()`, which
        rounds an exact .5 to the nearest even value (0.5 -> 0.0).
        """
        member_labels = [member.predict(X) for member in self.classifiers]
        mean_labels = np.mean(np.stack(member_labels), axis=0)
        return mean_labels.round()

In [8]:
def reduce_mem_usage(df):
    """
    Downcast signed-integer and float columns of a dataframe to the smallest
    dtype whose range holds the column's observed min and max, to reduce
    memory usage.

    Only columns backed by a plain NumPy signed-integer or floating dtype are
    modified. Object, bool, unsigned, datetime, timedelta, categorical and
    other extension-typed columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; qualifying columns are reassigned on this object.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    Notes
    -----
    Float downcasting checks the value range only: a column that fits in
    float16/float32 keeps its magnitude but loses precision. Columns whose
    min/max is NaN (all-NaN or empty) or infinite are left unchanged.
    """

    # dtype.kind -> (limits lookup, candidate dtypes ordered smallest first)
    downcast_plan = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'f': (np.finfo, (np.float16, np.float32)),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not np.dtype instances; dtype.kind is used instead of
        # np.issubdtype because timedelta64 is a signedinteger subtype.
        if not isinstance(col_type, np.dtype) or col_type.kind not in downcast_plan:
            continue

        limits_of, candidates = downcast_plan[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for target in candidates:
            # Never "downcast" to a dtype that is not actually smaller.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds; comparisons with NaN are False, so such
            # columns fall through and keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    return df

In [9]:
def feat_eng(df):
    """
    Add rolling-window features derived from the `enmo` and `anglez` columns.

    The frame is modified in place (column assignments, sort, index changes,
    NaN-row removal) and the same object is returned.

    Expects the columns `series_id`, `timestamp`, `enmo` and `anglez`.
    On return `timestamp` is a regular column again, `enmo` holds
    `enmo * 1000` as int16, `anglez` is int16, and every row containing a NaN
    in any column has been dropped.

    NOTE(review): window strings are built as `count * 5` seconds, which
    suggests one sample every 5 s — confirm against the data.
    NOTE(review): feature columns are created in a fixed order; presumably
    downstream models rely on that order — verify before reordering.
    """

    df['series_id'] = df['series_id'].astype('category')
    # Strip timezone information element by element, keeping the local wall-clock time.
    df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df['hour'] = df["timestamp"].dt.hour

    # Time-offset rolling windows below operate on the sorted timestamp index.
    df.sort_values(['timestamp'], inplace=True)
    df.set_index('timestamp', inplace=True)

    # 'lids': enmo above a 0.02 threshold, summed over a centered 600 s window,
    # mapped through 100 / (sum + 1), then averaged over a centered 1800 s window.
    df['lids'] = np.maximum(0., df['enmo'] - 0.02)
    df['lids'] = df['lids'].rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (df['lids'] + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)

    # Integer-encode the raw signals; 'lids' above was computed from the unscaled enmo.
    df["enmo"] = (df["enmo"]*1000).astype(np.int16)
    df["anglez"] = df["anglez"].astype(np.int16)
    # Absolute step-to-step change of the already int16-truncated anglez (first row is NaN).
    df["anglezdiffabs"] = df["anglez"].diff().abs().astype(np.float32)

    for col in ['enmo', 'anglez', 'anglezdiffabs']:

        # periods in seconds
        periods = [60, 360, 720, 3600] 

        for n in periods:

            # Centered window of n + 5 seconds that needs at least 10 observations.
            rol_args = {'window':f'{n+5}s', 'min_periods':10, 'center':True}

            # Basic rolling statistics, assigned by position via .values.
            for agg in ['median', 'mean', 'max', 'min', 'var']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values
                gc.collect()

            # Rolling median of absolute deviations from the rolling median,
            # computed for the longest period only.
            if n == max(periods):
                df[f'{col}_mad_{n}'] = (df[col] - df[f'{col}_median_{n}']).abs().rolling(**rol_args).median().astype(np.float32)

            # Rolling range (max - min) and the rolling minimum of that range.
            df[f'{col}_amplit_{n}'] = df[f'{col}_max_{n}']-df[f'{col}_min_{n}']
            df[f'{col}_amplit_{n}_min'] = df[f'{col}_amplit_{n}'].rolling(**rol_args).min().astype(np.float32).values

            # A former `if col in ['enmo', 'anglez']` guard is disabled here, so the
            # two features below are built for every column in the outer loop.
            # Both are rolling stats of the absolute change of the rolling max.
            df[f'{col}_diff_{n}_max'] = df[f'{col}_max_{n}'].diff().abs().rolling(**rol_args).max().astype(np.float32)
            df[f'{col}_diff_{n}_mean'] = df[f'{col}_max_{n}'].diff().abs().rolling(**rol_args).mean().astype(np.float32)


            gc.collect()

    # Restore 'timestamp' as a column, then drop every row with a NaN in any
    # column (e.g. where a window had fewer than min_periods observations).
    df.reset_index(inplace=True)
    df.dropna(inplace=True)

    return df

In [10]:
# Parquet file read by feat_eng_by_id.
file = '/kaggle/input/gammaa-updated-trained-series/train_series_07112023.parquet'

def feat_eng_by_id(idx):
    """
    Read the rows of `file` whose `series_id` equals `idx` and return them
    after running `feat_eng` on the loaded frame.
    """
    # Imported and configured inside the function, so the filter is installed
    # on every call; it silences pandas PerformanceWarning.
    import warnings
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    series_rows = pd.read_parquet(file, filters=[('series_id', '=', idx)])
    return feat_eng(series_rows)

file

'/kaggle/input/gammaa-updated-trained-series/train_series_07112023.parquet'

In [11]:
def get_events(idx, classifier, file='test_series.parquet'):
    """
    Predict per-row states for one series and reduce them to onset/wakeup events.

    Parameters
    ----------
    idx : value matched against the `series_id` column when reading the parquet file.
    classifier : object exposing `predict(X)` and `predict_proba(X)`; column 1 of
        the probabilities is used.
    file : name of the parquet file inside the competition input directory.

    Returns
    -------
    pandas.DataFrame with, per calendar date, the first 'wakeup' row and the
    last 'onset' row, concatenated and sorted by `timestamp`.
    """
    source_path = f'/kaggle/input/child-mind-institute-detect-sleep-states/{file}'
    frame = feat_eng(pd.read_parquet(source_path, filters=[('series_id','=',idx)]))

    # Features go to the model; the identifying columns carry the results.
    features = frame.drop(columns=drop_cols)
    frame = frame[drop_cols]

    hard_labels = classifier.predict(features)
    positive_proba = classifier.predict_proba(features)[:, 1]

    # Smooth the labels with a centered 361-row median (edges become NaN).
    frame['prediction'] = hard_labels
    frame['prediction'] = frame['prediction'].rolling(360+1, center=True).median()
    frame['probability'] = positive_proba

    # Discard rows whose smoothed prediction equals 2.
    frame = frame[frame['prediction']!=2]

    # For rows predicted as 0, use the complementary probability.
    predicted_zero = frame['prediction']==0
    frame.loc[predicted_zero, 'probability'] = 1-frame.loc[predicted_zero, 'probability']

    # Score: centered rolling mean of the probability, gaps filled backward then forward.
    rolling_proba = frame['probability'].rolling(60*12*5, center=True, min_periods=10)
    frame['score'] = rolling_proba.mean().bfill().ffill()

    # A +1 step in the smoothed prediction is a wakeup, a -1 step an onset.
    frame['pred_diff'] = frame['prediction'].diff()
    frame['event'] = frame['pred_diff'].replace({1:'wakeup', -1:'onset', 0:np.nan})

    # Keep one wakeup (first) and one onset (last) per calendar date.
    calendar_day = frame['timestamp'].dt.date
    first_wakeups = frame[frame['event']=='wakeup'].groupby(calendar_day).agg('first')
    last_onsets = frame[frame['event']=='onset'].groupby(calendar_day).agg('last')

    return pd.concat([first_wakeups, last_onsets], ignore_index=True).sort_values('timestamp')

In [12]:
# Columns that get_events removes from the feature matrix and keeps alongside
# the predictions instead.
drop_cols = ['series_id', 'step', 'timestamp']

In [13]:
# Columns selected from each get_events result before the per-series frames
# are concatenated into the submission.
cols_sub = ['series_id','step','event','score']

# Generate submission file

In [14]:
%%time

# Read only the id column of the test set and keep each distinct id once.
series_id = pd.read_parquet(
    '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet',
    columns=['series_id'],
)['series_id'].unique()

CPU times: user 112 ms, sys: 14.3 ms, total: 126 ms
Wall time: 237 ms


In [15]:
# Ensemble of the six unpickled models; each member contributes with equal weight.
model = EnsembleAvgProba(
    classifiers=[
        model_lgb,
        model_gradboostclass,
        model_randomforest,
        model_xgb,
        model_catboost,
        model_extratrees,
    ],
)

In [16]:
%%time

# NOTE: the CPU count is only reported here; the loop below processes the
# series one after another and does not use joblib's Parallel.
print("Number of jobs: ",int(cpu_count()))

# One frame of detected events per series, restricted to the submission columns.
tests = []

for idx in series_id: 

    test = get_events(idx, model)
    tests.append(test[cols_sub])

Number of jobs:  4
CPU times: user 28.2 s, sys: 21.8 ms, total: 28.2 s
Wall time: 28.2 s


In [17]:
# Stack the per-series event frames and expose the fresh 0..n-1 index as 'row_id'.
submission = (
    pd.concat(tests, ignore_index=True)
    .reset_index(names='row_id')
)
submission.to_csv('submission.csv', index=False)

In [18]:
# Preview the first five rows of the submission frame.
submission.head(5)

Unnamed: 0,row_id,series_id,step,event,score
