##### Importing data

In [None]:
import numpy as np
import pandas as pd
from itertools import groupby
from sklearn.model_selection import train_test_split
from pandas.api.types import is_datetime64_ns_dtype

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from joblib import Parallel, delayed
import gc
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

from metric import score # Import event detection ap score function

# Keyword arguments forwarded to the `score` function: they tell it which
# column of the solution / submission frames plays which role.
column_names = dict(
    series_id_column_name='series_id',
    time_column_name='step',
    event_column_name='event',
    score_column_name='score',
)

# Matching tolerances handed to the `score` function (presumably expressed in
# steps -- confirm against metric.py). Both event types share the same
# thresholds, but each one gets its own independent list object.
_tolerance_steps = (12, 36, 60, 90, 120, 150, 180, 240, 300, 360)
tolerances = {event: list(_tolerance_steps) for event in ('onset', 'wakeup')}

In [None]:
from tqdm.auto import tqdm 
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count

In [None]:
def reduce_mem_usage(df):
    """
    Downcast the numeric columns of a dataframe to the smallest dtype that can
    hold their observed value range, in order to reduce memory usage.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Object, string, categorical, datetime, boolean, unsigned and pandas
    extension-dtype columns are left exactly as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with eligible columns downcast.

    Notes
    -----
    * Integer columns become int8 / int16 / int32 when every value fits
      (bounds inclusive); otherwise they keep their original dtype.
    * Float columns become float16 / float32 when their min and max fit the
      target range. This bounds the magnitude only: precision is still
      reduced by the narrower float type.
    * Columns that are empty or entirely NaN are left unchanged, because no
      value range can be established for them.
    """
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not np.dtype instances; 'i' / 'f' selects signed ints and floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates = [(target, np.iinfo(target)) for target in int_targets]
        else:
            candidates = [(target, np.finfo(target)) for target in float_targets]

        # Candidates are ordered from narrowest to widest: take the first fit.
        for target, limits in candidates:
            if np.dtype(target).itemsize >= col_type.itemsize:
                break  # never widen, and skip no-op casts
            # Comparisons against NaN are False, so empty / all-NaN columns
            # never match and therefore keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    return df

In [None]:
def feat_eng(df):
    """
    Build time-based and rolling-window features from a raw accelerometer frame.

    The function reads the columns 'series_id', 'timestamp', 'enmo' and
    'anglez' and adds 'hour', 'lids', 'anglezdiffabs' plus a family of rolling
    statistics for 'enmo', 'anglez' and 'anglezdiffabs'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. No grouping by 'series_id' is performed, so the rolling
        windows span whatever rows are passed in; callers in this notebook
        pass the rows of a single series.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. It is sorted, re-indexed, extended and row-filtered
        IN PLACE (sort_values / set_index / reset_index / dropna all use
        inplace=True), so the caller's frame is modified.

    Notes
    -----
    * 'enmo' and 'anglez' are overwritten with int16 versions (enmo is first
      multiplied by 1000); the fractional part is discarded by the cast.
    * The final dropna() removes every row with a missing value in ANY column,
      which includes rows where a rolling window had fewer than `min_periods`
      observations.
    """
    
    df['series_id'] = df['series_id'].astype('category')
    # Strip the timezone from every timestamp element-wise, keeping the local wall-clock time
    df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df['hour'] = df["timestamp"].dt.hour
    
    # Time-based ('...s') rolling windows below require a sorted datetime index
    df.sort_values(['timestamp'], inplace=True)
    df.set_index('timestamp', inplace=True)
    
    # 'lids' feature: enmo above a 0.02 threshold, summed over a centred 600 s
    # window, inverted as 100 / (sum + 1), then smoothed with a centred 1800 s mean.
    # NOTE(review): 120*5 / 360*5 presumably mean 120 / 360 samples at a 5-second
    # sampling interval -- confirm against the data.
    df['lids'] = np.maximum(0., df['enmo'] - 0.02)
    df['lids'] = df['lids'].rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (df['lids'] + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)
    
    # Compact integer versions of the raw signals (overwrites the originals)
    df["enmo"] = (df["enmo"]*1000).astype(np.int16)
    df["anglez"] = df["anglez"].astype(np.int16)
    # Absolute row-to-row change of the (already int16) anglez; first row is NaN
    df["anglezdiffabs"] = df["anglez"].diff().abs().astype(np.float32)
    
    for col in ['enmo', 'anglez', 'anglezdiffabs']:
        
        # periods in seconds        
        periods = [60, 360, 720, 3600] 
        
        for n in periods:
            
            # Centred window of n+5 seconds; yields NaN when fewer than 10 observations fall inside
            rol_args = {'window':f'{n+5}s', 'min_periods':10, 'center':True}
            
            # Basic rolling statistics; `.values` assigns positionally (no index alignment)
            for agg in ['median', 'mean', 'max', 'min', 'var']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values
                gc.collect()
            
            # Rolling median absolute deviation, computed only for the largest period
            if n == max(periods):
                df[f'{col}_mad_{n}'] = (df[col] - df[f'{col}_median_{n}']).abs().rolling(**rol_args).median().astype(np.float32)
            
            # Amplitude (rolling max - rolling min) and the rolling minimum of that amplitude
            df[f'{col}_amplit_{n}'] = df[f'{col}_max_{n}']-df[f'{col}_min_{n}']
            df[f'{col}_amplit_{n}_min'] = df[f'{col}_amplit_{n}'].rolling(**rol_args).min().astype(np.float32).values
            
            # Rolling max / mean of the absolute row-to-row change of the rolling max.
            # (A former restriction to enmo/anglez is disabled, so this runs for every col.)
#             if col in ['enmo', 'anglez']:
            df[f'{col}_diff_{n}_max'] = df[f'{col}_max_{n}'].diff().abs().rolling(**rol_args).max().astype(np.float32)
            df[f'{col}_diff_{n}_mean'] = df[f'{col}_max_{n}'].diff().abs().rolling(**rol_args).mean().astype(np.float32)

    
            gc.collect()
    
    # Restore 'timestamp' as a regular column, then drop every row containing any NaN
    df.reset_index(inplace=True)
    df.dropna(inplace=True)

    return df

In [None]:
file = '/kaggle/input/gamma-train-series-updated-11-11-2023/train_series_10112023.parquet'


def _read_series(idx):
    """Read the rows of one series from `file`, with pandas PerformanceWarning silenced."""
    # The filter is (re)installed on every call, as the original loaders did.
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    # The parquet filter restricts the read to rows whose series_id equals idx.
    return pd.read_parquet(file, filters=[('series_id', '=', idx)])


def feat_eng_by_id_train(idx, night_limit=6):
    """
    Load one series for TRAINING and engineer its features.

    Parameters
    ----------
    idx : str
        Value of 'series_id' to load from `file`.
    night_limit : int, default 6
        Only rows with ``night < night_limit`` are kept before feature
        engineering. The default reproduces the previously hard-coded cutoff.

    Returns
    -------
    pandas.DataFrame
        Output of `feat_eng` for the retained rows.
    """
    df = _read_series(idx)
    df = df[df['night'] < night_limit]
    return feat_eng(df)


def feat_eng_by_id(idx):
    """
    Load every row of one series from `file` and engineer its features.

    Parameters
    ----------
    idx : str
        Value of 'series_id' to load.

    Returns
    -------
    pandas.DataFrame
        Output of `feat_eng` for the whole series (no night filtering).
    """
    return feat_eng(_read_series(idx))


# Display the configured input path as the cell output
file

## Training and validating 

In [None]:
# Development switch: when True only every 10th series is kept for quick runs
DEV = False

# Reading just the id column is enough to enumerate the distinct series
series_id = pd.read_parquet(file, columns=['series_id'])['series_id'].unique()

print(len(series_id))

if DEV:
    series_id = series_id[::10]

In [None]:
series_id

In [None]:
series_id_train, series_id_test = train_test_split(series_id, 
                                   random_state=1234,  
                                   test_size=0.25,  
                                   shuffle=True)

#series_id_val, series_id_test = train_test_split(series_id_test, 
#                                   random_state=1234,  
#                                   test_size=0.50,  
#                                   shuffle=True)

In [None]:
len(series_id_train)

In [None]:
len(series_id_test)

In [None]:
df  = pd.read_parquet(file,columns=['series_id','night'])
df=df[df['series_id'].isin(series_id_train)]
df['series_id'].nunique()
len(df)
len(df[(df['night']<5)])
df2=df[(df['night']<6)]
df2['series_id'].nunique()
series_id_train = df2.series_id.unique()

In [None]:
len(series_id_train)

In [None]:
nights_per_series=df2.groupby(['series_id'])['night'].nunique().reset_index()
nights_per_series.groupby('night').size()

# take max 5 nights per person

In [None]:
series_id_train2=series_id_train[90:95,]
len(series_id_train2)

In [None]:
%%time

import joblib

print("Number of jobs: ",int(cpu_count()))

train_df = []

for idx in tqdm(series_id_train): 

    test = feat_eng_by_id_train(idx)
    
    test = test.iloc[::60]

    train_df.append(test)
    
train = pd.concat(train_df, ignore_index=True).reset_index(names='new_row_id')

len(train)

In [None]:
#%%time
#train  = pd.read_parquet('/kaggle/input/gammaa-train-test-validation-series/train_set_with_variables.parquet')
#len(train)

In [None]:
train.groupby('event').size()

In [None]:
train.groupby('event_2').size()

In [None]:
train.loc[train['event']=='onset','event'] = 0
train.loc[train['event']=='wakeup','event'] = 1

In [None]:
train.groupby('event').size()

In [None]:
# Identifier / bookkeeping columns that are excluded from the feature matrix
drop_cols = ['series_id', 'step', 'timestamp','night','row_id','new_row_id','event_2']

# Target is the 'event' label cast to int; features are all remaining columns
y = train['event'].astype('int')
X = train.drop(columns=[*drop_cols, 'event'])

gc.collect()

In [None]:
if not DEV:
    del train
    gc.collect()

## Ensemble

In [None]:
class EnsembleAvgProba():
    """
    Averaging ensemble over a collection of classifiers.

    Parameters
    ----------
    classifiers : sequence
        Estimator objects exposing ``fit(X, y)``, ``predict(X)`` and
        ``predict_proba(X)``. The sequence is stored as-is (not copied) and
        must be non-empty for ``predict`` / ``predict_proba`` to work
        (``np.stack`` raises ValueError on an empty list).
    """

    def __init__(self, classifiers):
        self.classifiers = classifiers

    def fit(self, X, y):
        """
        Fit every member classifier on the same ``(X, y)``.

        Returns
        -------
        EnsembleAvgProba
            ``self``, following the scikit-learn estimator convention so that
            calls can be chained (e.g. ``EnsembleAvgProba(...).fit(X, y)``).
        """
        for classifier in self.classifiers:
            classifier.fit(X, y)
            # Release temporaries between members to keep peak memory down
            gc.collect()

        return self

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba(X)`` arrays."""
        member_probas = np.stack([m.predict_proba(X) for m in self.classifiers])
        return np.mean(member_probas, axis=0)

    def predict(self, X):
        """
        Return the rounded mean of the members' ``predict(X)`` outputs.

        This is a vote over predicted labels, not an argmax of the averaged
        probabilities. NumPy rounds halves to the nearest even value, so an
        exact tie (mean of 0.5 for 0/1 labels) yields 0.
        """
        member_labels = np.stack([m.predict(X) for m in self.classifiers])
        return np.mean(member_labels, axis=0).round()

In [None]:
## Test version; the previous setup was CatBoost + ExtraTrees
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Hyper-parameters unpacked into lgb.LGBMClassifier
lgb_params1 = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=6,
    learning_rate=0.03,
    n_estimators=850,
    subsample_for_bin=200000,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=0.05,
    reg_lambda=0.05,
)

# Hyper-parameters unpacked into xgb.XGBClassifier
xgb_params = dict(
    n_estimators=520,
    objective="binary:logistic",
    learning_rate=0.02,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)

# Hyper-parameters unpacked into CatBoostClassifier
cat_params = dict(
    iterations=600,
    learning_rate=0.03,
    depth=6,
    random_state=42,
    verbose=0,  # 0 silences training output; raise it to see training progress
)

In [None]:
%%time
model_lgb=lgb.LGBMClassifier(random_state=42, **lgb_params1)
model_lgb.fit(X, y)

In [None]:
%%time
model_gradboostclass=GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_leaf=300, random_state=42)
model_gradboostclass.fit(X, y)

In [None]:
%%time
model_randomforest=RandomForestClassifier(n_estimators=500, min_samples_leaf=300, random_state=42, n_jobs=-1)
model_randomforest.fit(X, y)

In [None]:
%%time
model_xgb=xgb.XGBClassifier(**xgb_params)
model_xgb.fit(X, y)

In [None]:
%%time
model_catboost=CatBoostClassifier(**cat_params)
model_catboost.fit(X, y)

In [None]:
%%time
model_extratrees=ExtraTreesClassifier(n_estimators=500, min_samples_leaf=300, random_state=42, n_jobs=-1)
model_extratrees.fit(X, y)

In [None]:
%%time
# Saving classifier 
import pickle
with open('ensembler_lgb_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_lgb, f)

with open('ensembler_boosting_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_gradboostclass, f)
    
with open('ensembler_randomforest_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_randomforest, f)
    
with open('ensembler_model_xgb_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)
    
with open('ensembler_model_catboost_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_catboost, f)
    
with open('ensembler_model_extratrees_SCM_V4.pkl', 'wb') as f:
    pickle.dump(model_extratrees, f)
    

In [None]:
model = EnsembleAvgProba(classifiers=[model_lgb,
                                      model_gradboostclass,
                                      model_randomforest,
                                      model_xgb,
                                      model_catboost,
                                      model_extratrees])

In [None]:
1+1

In [None]:
# del X, y
gc.collect()

# Evaluate the model on the held-out test series IDs

In [None]:
len(series_id_test)

In [None]:
#series_id_test2=[series_id_test[0],series_id_test[1]]
series_id_test2=['d93b0c7de16b','062dbd4c95e6','44a41bba1ee7']
series_id_test2

In [None]:
def get_events(idx, classifier, file='test_series.parquet') :
    """
    Predict onset / wakeup events for one series.

    Parameters
    ----------
    idx : str
        Value of 'series_id' whose rows are loaded.
    classifier : object
        Fitted model exposing ``predict(X)`` and ``predict_proba(X)``; column 1
        of ``predict_proba`` is used as the probability.
    file : str, default 'test_series.parquet'
        NOTE(review): currently UNUSED. The only code that referenced it is
        commented out; the function always reads the hard-coded training
        parquet path below.

    Returns
    -------
    pandas.DataFrame
        One row per detected event: for each calendar date the first 'wakeup'
        and the last 'onset' transition, sorted by 'timestamp'. Contains the
        `drop_cols2` columns plus 'prediction', 'probability', 'score',
        'pred_diff' and 'event'.

    Notes
    -----
    Relies on the module-level names `feat_eng` and `drop_cols2`; `drop_cols2`
    is assigned in a later cell and must exist before this function is called.
    """
    
    #test  = pd.read_parquet(f'/kaggle/input/child-mind-institute-detect-sleep-states/{file}',
    #                filters=[('series_id','=',idx)])
    
    # Load only the rows of the requested series
    test  = pd.read_parquet('/kaggle/input/gamma-train-series-updated-11-11-2023/train_series_10112023.parquet',
                    filters=[('series_id','=',idx)])
    
    # Split the engineered frame into model features and the metadata columns
    test = feat_eng(test)
    X_test = test.drop(columns=drop_cols2)
    test = test[drop_cols2]
    
    #X_test = scaler.transform(X_test)

    preds, probs = classifier.predict(X_test), classifier.predict_proba(X_test)[:, 1]
    
    # Smooth the predicted labels with a centred rolling median over 361 rows.
    # No min_periods is given, so rows without a full window become NaN.
    test['prediction'] = preds
    test['prediction'] = test['prediction'].rolling(360+1, center=True).median()
    test['probability'] = probs
    
    # NOTE(review): keeps every row whose smoothed prediction is not 2; this looks
    # like a no-op if predictions are only 0/1 -- confirm whether a class 2 can occur.
    test = test[test['prediction']!=2]
    
    # For rows predicted as class 0, flip the class-1 probability to 1 - p
    test.loc[test['prediction']==0, 'probability'] = 1-test.loc[test['prediction']==0, 'probability']
    # Score = centred rolling mean of that probability over 60*12*5 = 3600 rows,
    # with remaining gaps filled backward and then forward
    test['score'] = test['probability'].rolling(60*12*5, center=True, min_periods=10).mean().bfill().ffill()

    
    # Row-to-row change of the smoothed prediction marks the state transitions
    test['pred_diff'] = test['prediction'].diff()
    
    # +1 (0 -> 1) is labelled 'wakeup', -1 (1 -> 0) 'onset', 0 means no event
    test['event'] = test['pred_diff'].replace({1:'wakeup', -1:'onset', 0:np.nan})
    
    # Per calendar date keep the first wakeup and the last onset
    test_wakeup = test[test['event']=='wakeup'].groupby(test['timestamp'].dt.date).agg('first')
    test_onset = test[test['event']=='onset'].groupby(test['timestamp'].dt.date).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')

    return test

In [None]:
cols_sub = ['series_id','step','event','score']
drop_cols2 = ['series_id', 'step', 'timestamp','night','row_id','event','event_2']

tests = []

#for idx in tqdm(series_id_test2): 
for idx in tqdm(series_id_test): 

    test = get_events(idx, model)
    tests.append(test[cols_sub])

In [None]:
events_submission = pd.concat(tests, ignore_index=True).reset_index(names='row_id')
len(events_submission)

In [None]:
%%time
import polars as pl
# Load the ground-truth event log lazily with polars, parse the 'timestamp'
# strings into datetimes, derive year / month / day / hour columns from them,
# then materialise the result and convert it to a pandas DataFrame.
# NOTE(review): the format string uses `%Z`; how the UTC offset in the raw
# strings is treated depends on the polars version -- confirm that the parsed
# values are the intended (local vs. UTC) times.
train_events = (pl.scan_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
                .with_columns(
                    (
                        # Replaces the string column 'timestamp' with its parsed datetime
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        # Each derived column re-parses the original string column
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

In [None]:
# Ground-truth events of the held-out series: keep only rows that have a
# recorded step, then number them 0..n-1 in a leading 'row_id' column.
val_solution = (
    train_events
    .loc[train_events['series_id'].isin(series_id_test), ['series_id', 'event', 'step']]
    #.loc[train_events['series_id'].isin(series_id_test2), ['series_id', 'event', 'step']]
    .dropna(subset=['step'])
    .reset_index(drop=True)
    .reset_index(names='row_id')
)

#xgb_submission=xgb_submission[(xgb_submission['score']>0.1)]

#for the weird case
#print(f"Model score: {score(val_solution, events_submission[(events_submission['row_id']<26)],tolerances, **column_names)}")
print(f"Model score: {score(val_solution, events_submission,tolerances, **column_names)}")

In [None]:
#for i in series_id_test2:
for i in series_id_test:
    print(f"Model score - {i} : {score(val_solution[(val_solution['series_id']==i)],events_submission[(events_submission['series_id']==i)],tolerances, **column_names)}")

In [None]:
events_submission