In [1]:
!pip install polars



In [3]:
# 📚 Importing necessary libraries 📊
import numpy as np              # NumPy for numerical operations
import pandas as pd             # Pandas for data manipulation
import plotly.express as px     # Plotly Express for interactive plotting
import matplotlib.pyplot as plt # Matplotlib for basic plotting
import seaborn as sns           # Seaborn for statistical data visualization
import random                   # Random for generating random numbers
import os                       # OS for interacting with the operating system
import gc                       # Garbage collector for memory management


In [4]:
from copy import deepcopy      # Deepcopy for creating deep copies of objects
from functools import partial  # Partial function application for function manipulation
from itertools import combinations  # Combinations for creating combinations of elements
from itertools import groupby  # Groupby for grouping elements in an iterable


In [5]:
from tqdm import tqdm          # tqdm for progress bars
#import polars as pl            # Polars for data manipulation
import datetime                # Datetime for date and time operations

In [6]:
# 📋 Define column names and tolerances for the score function 📋
column_names = {
    'series_id_column_name': 'series_id',
    'time_column_name': 'step',
    'event_column_name': 'event',
    'score_column_name': 'score',
}

tolerances = {
    'onset': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360], 
    'wakeup': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
}

# 📊 Setting display options for Pandas DataFrames 📊
pd.set_option('display.max_columns', None)  # Show all columns

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [34]:
# 🧱 Importing specific functions and classes 🧱
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from sklearn.model_selection import StratifiedKFold, KFold  # Cross-validation techniques
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss  # Evaluation metrics
from sklearn.model_selection import cross_validate  # Cross-validation scoring
from sklearn.metrics import RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, precision_score, average_precision_score  # Metrics and displays
import optuna  # Library for hyperparameter tuning
import xgboost as xgb  # XGBoost for gradient boosting
import lightgbm as lgb  # LightGBM for gradient boosting
from bayes_opt import BayesianOptimization
from sklearn.linear_model import LogisticRegression  # Logistic Regression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Random Forest and Gradient Boosting
from sklearn.pipeline import Pipeline  # Pipeline for building a sequence of data transformations
from catboost import Pool  # CatBoost for gradient boosting
from sklearn.ensemble import BaggingClassifier, StackingClassifier

# ⚙️ Importing a custom metric function ⚙️#
#from metric import score  # Importing a custom event detection AP score function

In [32]:
!pip install score

Collecting score
  Using cached score-0.0.1a0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[6 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/dw/v4qq90490sxd2_vtfl7dgzyc0000gn/T/pip-install-7kvrmjwf/score_d772ce2f6cb44d628c0fa00a5006fd0c/setup.py", line 4, in <module>
  [31m   [0m     from Cython.Build import cythonize
  [31m   [0m ModuleNotFoundError: No module named 'Cython'
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered

In [21]:
!pip install pip==21.3.1


Collecting pip==21.3.1
  Downloading pip-21.3.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-21.3.1


In [27]:
import scipy
print(scipy.__version__)

1.10.0


In [31]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
     |████████████████████████████████| 2.1 MB 3.2 MB/s            
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.3.1
    Uninstalling pip-21.3.1:
      Successfully uninstalled pip-21.3.1
Successfully installed pip-23.2.1


In [None]:
import polars as pl            # Polars for data manipulation


# Column transformations for timestamp
dt_transforms = [
    pl.col('timestamp').str.to_datetime(),  # Convert timestamp to datetime
    (pl.col('timestamp').str.to_datetime().dt.year() - 2000).cast(pl.UInt8).alias('year'),  # Extract and cast year
    pl.col('timestamp').str.to_datetime().dt.month().cast(pl.UInt8).alias('month'),  # Extract and cast month
    pl.col('timestamp').str.to_datetime().dt.day().cast(pl.UInt8).alias('day'),  # Extract and cast day
    pl.col('timestamp').str.to_datetime().dt.hour().cast(pl.UInt8).alias('hour')  # Extract and cast hour
]

# Column transformations for data
data_transforms = [
    pl.col('anglez').cast(pl.Int16),  # Casting 'anglez' to 16-bit integer
    (pl.col('enmo') * 1000).cast(pl.UInt16)  # Convert 'enmo' to 16-bit unsigned integer
]

# Loading and transforming training series data
train_series = pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet').with_columns(
    dt_transforms + data_transforms
)

# Loading and transforming training events data
train_events = pl.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv').with_columns(
    dt_transforms
)

# Loading and transforming test series data
test_series = pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet').with_columns(
    dt_transforms + data_transforms
)

# Getting unique series IDs for convenience
series_ids = train_events['series_id'].unique(maintain_order=True).to_list()

# Removing series with mismatched event counts (onset vs. wakeup)
onset_counts = train_events.filter(pl.col('event') == 'onset').group_by('series_id').count().sort('series_id')['count']
wakeup_counts = train_events.filter(pl.col('event') == 'wakeup').group_by('series_id').count().sort('series_id')['count']

counts = pl.DataFrame({'series_id': sorted(series_ids), 'onset_counts': onset_counts, 'wakeup_counts': wakeup_counts})
count_mismatches = counts.filter(counts['onset_counts'] != counts['wakeup_counts'])

# Filtering out series with count mismatches
train_series = train_series.filter(~pl.col('series_id').is_in(count_mismatches['series_id']))
train_events = train_events.filter(~pl.col('series_id').is_in(count_mismatches['series_id']))

# Updating the list of series IDs, excluding series with no non-null values
series_ids = train_events.drop_nulls()['series_id'].unique(maintain_order=True).to_list()