## Imports and Set Up

In [51]:
!pip install pytorch-tabnet



In [52]:
import os
import random
import warnings
from concurrent.futures import ThreadPoolExecutor

from scipy.optimize import minimize

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from colorama import Fore, Style
from IPython.display import clear_output
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import (accuracy_score, cohen_kappa_score,
                             confusion_matrix, f1_score, mean_absolute_error,
                             mean_squared_error, precision_score, recall_score,
                             classification_report)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

In [53]:
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [54]:
def set_seed(seed_value=2024):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.backends.cudnn.deterministic = True

set_seed(2024)

## Data Processing

### Load in Files

In [55]:
TRAIN_CSV = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
TEST_CSV = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'
SAMPLE_SUBMISSION_CSV = '/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv'
SERIES_TRAIN_DIR = '/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet'
SERIES_TEST_DIR = '/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION_CSV)

# Drop all the PCIAT variables as they are not present in the test data
for col in train_df.columns:
    if 'PCIAT' in col:
        train_df.drop(col, axis=1, inplace=True)

In [56]:
# Function to process individual time series files
def process_time_series(file_name, directory):
    df = pd.read_parquet(os.path.join(directory, file_name, 'part-0.parquet'))
    df = df.drop('step', axis=1)
    stats = df.describe().values.flatten()
    record_id = file_name.split('=')[1]
    return stats, record_id

In [57]:
# Function to load and aggregate time series data
def load_time_series_data(directory):
    file_names = os.listdir(directory)
    stats_list = []
    ids_list = []

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_time_series(fname, directory), file_names),
                            total=len(file_names)))

    for stats, record_id in results:
        stats_list.append(stats)
        ids_list.append(record_id)

    stats_df = pd.DataFrame(stats_list, columns=[f'stat_{i}' for i in range(len(stats_list[0]))])
    stats_df['id'] = ids_list
    return stats_df

In [58]:
train_series_df = load_time_series_data(SERIES_TRAIN_DIR)
test_series_df = load_time_series_data(SERIES_TEST_DIR)

100%|██████████| 996/996 [01:09<00:00, 14.39it/s]
100%|██████████| 2/2 [00:00<00:00, 10.13it/s]


### Encoding of Time Series Data

In [59]:
class TimeSeriesAutoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(TimeSeriesAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, encoding_size * 3),
            nn.ReLU(),
            nn.Linear(encoding_size * 3, encoding_size * 2),
            nn.ReLU(),
            nn.Linear(encoding_size * 2, encoding_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_size, input_size * 2),
            nn.ReLU(),
            nn.Linear(input_size * 2, input_size * 3),
            nn.ReLU(),
            nn.Linear(input_size * 3, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [60]:
def get_encoded_features(df, encoding_dim=60, epochs=100, batch_size=32):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    tensor_data = torch.FloatTensor(scaled_data)
    input_dim = tensor_data.shape[1]

    autoencoder = TimeSeriesAutoencoder(input_dim, encoding_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(tensor_data), batch_size):
            batch = tensor_data[i:i + batch_size]
            optimizer.zero_grad()
            outputs = autoencoder(batch)
            loss = criterion(outputs, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(tensor_data).numpy()

    encoded_df = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    return encoded_df

In [61]:
train_series_features = train_series_df.drop('id', axis=1)
test_series_features = test_series_df.drop('id', axis=1)

train_encoded = get_encoded_features(train_series_features)
test_encoded = get_encoded_features(test_series_features)

train_encoded['id'] = train_series_df['id']
test_encoded['id'] = test_series_df['id']

train_df = train_df.merge(train_encoded, on='id', how='left')
test_df = test_df.merge(test_encoded, on='id', how='left')

Epoch [10/100], Loss: 1.6710
Epoch [20/100], Loss: 1.5469
Epoch [30/100], Loss: 1.5154
Epoch [40/100], Loss: 1.4932
Epoch [50/100], Loss: 1.4964
Epoch [60/100], Loss: 1.4920
Epoch [70/100], Loss: 1.4309
Epoch [80/100], Loss: 1.4185
Epoch [90/100], Loss: 1.3667
Epoch [100/100], Loss: 1.3620
Epoch [10/100], Loss: 1.0070
Epoch [20/100], Loss: 0.5783
Epoch [30/100], Loss: 0.4271
Epoch [40/100], Loss: 0.4271
Epoch [50/100], Loss: 0.4271
Epoch [60/100], Loss: 0.4271
Epoch [70/100], Loss: 0.4271
Epoch [80/100], Loss: 0.4271
Epoch [90/100], Loss: 0.4271
Epoch [100/100], Loss: 0.4271


### Imputation of Missing Numerical Values

In [62]:
# Imputing missing values using KNN imputer
def impute_missing_values(df):
    imputer = KNNImputer(n_neighbors=5)
    numeric_columns = df.select_dtypes(include=['float64','float32','int64']).columns # We treat the time series data as a median for now but others could be used
    imputed_array = imputer.fit_transform(df[numeric_columns])
    imputed_df = pd.DataFrame(imputed_array, columns=numeric_columns)
    for col in df.columns:
        if col not in numeric_columns:
            imputed_df[col] = df[col]
    return imputed_df

In [63]:
train_df.replace([np.inf, -np.inf], np.nan, inplace=True) # Debug why we have inf values

train_df = impute_missing_values(train_df)
train_df['sii'] = train_df['sii'].round().astype(int)

# Imputation is needed in test set for some cases but not others to revisit

print(train_df.isna().sum())

Basic_Demos-Age           0
Basic_Demos-Sex           0
CGAS-CGAS_Score           0
Physical-BMI              0
Physical-Height           0
                       ... 
BIA-Season             1815
PAQ_A-Season           3485
PAQ_C-Season           2239
SDS-Season             1342
PreInt_EduHx-Season     420
Length: 120, dtype: int64


### Categorical Processing

In [64]:
categorical_cols = list(train_df.select_dtypes(include=['object']).columns) #sii (outcome var) is categorical but we are encoding that differently
categorical_cols.remove('id')
print(categorical_cols)

def preprocess_categorical(df):
    for col in categorical_cols:
        df[col] = df[col].fillna('Missing').astype('category')
    return df

train_df = preprocess_categorical(train_df)
test_df = preprocess_categorical(test_df)


train_df = pd.get_dummies(train_df, columns = categorical_cols, drop_first=True, dtype='int')
test_df = pd.get_dummies(test_df, columns = categorical_cols, drop_first=True, dtype='int')

['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']


In [65]:
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

## Tabnet Classifier

In [66]:
# Define the Quadratic Weighted Kappa metric
def quadratic_weighted_kappa(y_actual, y_predicted):
    return cohen_kappa_score(y_actual, y_predicted, weights='quadratic')

# Function to apply thresholds to continuous predictions
def apply_thresholds(predictions, thresholds):
    return np.digitize(predictions, bins=thresholds)

# Function to optimize thresholds to maximize QWK
def optimize_thresholds(y_true, predictions):
    def loss_func(thresh):
        # Ensure thresholds are sorted
        thresh_sorted = np.sort(thresh)
        preds = apply_thresholds(predictions, thresh_sorted)
        return -quadratic_weighted_kappa(y_true, preds)
    
    initial_thresholds = [0.5, 1.5, 2.5]  # Initial guesses for thresholds
    bounds = [(0, 3)] * 3  # Assuming classes are 0,1,2,3
    result = minimize(loss_func, initial_thresholds, method='Nelder-Mead')
    return result.x

def train_and_evaluate_tabnet():
    X = train_df.drop('sii', axis=1).values  # Ensure conversion to NumPy array
    y = train_df['sii'].astype(int).values  # Ensure conversion to NumPy array
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof_predictions = np.zeros(len(y))
    train_kappas = []
    val_kappas = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Check for NaNs and Infs
        assert not np.any(np.isnan(X_train)), "NaN values in X_train"
        assert not np.any(np.isinf(X_train)), "Inf values in X_train"
        assert not np.any(np.isnan(y_train)), "NaN values in y_train"
        assert not np.any(np.isinf(y_train)), "Inf values in y_train"

        assert not np.any(np.isnan(X_val)), "NaN values in X_val"
        assert not np.any(np.isinf(X_val)), "Inf values in X_val"
        assert not np.any(np.isnan(y_val)), "NaN values in y_val"
        assert not np.any(np.isinf(y_val)), "Inf values in y_val"

        # Instantiate a fresh TabNet model for each fold without multi_class
        tabnet_model = TabNetClassifier(
            n_d=64,
            n_a=64,
            n_steps=5,
            gamma=1.3,
            lambda_sparse=1e-3,
            seed=2024,
            verbose=1,
            device_name='auto'
        )

        # Fit the model with 'logloss' as eval_metric
        tabnet_model.fit(
            X_train=X_train,
            y_train=y_train,
            eval_set=[(X_val, y_val)],
            eval_name=['validation'],
            eval_metric=['logloss'],
            max_epochs=100,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False
        )

        # Predict class probabilities
        y_train_proba = tabnet_model.predict_proba(X_train)
        y_val_proba = tabnet_model.predict_proba(X_val)

        # Compute expected values
        classes = tabnet_model.classes_
        expected_train = np.dot(y_train_proba, classes)
        expected_val = np.dot(y_val_proba, classes)

        oof_predictions[val_idx] = expected_val

        # For training kappa, use class predictions
        y_train_pred = tabnet_model.predict(X_train)
        y_val_pred = tabnet_model.predict(X_val)

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred)
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred)

        train_kappas.append(train_kappa)
        val_kappas.append(val_kappa)

        print(f"Fold {fold + 1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    print(f"Average Train QWK: {np.mean(train_kappas):.4f}")
    print(f"Average Validation QWK: {np.mean(val_kappas):.4f}")

    # Optimize thresholds on out-of-fold predictions
    optimal_thresholds = optimize_thresholds(y, oof_predictions)
    print(f"Optimized Thresholds: {optimal_thresholds}")

    # Apply thresholds to get final class predictions
    final_oof_predictions = apply_thresholds(oof_predictions, optimal_thresholds)
    final_kappa = quadratic_weighted_kappa(y, final_oof_predictions)
    print(f"Final Optimized QWK: {Fore.CYAN}{Style.BRIGHT}{final_kappa:.4f}{Style.RESET_ALL}")

In [67]:
# Train and evaluate the TabNet model
train_and_evaluate_tabnet()

Training fold 1
epoch 0  | loss: 1.73061 | validation_logloss: 5.82454 |  0:00:00s
epoch 1  | loss: 1.52725 | validation_logloss: 6.8569  |  0:00:00s
epoch 2  | loss: 1.38984 | validation_logloss: 6.04811 |  0:00:00s
epoch 3  | loss: 1.29843 | validation_logloss: 4.21407 |  0:00:01s
epoch 4  | loss: 1.15212 | validation_logloss: 2.50755 |  0:00:01s
epoch 5  | loss: 1.09846 | validation_logloss: 2.74389 |  0:00:01s
epoch 6  | loss: 1.0482  | validation_logloss: 3.40865 |  0:00:01s
epoch 7  | loss: 0.99975 | validation_logloss: 3.30766 |  0:00:02s
epoch 8  | loss: 0.97599 | validation_logloss: 2.51032 |  0:00:02s
epoch 9  | loss: 1.00769 | validation_logloss: 2.79812 |  0:00:02s
epoch 10 | loss: 1.00574 | validation_logloss: 1.51334 |  0:00:02s
epoch 11 | loss: 0.93997 | validation_logloss: 1.95527 |  0:00:03s
epoch 12 | loss: 0.92701 | validation_logloss: 2.01692 |  0:00:03s
epoch 13 | loss: 0.91967 | validation_logloss: 1.56756 |  0:00:03s
epoch 14 | loss: 0.95559 | validation_logloss: