In [None]:
import logging
import sys

# Notebook-wide logger: INFO and above, written to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This logger owns its handler, so records must not also bubble up to the root
# logger. Otherwise any root handler (for example one installed by a later
# logging.basicConfig() call) prints every message a second time.
logger.propagate = False

# Avoid duplicate handlers when this cell is re-run
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)  # stdout works better than stderr in Jupyter
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

# NOTE(review): presumably toggles the log1p transform of the target, but
# nothing in the visible cells reads this flag - confirm before relying on it.
TRANSFORM_TARGET = True

In [None]:
import numpy as np
import cupy as cp
import time
import logging
from cuml.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Logging setup
logger = logging.getLogger(__name__)
# Install the default root handler only when no handler exists anywhere in this
# logger's hierarchy. If a handler was already attached to this logger, adding
# a root handler as well would print every record twice.
if not logger.hasHandlers():
    logging.basicConfig(level=logging.INFO)

# Metric functions
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error.

    Negative entries in either input are raised to zero before the metric is
    computed.
    """
    pred_floor0 = np.clip(y_pred, 0, np.inf)
    true_floor0 = np.clip(y_true, 0, np.inf)
    msle = mean_squared_log_error(true_floor0, pred_floor0)
    return np.sqrt(msle)

# RAPIDS-based KNN Wrapper
class RapidsKNNWrapper:
    """K-fold cross-validated wrapper around cuML's ``KNeighborsRegressor``.

    ``fit`` trains one regressor per fold and stores the out-of-fold
    predictions in ``oof_preds``; ``predict`` averages the predictions of all
    fold models.
    """

    def __init__(self, model_params, metric_fn, name='rapids_knn'):
        """
        Args:
            model_params: Keyword arguments forwarded to ``KNeighborsRegressor``;
                ``None`` (or any falsy value) means "use the defaults".
            metric_fn: Callable ``metric_fn(y_true, y_pred)`` used to score each
                fold and the full out-of-fold vector.
            name: Label used in log messages.
        """
        self.model_params = model_params or {}
        self.metric_fn = metric_fn
        self.name = name
        self.models = []
        self.oof_preds = None

    @staticmethod
    def _as_float32(data):
        """Return ``data`` as a host float32 ndarray.

        Accepts anything ``np.asarray`` understands (DataFrame, Series,
        ndarray, nested lists), so callers are not limited to pandas input.
        """
        return np.asarray(data, dtype=np.float32)

    def fit(self, X, y, folds=5):
        """Train one model per fold and fill ``oof_preds``.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, same length as ``X``.
            folds: Number of ``KFold`` splits (shuffled, ``random_state=42``).

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        start_time = time.time()
        logger.info(f"Starting training of {self.name} model with {folds} folds")

        X_np = self._as_float32(X)
        y_np = self._as_float32(y)
        if len(X_np) != len(y_np):
            raise ValueError(
                f"X and y must have the same length, got {len(X_np)} and {len(y_np)}"
            )

        # Start from a clean slate: without this, calling fit() a second time
        # would keep the previous run's models and predict() would average
        # them in with the new ones.
        self.models = []
        self.oof_preds = np.zeros(len(X_np), dtype=np.float32)

        # Not every callable has __name__ (e.g. functools.partial).
        metric_name = getattr(self.metric_fn, '__name__', 'metric').upper()

        kf = KFold(n_splits=folds, shuffle=True, random_state=42)

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_np), 1):
            fold_start = time.time()
            logger.info(f"Training {self.name} - Fold {fold}/{folds}")

            X_train, X_val = X_np[train_idx], X_np[val_idx]
            y_train, y_val = y_np[train_idx], y_np[val_idx]

            # Convert to CuPy
            X_train_cu, X_val_cu = cp.asarray(X_train), cp.asarray(X_val)
            y_train_cu = cp.asarray(y_train)

            model = KNeighborsRegressor(**self.model_params)
            model.fit(X_train_cu, y_train_cu)

            preds = model.predict(X_val_cu).get()  # convert back to NumPy
            self.oof_preds[val_idx] = preds

            score = self.metric_fn(y_val, preds)
            logger.info(f"Fold {fold} {metric_name}: {score:.4f}")

            self.models.append(model)
            logger.info(f"Completed Fold {fold} in {time.time() - fold_start:.2f} seconds")

        total_score = self.metric_fn(y_np, self.oof_preds)
        logger.info(f"Out-of-fold {metric_name}: {total_score:.4f}")
        logger.info(f"Completed training of {self.name} in {time.time() - start_time:.2f} seconds")

    def predict(self, X):
        """Return the mean prediction of all fold models for ``X``.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.models:
            raise ValueError(
                f"{self.name} has no trained models; call fit() before predict()"
            )
        X_cu = cp.asarray(self._as_float32(X))
        preds = [model.predict(X_cu).get() for model in self.models]
        return np.mean(np.column_stack(preds), axis=1)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


def _encode_sex(sex, frame_name):
    """Map 'male'/'female' to 0/1 and reject labels the mapping does not know.

    ``Series.map`` turns every unmapped label into NaN without complaint; that
    NaN would then flow into the features derived from ``Sex``. Missing values
    that are already NaN in the input are passed through unchanged.

    Raises:
        ValueError: If ``sex`` contains a non-null label other than
            'male' or 'female'.
    """
    codes = {'male': 0, 'female': 1}
    unknown = sex[sex.notna() & ~sex.isin(list(codes))].unique()
    if len(unknown):
        raise ValueError(
            f"Unexpected Sex values in {frame_name} data: {sorted(map(str, unknown))}"
        )
    return sex.map(codes)


# Encode Sex
df_train['Sex'] = _encode_sex(df_train['Sex'], 'train')
df_test['Sex'] = _encode_sex(df_test['Sex'], 'test')

# Log1p transform the target
df_train['Calories'] = np.log1p(df_train['Calories'])

def _add_engineered_features(df):
    """Add the hand-crafted feature columns to ``df`` in place and return it.

    The same function is applied to the train and the test frame so both are
    guaranteed to receive identical transformations. Reads the columns
    ``Weight``, ``Height``, ``Heart_Rate``, ``Age``, ``Duration`` and ``Sex``
    (``Sex`` must already be numeric).
    """
    # Height / 100 - presumably centimetres to metres; verify against the data.
    height_m = df['Height'] / 100

    # Base features
    df['BMI'] = df['Weight'] / (height_m ** 2)
    df['Exercise_Intensity'] = df['Heart_Rate'] / df['Age']
    df['Duration_per_BMI'] = df['Duration'] / df['BMI']

    # NOTE(review): this resembles the Mifflin-St Jeor BMR equation, which takes
    # height in centimetres, whereas the height term here is divided by 100.
    # Left as is so the feature values stay identical - confirm the intent.
    df['BMR'] = 10 * df['Weight'] + 6.25 * height_m - 5 * df['Age'] + 5

    # NOTE(review): looks like the Deurenberg body-fat estimate, which is usually
    # stated with sex = 1 for males; Sex is encoded male=0 / female=1 when the
    # data is loaded - confirm the sign of the Sex term.
    df['Body_Fat_Percentage'] = (1.2 * df['BMI']) + (0.23 * df['Age']) - (10.8 * df['Sex']) - 5.4
    df['Resting_Heart_Rate'] = 0.7 * df['Heart_Rate'] + (0.3 * df['Age']) - 5.4

    # Interactions
    df['Height_x_Age'] = df['Height'] * df['Age']
    df['Weight_x_Duration'] = df['Weight'] * df['Duration']
    df['HeartRate_x_Duration'] = df['Heart_Rate'] * df['Duration']
    df['BMI_x_Duration'] = df['BMI'] * df['Duration']
    df['ExerciseIntensity_x_Duration'] = df['Exercise_Intensity'] * df['Duration']
    df['BMR_x_Duration'] = df['BMR'] * df['Duration']
    df['HeartRate_per_Weight'] = df['Heart_Rate'] / df['Weight']
    df['HeartRate_per_Duration'] = df['Heart_Rate'] / df['Duration']
    df['HeartRate_x_Duration2'] = df['Heart_Rate'] * (df['Duration'] ** 2)
    df['BMI_minus_Age'] = df['BMI'] - df['Age']
    df['log_HeartRate'] = np.log1p(df['Heart_Rate'])
    return df


# Base features + interactions, identical for both frames
df_train = _add_engineered_features(df_train)
df_test = _add_engineered_features(df_test)

# PCA feature (Heart Rate + Duration): single component, fitted on the
# training rows and then applied unchanged to the test rows
pca = PCA(n_components=1)
df_train['HR_Duration_PCA1'] = pca.fit_transform(
    df_train[['Heart_Rate', 'Duration']]
).ravel()
df_test['HR_Duration_PCA1'] = pca.transform(
    df_test[['Heart_Rate', 'Duration']]
).ravel()

# PCA of metabolic signals: same fit-on-train / apply-to-test pattern
metabolic_cols = ['BMR', 'Exercise_Intensity', 'Body_Temp']
pca_metab = PCA(n_components=1)
df_train['PCA_Metabolic_1'] = pca_metab.fit_transform(df_train[metabolic_cols]).ravel()
df_test['PCA_Metabolic_1'] = pca_metab.transform(df_test[metabolic_cols]).ravel()

# Feature subset used for modelling; the order here is the column order of
# every frame sliced with this list.
selected_features = [
    'PCA_Metabolic_1', 'Resting_Heart_Rate', 'Body_Fat_Percentage',
    'HeartRate_x_Duration', 'HR_Duration_PCA1', 'HeartRate_per_Duration',
    'Heart_Rate', 'HeartRate_x_Duration2', 'Height_x_Age',
    'Weight', 'Height', 'Weight_x_Duration',
    'ExerciseIntensity_x_Duration', 'Duration_per_BMI', 'HeartRate_per_Weight',
    'BMI_x_Duration', 'BMI_minus_Age', 'BMI',
    'log_HeartRate', 'BMR_x_Duration', 'BMR',
    'Body_Temp',
]

# Standardise the selected features only: the scaler is fitted on the training
# rows and the same fitted scaler is then applied to the test rows
scaler = StandardScaler()
scaler.fit(df_train[selected_features])
df_train[selected_features] = scaler.transform(df_train[selected_features])
df_test[selected_features] = scaler.transform(df_test[selected_features])

# Final train set (the target was log1p-transformed when the data was loaded)
X_train, y_train = df_train[selected_features], df_train['Calories']


In [None]:
# The target was log1p-transformed when the data was loaded, so RMSE on it is
# measured in log space; `rmsle` is the alternative metric defined above.
model = RapidsKNNWrapper(
    metric_fn=rmse,
    model_params={
        'n_neighbors': 25,
        'metric': 'manhattan',
        # NOTE(review): scikit-learn's tree options such as 'ball_tree' may not
        # be available in cuML - check the cuML docs before changing this.
        'algorithm': 'brute',
    },
)

# 3-fold cross-validated training; fills model.oof_preds
model.fit(X=X_train, y=y_train, folds=3)

In [None]:
# Persist the out-of-fold predictions. They are on the scale the model was
# trained on (the log1p-transformed target), not the original target scale.
np.save(file='knn1_oof_preds.npy', arr=model.oof_preds)


In [None]:
# Save the 'id' column, then drop it from the test frame.
# Guarded so this cell can be re-run: after the first run 'id' is already gone
# from df_test, and the `test_ids` captured earlier is reused instead of
# failing with a KeyError.
if 'id' in df_test.columns:
    test_ids = df_test['id']
    df_test = df_test.drop('id', axis=1)

# Make predictions on test data (mean over the fold models)
test_preds = model.predict(df_test[selected_features])

# The model was trained on log1p(Calories); map predictions back to the
# original scale
test_preds = np.expm1(test_preds)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': test_preds
})

# Save submission file
submission.to_csv('knn1_submission.csv', index=False)