In [1]:
import pandas as pd 
import numpy as np 
import os, gc 
import time
import logging 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from tqdm.auto import tqdm
from itertools import combinations
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the raw competition files. The training frame is indexed by `id`,
# while the test frame keeps `id` as an ordinary column.
train_df = pd.read_csv("train.csv").set_index('id')
test_df = pd.read_csv(filepath_or_buffer="test.csv")

train_df

Unnamed: 0_level_0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [3]:
# Shared label-encoder instance.
le = LabelEncoder()

# Columns holding string categories.
categorical_cols = [
    'Sex',
]
# Numeric columns that take part in the pairwise interaction features.
numerical_cols = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
]

def feature_engineering(df, numerical_features, categorical_features=None, category_maps=None):
    """Integer-encode categorical columns and append pairwise product features.

    ``df`` is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``numerical_features`` and in
        the categorical column list.
    numerical_features : sequence of str
        Columns to cross. For each unordered pair ``(a, b)``, taken in the
        order given, a column ``"{a}_x_{b}"`` equal to ``df[a] * df[b]`` is
        added.
    categorical_features : sequence of str, optional
        Columns to encode. When omitted, the module-level ``categorical_cols``
        list is used.
    category_maps : dict, optional
        ``{column: {label: code}}``. Mappings already present are reused;
        mappings built during this call are stored into the dict. Passing the
        same dict for the train and test frames guarantees both receive
        identical codes. When omitted, codes are derived from ``df`` alone.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.

    Raises
    ------
    ValueError
        If a column holds a non-null label that is absent from a supplied
        mapping.
    """
    if categorical_features is None:
        categorical_features = categorical_cols

    for col in categorical_features:
        mapping = None if category_maps is None else category_maps.get(col)
        if mapping is None:
            # Sorted unique labels -> 0..n-1. Missing values are left out of
            # the mapping and therefore stay missing after encoding.
            ordered_labels = sorted(df[col].dropna().unique())
            mapping = {label: code for code, label in enumerate(ordered_labels)}
            if category_maps is not None:
                category_maps[col] = mapping

        encoded = df[col].map(mapping)
        # A value that was present but has no code means the mapping was built
        # on data that never saw this label; fail loudly instead of emitting NaN.
        unseen = df[col].notna() & encoded.isna()
        if unseen.any():
            raise ValueError(
                f"Column {col!r} contains labels missing from its category map: "
                f"{df.loc[unseen, col].unique().tolist()}"
            )
        df[col] = encoded

    # Pairwise interaction terms, in the same order a nested i < j loop yields.
    for feature1, feature2 in combinations(numerical_features, 2):
        df[f"{feature1}_x_{feature2}"] = df[feature1] * df[feature2]

    gc.collect()
    return df


# Build the engineered feature set for both splits (test first, then train).
test_df = feature_engineering(df=test_df, numerical_features=numerical_cols)
train_df = feature_engineering(df=train_df, numerical_features=numerical_cols)

# The model is trained on log1p(Calories); pop() removes the target column
# from the feature frame in place while handing it back for the transform.
labels = np.log1p(train_df.pop('Calories'))
train_df.describe()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Age_x_Height,Age_x_Weight,Age_x_Duration,...,Height_x_Weight,Height_x_Duration,Height_x_Heart_Rate,Height_x_Body_Temp,Weight_x_Duration,Weight_x_Heart_Rate,Weight_x_Body_Temp,Duration_x_Heart_Rate,Duration_x_Body_Temp,Heart_Rate_x_Body_Temp
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,...,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,0.499039,41.420404,174.697685,75.145668,15.421015,95.483995,40.036253,7238.379235,3128.200032,640.729477,...,13299.557672,2690.8083,16679.229017,6993.894303,1156.387451,7174.893501,3008.292357,1541.562606,623.283247,3828.687447
std,0.499999,15.175049,12.824496,13.982704,8.354095,9.449845,0.779875,2712.869502,1334.431304,443.075437,...,3407.211385,1473.626587,2047.188593,526.939776,672.877571,1517.486807,561.697333,932.45348,343.646487,437.967454
min,0.0,20.0,126.0,36.0,1.0,67.0,37.1,2700.0,860.0,20.0,...,5289.0,135.0,9983.0,5027.4,45.0,3000.0,1450.8,67.0,37.1,2485.7
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6,4914.0,2046.0,300.0,...,10354.0,1440.0,15219.0,6568.9,600.0,5980.0,2526.3,728.0,317.6,3497.4
50%,0.0,40.0,174.0,74.0,15.0,95.0,40.3,6920.0,2912.0,550.0,...,12900.0,2669.0,16587.0,6987.2,1105.0,7029.0,2960.0,1455.0,606.0,3838.0
75%,1.0,52.0,185.0,87.0,23.0,103.0,40.7,9168.0,3978.0,884.0,...,16016.0,3933.0,18050.0,7402.9,1633.0,8272.0,3468.0,2323.0,931.5,4171.5
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5,16748.0,9401.0,2370.0,...,28776.0,6540.0,26199.0,9168.6,3780.0,15129.0,5412.0,3840.0,1245.0,5286.4


In [4]:
def prep_submission(ids, preds, target_col='preds', path="submission.csv"):
    """Write an id/prediction table to a CSV file.

    Parameters
    ----------
    ids : array-like
        Row identifiers, one per prediction.
    preds : array-like
        Predicted values, in the same order as ``ids``.
    target_col : str, optional
        Header of the prediction column. Defaults to ``'preds'``.
        NOTE(review): confirm this matches the header the scoring system
        expects before submitting.
    path : str or path-like, optional
        Destination file. Defaults to ``"submission.csv"`` in the working
        directory.

    Raises
    ------
    ValueError
        If ``ids`` and ``preds`` differ in length (raised by pandas).
    """
    # Pair ids and predictions by position. Passing two Series straight to the
    # DataFrame constructor would align them on their index labels instead,
    # which can silently introduce NaN or mis-paired rows.
    submission_df = pd.DataFrame({
        'id': np.asarray(ids),
        target_col: np.asarray(preds),
    })
    submission_df.to_csv(path, index=False)
    print("Submission file saved successfully!")

In [5]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold

# Feature matrix and log1p-transformed target.
X_train = train_df
y_train = labels

# Out-of-fold predictions: each row is filled by the fold that held it out.
oof_preds_xgb = np.zeros(len(X_train))

# Hyper-parameters shared by every fold's regressor.
xgb_params = dict(
    device="cuda",
    max_depth=10,
    colsample_bytree=0.75,
    subsample=0.9,
    n_estimators=5500,
    learning_rate=0.02,
    gamma=0.01,
    max_delta_step=2,
    early_stopping_rounds=100,
    eval_metric="rmse",
    enable_categorical=True,
)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_trn, y_trn = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    print(f"\nFold: {fold}\n")

    # A fresh regressor per fold; the held-out fold doubles as the
    # early-stopping evaluation set.
    model = XGBRegressor(**xgb_params)
    model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100)

    # Store this fold's held-out predictions.
    oof_preds_xgb[val_idx] = model.predict(X_val)

# Persist the out-of-fold predictions.
np.save("xgb_oof_preds.npy", oof_preds_xgb)


Fold: 0

[0]	validation_0-rmse:0.94626
[100]	validation_0-rmse:0.14231
[200]	validation_0-rmse:0.06301
[300]	validation_0-rmse:0.06035
[400]	validation_0-rmse:0.06025
[500]	validation_0-rmse:0.06027
[514]	validation_0-rmse:0.06028

Fold: 1

[0]	validation_0-rmse:0.94245
[100]	validation_0-rmse:0.14070
[200]	validation_0-rmse:0.06138
[300]	validation_0-rmse:0.05879
[400]	validation_0-rmse:0.05874
[476]	validation_0-rmse:0.05875

Fold: 2

[0]	validation_0-rmse:0.95167
[100]	validation_0-rmse:0.14368
[200]	validation_0-rmse:0.06471
[300]	validation_0-rmse:0.06209
[400]	validation_0-rmse:0.06196
[500]	validation_0-rmse:0.06196
[519]	validation_0-rmse:0.06196

Fold: 3

[0]	validation_0-rmse:0.94194
[100]	validation_0-rmse:0.14123
[200]	validation_0-rmse:0.06204
[300]	validation_0-rmse:0.05936
[400]	validation_0-rmse:0.05928
[500]	validation_0-rmse:0.05930
[520]	validation_0-rmse:0.05930

Fold: 4

[0]	validation_0-rmse:0.94625
[100]	validation_0-rmse:0.14161
[200]	validation_0-rmse:0.06198


In [6]:
# Remove the identifier column, if present, so only feature columns reach the
# model. `errors="ignore"` replaces the former bare `except:`, which would
# also have hidden unrelated failures.
test_df_without_id = test_df.drop(columns="id", errors="ignore")

# Take identifiers from the `id` column when it exists, otherwise from the
# frame's index, so both branches produce a usable submission.
submission_ids = test_df["id"] if "id" in test_df.columns else test_df.index

# NOTE(review): `model` is the object left behind by the last iteration of the
# cross-validation loop, so only that fold's regressor is used here.
preds = model.predict(test_df_without_id)
# Undo the log1p transform applied to the training target.
preds = np.expm1(preds)
prep_submission(ids=submission_ids, preds=preds)

Submission file saved successfully!
