In [1]:
import numpy as np
from sklearn.metrics import log_loss,accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pandas as pd 
import xgboost as xgb
import json
from collections import Counter

import gc
from tqdm import tqdm
from itertools import combinations

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Read the raw CSVs. The training frame is indexed by its `id` column, whereas
# the test frame keeps `id` as an ordinary column.
train_df = pd.read_csv(
    "train.csv",
    index_col="id",
)
test_df = pd.read_csv("test.csv")

# Bare expression: renders the training frame as the cell output.
train_df

Unnamed: 0_level_0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [3]:
# Encoder instance; only referenced from commented-out code in feature_engineering.
le = LabelEncoder()

# Columns treated as categorical (encoded in feature_engineering and passed to
# CatBoost as cat_features).
categorical_cols = ['Sex']

# Raw input feature columns: the categorical ones followed by the numeric ones.
all_cols = categorical_cols + [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
]

def feature_engineering(df):
    """Return a copy of ``df`` with encoded categoricals and engineered features.

    The input frame is NOT modified; all work happens on a copy, so calling
    this function twice on the same raw frame yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Age``, ``Height``, ``Weight``,
        ``Duration``, ``Heart_Rate`` and ``Body_Temp``. Any other columns
        are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column in ``categorical_cols`` replaced by its
        integer category code, followed by the derived columns in the order
        they are assigned below.

    Notes
    -----
    * Category codes and the 6-quantile bin edges are computed from the frame
      that is passed in. Two frames processed separately (e.g. train and
      test) therefore get their own codes and their own bin edges.
      NOTE(review): confirm this is intended; shared edges would require
      fitting on one frame and reusing them for the other.
    * Column names containing ``log10p`` hold a plain ``log10`` (not
      ``log10(1 + x)``); the names are kept as-is so downstream code and
      saved models keep working.
    * ``pd.qcut`` raises if a column does not have 6 distinct quantile edges.
    """
    # Copy first so the caller's DataFrame is never rewritten behind its back.
    df = df.copy()

    for c in categorical_cols:
        # category dtype -> integer codes (codes follow the sorted categories
        # present in this frame).
        df[c] = df[c].astype('category').cat.codes

    bin_labels = [1, 2, 3, 4, 5, 6]

    def _bin6(col):
        # Equal-frequency bin number (1..6) of each value in `col`.
        return pd.qcut(df[col], q=6, labels=bin_labels).astype(int)

    def _log10(col):
        # Base-10 logarithm of `col` as float.
        return np.log10(df[col]).astype(float)

    # Shared intermediates, computed once instead of once per derived feature.
    age_bin6 = _bin6('Age')
    height_log10 = _log10('Height')
    heart_rate_log10 = _log10('Heart_Rate')

    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2  # Height / 100: cm -> m presumably
    df['Intensity'] = df['Heart_Rate'] / df['Duration']

    # Assignment order is preserved from the original so the resulting column
    # order (and therefore the model's feature order) does not change.
    df['Age_bin6'] = age_bin6
    df['Height_log10p'] = height_log10
    df['Sex_Weight_prod'] = df['Sex'] * df['Weight']
    df['Sex_Age_log10p_sum'] = df['Sex'] + _log10('Age')
    df['Sex_Age_bin6_sum'] = df['Sex'] + age_bin6
    df['Sex_Duration_log10p_prod'] = df['Sex'] * _log10('Duration')
    df['Sex_Body_Temp_log10p_prod'] = df['Sex'] * _log10('Body_Temp')
    df['Age_Age_bin6_ratio'] = df['Age'] / age_bin6
    df['Height_Height_log10p_prod'] = df['Height'] * height_log10
    df['Height_Height_bin6_ratio'] = df['Height'] / _bin6('Height')
    df['Height_Duration_log1p_sum'] = df['Height'] + np.log1p(df['Duration'].astype(float))
    df['Heart_Rate_log10p_Duration_ratio'] = heart_rate_log10 / df['Duration']
    df['Body_Temp_bin6_Duration_ratio'] = _bin6('Body_Temp') / df['Duration']
    df['Heart_Rate_Heart_Rate_log10p_prod'] = df['Heart_Rate'] * heart_rate_log10
    df['Body_Temp_Height_log10p_prod'] = df['Body_Temp'] * height_log10
    df['Body_Temp_Height_log1p_ratio'] = df['Body_Temp'] / np.log1p(df['Height']).astype(float)
    df['Body_Temp_Heart_Rate_log10p_diff'] = df['Body_Temp'] - heart_rate_log10
    df['Age_bin6_Weight_log10p_diff'] = age_bin6 - _log10('Weight')

    gc.collect()
    return df


# Build the engineered features for both splits (test first, then train).
test_df = feature_engineering(test_df)
train_df = feature_engineering(train_df)

# Separate the regression target from the training features: `pop` returns the
# column and removes it from the frame in one step.
labels = train_df.pop('Calories')

# Summary statistics of the final training features (cell output).
train_df.describe()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,BMI,Intensity,Age_bin6,...,Height_Height_log10p_prod,Height_Height_bin6_ratio,Height_Duration_log1p_sum,Heart_Rate_log10p_Duration_ratio,Body_Temp_bin6_Duration_ratio,Heart_Rate_Heart_Rate_log10p_prod,Body_Temp_Height_log10p_prod,Body_Temp_Height_log1p_ratio,Body_Temp_Heart_Rate_log10p_diff,Age_bin6_Weight_log10p_diff
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,...,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,0.499039,41.420404,174.697685,75.145668,15.421015,95.483995,40.036253,24.374817,10.5474,3.426148,...,391.926944,70.515157,177.316392,0.231794,0.249516,189.255584,89.724903,7.751446,38.058475,1.55782
std,0.499999,15.175049,12.824496,13.982704,8.354095,9.449845,0.779875,1.51131,12.23771,1.724059,...,34.32631,44.365307,12.816835,0.290182,0.120077,22.798371,2.131531,0.189793,0.745391,1.719679
min,0.0,20.0,126.0,36.0,1.0,67.0,37.1,12.375937,2.714286,1.0,...,264.646689,31.5,128.302585,0.063794,0.033333,122.347012,80.313583,6.994436,35.155517,-1.10721
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6,23.255019,4.521739,2.0,...,363.234391,37.2,166.833213,0.088057,0.2,171.114475,88.447102,7.637057,37.665502,0.102373
50%,0.0,40.0,174.0,74.0,15.0,95.0,40.3,24.391059,6.214286,3.0,...,389.855569,56.333333,177.218876,0.130603,0.222222,187.883743,89.887092,7.768996,38.278811,1.236572
75%,1.0,52.0,185.0,87.0,23.0,103.0,40.7,25.487697,10.75,5.0,...,419.42677,83.0,187.332205,0.241177,0.25,207.322234,91.267715,7.887332,38.643095,3.102373
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5,46.443986,108.0,6.0,...,520.89036,161.0,225.401197,2.033424,1.0,269.722876,96.904378,8.402611,39.508774,4.387216


In [4]:
#%pip install catboost
import catboost as cb
import numpy as np
import pandas as pd  

# CatBoost settings shared by every model fitted in this notebook.
# Values commented out in the earlier revision, kept for reference:
#   learning_rate=0.010855964248935647, depth=10,
#   l2_leaf_reg=7.3734014072244, min_data_in_leaf=56
params = {
    'verbose': 500,
    'task_type': 'GPU',
    'devices': '0',
    'iterations': 5000,
    # Listed once: the original literal repeated this key, and Python silently
    # keeps only the last value of a duplicated dict key.
    'boosting_type': 'Plain',
    'random_seed': 1,
    'loss_function': 'RMSE',
    #'eval_metric': RMSLEMetric(),
    'cat_features': categorical_cols,

    'learning_rate': 0.007535109261794342,
    'depth': 12,
    'l2_leaf_reg': 7.724042344564615,
    'min_data_in_leaf': 29,
}

# Disabled: single fit on the full training set, saved to disk.
# Written as real comments rather than a bare string literal, which the
# notebook would evaluate and echo back as the cell's output.
#
# pool = cb.Pool(train_df, np.log1p(labels), cat_features=categorical_cols)
# model = cb.CatBoostRegressor(**params)
# model.fit(pool)
# model.save_model("models/catboost_new_new.json")

'\npool = cb.Pool(train_df, np.log1p(labels), cat_features=categorical_cols)\nmodel = cb.CatBoostRegressor(**params)\nmodel.fit(pool) \nmodel.save_model("models/catboost_new_new.json")'

In [5]:
import numpy as np
from sklearn.model_selection import KFold

# 5-fold cross-validated CatBoost training on the log1p-transformed target.
#
# Produces:
#   * oof_preds_cb - out-of-fold predictions for every training row, converted
#     back to the original target scale and saved to "cb_oof_preds.npy";
#   * predicts     - one column of test-set predictions per fold. These are
#     left in log1p space; apply np.expm1 before using them as target values.
#
# NOTE(review): the recorded run of this cell stopped in fold 0 with
# "CatBoostError: ... CUDA error 999: unknown error", so the outputs above were
# not produced by that run.
predicts = pd.DataFrame()

y_train = np.log1p(labels)
X_train = train_df

# Test features without the `id` column. Built once here because the frame is
# identical for every fold; DataFrame.drop would otherwise copy it each time.
X_test = test_df.drop("id", axis=1)

oof_preds_cb = np.zeros(len(X_train))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_trn, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    print(f"\nFold: {fold}\n")

    pool_train = cb.Pool(X_trn, y_trn, cat_features=categorical_cols)
    pool_val = cb.Pool(X_val, y_val, cat_features=categorical_cols)
    model = cb.CatBoostRegressor(**params)

    # The held-out fold doubles as the evaluation set for early stopping.
    model.fit(
        pool_train,
        eval_set=pool_val,
        early_stopping_rounds=100,
        use_best_model=True,
        verbose=1000,
    )

    # Positions in val_idx are row positions, matching the .iloc split above.
    oof_preds_cb[val_idx] = model.predict(X_val)

    predicts[f"fold{fold}_predicts"] = model.predict(X_test)

# Save OOF predictions (back-transformed from log1p to the original scale)
oof_preds_cb = np.expm1(oof_preds_cb)
np.save("cb_oof_preds.npy", oof_preds_cb)


Fold: 0



CatBoostError: catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 999: unknown error