In [2]:
# Columns fed to every model as features, and the name of the column that
# holds the regression target.
predictors = [
    'Sex',
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
]
target = 'z'

# Parameter set passed to lgb.train.
params_lgb = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    max_depth=-1,
    verbose=-1,
    seed=42,
)

# Parameter set passed to xgb.train.
# NOTE: a "max_depth": 6 entry is kept disabled, so no depth is passed here.
params_xgb = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    learning_rate=0.05,
    verbosity=0,
    seed=42,
)

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import lightgbm as lgb
import xgboost as xgb
# from sklearn.metrics import mean_squared_error as mse
# Encoder for the 'Sex' column: fitted on the training frame and then reused
# (transform only) on the test frame so both share the same integer codes.
enc = LabelEncoder()

# ---- Training frame ----
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
# The models are trained on log1p(Calories), stored under the target name 'z'.
train['z'] = np.log1p(train['Calories'])
X, z = train[predictors], train[target]

# Rows with encoded Sex == 0 form the "f" subset; every other row goes to "m".
# NOTE(review): treating code 0 as female relies on how the encoder orders the
# raw labels — confirm against the values in train.csv.
indices = train['Sex'] == 0
Xf, zf = X.loc[indices], z.loc[indices]
Xm, zm = X.loc[~indices], z.loc[~indices]

# ---- Test frame ----
test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])

# Same split rule as for the training frame.
indices_test = test['Sex'] == 0
test_f, test_m = test.loc[indices_test], test.loc[~indices_test]

# 2-model HGB

In [3]:
from sklearn.ensemble import HistGradientBoostingRegressor



# Fit one HistGradientBoostingRegressor per sex subset, both with the same
# hyper-parameters, then merge their test predictions into one vector.
hgb_settings = dict(learning_rate=0.05, max_iter=1000, random_state=42)

model_f_hgb = HistGradientBoostingRegressor(**hgb_settings)
model_m_hgb = HistGradientBoostingRegressor(**hgb_settings)

model_f_hgb.fit(Xf, zf)
model_m_hgb.fit(Xm, zm)

z_pred_f_hgb = model_f_hgb.predict(test_f[predictors])
z_pred_m_hgb = model_m_hgb.predict(test_m[predictors])

# Place each subset's predictions at that subset's row positions in a
# test-length vector.
z_pred_hgb = np.zeros(len(test))
for row_mask, subset_pred in (
    (indices_test, z_pred_f_hgb),
    (~indices_test, z_pred_m_hgb),
):
    z_pred_hgb[row_mask] = subset_pred

# Invert the log1p target transform, then floor the result at zero.
calories_hgb = np.expm1(z_pred_hgb)
y_pred = np.clip(calories_hgb, 0, None)

# 2-model LGB

In [None]:
# Wrap each sex subset in a LightGBM Dataset and train one booster per subset
# using the shared params_lgb configuration.
lgb_train_f = lgb.Dataset(Xf, label=zf)
lgb_train_m = lgb.Dataset(Xm, label=zm)

model_f = lgb.train(params_lgb, lgb_train_f, num_boost_round=1000)
model_m = lgb.train(params_lgb, lgb_train_m, num_boost_round=1000)

# Each booster only scores the test rows of its own subset.
z_pred_f_lgb = model_f.predict(test_f[predictors])
z_pred_m_lgb = model_m.predict(test_m[predictors])

# Reassemble a single test-length prediction vector from the two subsets.
z_pred_lgb = np.zeros(len(test))
z_pred_lgb[indices_test] = z_pred_f_lgb
z_pred_lgb[~indices_test] = z_pred_m_lgb

# Invert the log1p target transform, then floor the result at zero.
calories_lgb = np.expm1(z_pred_lgb)
y_pred = np.clip(calories_lgb, 0, None)

# 2-model XGB

In [7]:
# Build one DMatrix per sex subset and train one XGBoost booster on each,
# both with the shared params_xgb configuration.
xgb_train_f = xgb.DMatrix(Xf, label=zf)
xgb_train_m = xgb.DMatrix(Xm, label=zm)

model_f_xgb = xgb.train(params_xgb, xgb_train_f, num_boost_round=1000)
model_m_xgb = xgb.train(params_xgb, xgb_train_m, num_boost_round=1000)

# The boosters predict on DMatrix inputs, so wrap the test subsets first.
xgb_test_f = xgb.DMatrix(test_f[predictors])
xgb_test_m = xgb.DMatrix(test_m[predictors])
z_pred_f_xgb = model_f_xgb.predict(xgb_test_f)
z_pred_m_xgb = model_m_xgb.predict(xgb_test_m)

# Reassemble a single test-length prediction vector from the two subsets.
z_pred_xgb = np.zeros(len(test))
z_pred_xgb[indices_test] = z_pred_f_xgb
z_pred_xgb[~indices_test] = z_pred_m_xgb

# Ensemble of LGB, XGB, HGB

## Mean

### Cross-validation results

In [4]:
# Cross-validation setup
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

enc = LabelEncoder()

# Prepare the train data: encode 'Sex' as integers and build the log1p target.
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
X = train[predictors]
z = train[target]

# 5-fold cross-validation of the two-model (one per sex) ensemble.
# For every fold: fit HGB, LightGBM and XGBoost separately on the Sex == 0 and
# Sex != 0 training rows, combine the three models' validation predictions by
# mean and by median, and score both combinations with RMSE on the log1p target.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []  # one (rmse_mean, rmse_median) tuple per fold
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    z_train, z_val = z.iloc[train_index], z.iloc[val_index]
    indices_train = X_train['Sex'] == 0
    indices_val = X_val['Sex'] == 0

    # Separate the datasets once per fold instead of re-slicing for each model.
    X_train_f, z_train_f = X_train[indices_train], z_train[indices_train]
    X_train_m, z_train_m = X_train[~indices_train], z_train[~indices_train]
    X_val_f, X_val_m = X_val[indices_val], X_val[~indices_val]

    # Train the models
    ## HGB
    model_f_hgb = HistGradientBoostingRegressor(learning_rate=0.05,
                                                max_iter=1000,
                                                random_state=42)
    model_m_hgb = HistGradientBoostingRegressor(learning_rate=0.05,
                                                max_iter=1000,
                                                random_state=42)
    model_f_hgb.fit(X_train_f, z_train_f)
    model_m_hgb.fit(X_train_m, z_train_m)
    ## LGB
    lgb_train_f = lgb.Dataset(X_train_f, label=z_train_f)
    lgb_train_m = lgb.Dataset(X_train_m, label=z_train_m)
    model_f = lgb.train(params_lgb,
                        lgb_train_f,
                        num_boost_round=1000)
    model_m = lgb.train(params_lgb,
                        lgb_train_m,
                        num_boost_round=1000)
    ## XGB
    xgb_train_f = xgb.DMatrix(X_train_f, label=z_train_f)
    xgb_train_m = xgb.DMatrix(X_train_m, label=z_train_m)
    model_f_xgb = xgb.train(params_xgb,
                            xgb_train_f,
                            num_boost_round=1000)
    model_m_xgb = xgb.train(params_xgb,
                            xgb_train_m,
                            num_boost_round=1000)

    # Make predictions on the validation rows of each subset
    z_pred_hgb_f = model_f_hgb.predict(X_val_f)
    z_pred_lgb_f = model_f.predict(X_val_f)
    z_pred_xgb_f = model_f_xgb.predict(xgb.DMatrix(X_val_f))

    z_pred_hgb_m = model_m_hgb.predict(X_val_m)
    z_pred_lgb_m = model_m.predict(X_val_m)
    z_pred_xgb_m = model_m_xgb.predict(xgb.DMatrix(X_val_m))

    # Ensemble by means and medians (element-wise across the three models)
    preds_f = [z_pred_hgb_f, z_pred_lgb_f, z_pred_xgb_f]
    preds_m = [z_pred_hgb_m, z_pred_lgb_m, z_pred_xgb_m]
    z_pred_f_mean = np.mean(preds_f, axis=0)
    z_pred_f_median = np.median(preds_f, axis=0)
    z_pred_m_mean = np.mean(preds_m, axis=0)
    z_pred_m_median = np.median(preds_m, axis=0)

    # Combine predictions back into validation-row order
    z_pred_mean = np.zeros(len(X_val))
    z_pred_median = np.zeros(len(X_val))
    z_pred_mean[indices_val] = z_pred_f_mean
    z_pred_median[indices_val] = z_pred_f_median
    z_pred_mean[~indices_val] = z_pred_m_mean
    z_pred_median[~indices_val] = z_pred_m_median

    # Calculate RMSE; mean_squared_error expects (y_true, y_pred) in that order.
    rmse_mean = np.sqrt(mean_squared_error(z_val, z_pred_mean))
    rmse_median = np.sqrt(mean_squared_error(z_val, z_pred_median))
    print(f"RMSE (mean): {rmse_mean}, RMSE (median): {rmse_median}")
    rmses.append((rmse_mean, rmse_median))

RMSE (mean): 0.05957968473024043, RMSE (median): 0.05966007361370051
RMSE (mean): 0.06057475373718219, RMSE (median): 0.06070332258385359
RMSE (mean): 0.05948436169246183, RMSE (median): 0.059734527877212285
RMSE (mean): 0.060023019053754324, RMSE (median): 0.06007363141063316
RMSE (mean): 0.059515753432951554, RMSE (median): 0.059614183073461564


In [8]:
# Element-wise average of the three models' log-scale test predictions.
model_preds = [z_pred_hgb, z_pred_lgb, z_pred_xgb]
z_pred = np.mean(model_preds, axis=0)

# Invert the log1p target transform, then floor the result at zero.
calories = np.expm1(z_pred)
y_pred = np.clip(calories, 0, None)

'2.1.3'

## Median

In [10]:
# Element-wise median of the three models' log-scale test predictions.
model_preds = [z_pred_hgb, z_pred_lgb, z_pred_xgb]
z_pred = np.median(model_preds, axis=0)

# Invert the log1p target transform, then floor the result at zero.
calories = np.expm1(z_pred)
y_pred = np.clip(calories, 0, None)

# Submission

In [11]:
def generate_submission(y_pred, submission_name):
    """Fill the sample submission with predictions and save it as a CSV file.

    Reads 'sample_submission.csv' from the current working directory,
    overwrites its 'Calories' column with ``y_pred`` and writes the frame,
    without the index, to ``submission_name``.

    Parameters
    ----------
    y_pred : array-like
        Values assigned to the 'Calories' column.
    submission_name : str or path-like
        Destination path of the CSV file. If it names a folder that does not
        exist yet, that folder is created before writing.

    Returns
    -------
    None
    """
    import os

    df_submission = pd.read_csv('sample_submission.csv')
    df_submission['Calories'] = y_pred

    # to_csv does not create missing folders, so make sure the parent exists;
    # a bare file name has an empty dirname and needs no folder.
    out_dir = os.path.dirname(submission_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_submission.to_csv(submission_name, index=False)

from pathlib import Path

# Name of the submission file (without extension) for the current y_pred.
submission_name = '2model_median_lgb_xgb_hgb'

# Build the path with pathlib instead of a hard-coded backslash, which only
# acts as a folder separator on Windows, and make sure the folder exists.
submission_dir = Path('submissions')
submission_dir.mkdir(parents=True, exist_ok=True)
generate_submission(y_pred, str(submission_dir / f'{submission_name}.csv'))