In [18]:
train_path = 'train.csv'
test_path = 'test.csv'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv(train_path)

In [17]:
def outlier_bounds(data, column, prop=0.05):
    lower_bound = data[column].quantile(prop/2)
    upper_bound = data[column].quantile(1 - prop/2)

    return lower_bound, upper_bound

def outlier_removal(data, columns):
    indices = [True] * len(data)
    for column in columns:
        lower_bound, upper_bound = outlier_bounds(data, column)
        indices &= (data[column] >= lower_bound) & (data[column] <= upper_bound)
    return data[indices]


In [30]:
numeric_columns = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories']

for column in numeric_columns:
    print(column, outlier_bounds(train, column))

Age (np.float64(20.0), np.float64(73.0))
Height (np.float64(152.0), np.float64(198.0))
Weight (np.float64(53.0), np.float64(101.0))
Duration (np.float64(2.0), np.float64(29.0))
Heart_Rate (np.float64(78.0), np.float64(113.0))
Body_Temp (np.float64(38.0), np.float64(41.0))
Calories (np.float64(6.0), np.float64(217.0))


In [19]:
train = outlier_removal(train, numeric_columns)
train.shape

  indices &= (data[column] >= lower_bound) & (data[column] <= upper_bound)


(621216, 9)

In [31]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
enc = LabelEncoder()

numeric_columns.remove('Calories')
train = pd.read_csv('train.csv')
train['z'] = np.log1p(train['Calories'])
train = outlier_removal(train, numeric_columns+['z'])
train['Sex'] = enc.fit_transform(train['Sex'])
X_train = train.drop(columns=['id', 'Calories', 'z'])
# y_train = train['Calories']
# z_train = np.log1p(y_train)
z_train = train['z']
lgb_train = lgb.Dataset(X_train, label=z_train)

test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])
X_test = test.drop(columns=['id'])
# y_train = test['Calories']
# lgb_test = lgb.Dataset(X_test)

params = {
    # "objective": rmsle_lgb_obj,
    "learning_rate": 0.1,
    "max_depth": -1,
    "verbose": -1,
    "seed": 42,
}

model = lgb.train(params,
                  lgb_train,
                  num_boost_round=1000,
                #   feval=rmsle_lgb_eval,
                #   valid_sets=[lgb_train, lgb_eval],
                #   early_stopping_rounds=10,
                # callbacks = [lgb.record_evaluation(evals)]
                )

z_pred = model.predict(X_test)
y_pred = np.expm1(z_pred)
y_pred = np.clip(y_pred, 0, None)  # Ensure predictions are non-negative

  indices &= (data[column] >= lower_bound) & (data[column] <= upper_bound)


Outlier makes model perform a lot worse

# 2-model solution

In [17]:
predictors = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp']
target = 'z'
params_lgb = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": -1,
    "verbose": -1,
    "seed": 42,
}

params_xgb = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    # "max_depth": 6,
    "verbosity": 0,
    "seed": 42,
}

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error as rsme
enc = LabelEncoder()

train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
indices = train['Sex'] == 0
X = train[predictors]
z = train[target]

Xf = X[indices]
zf = z[indices]

Xm = X[~indices]
zm = z[~indices]




import lightgbm as lgb
from sklearn.model_selection import KFold
import numpy as np
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": -1,
    "verbose": -1,
    "seed": 42,
}

print("Female model")
from sklearn.model_selection import train_test_split
X_train, X_test, z_train, z_test = train_test_split(Xf, zf, test_size=0.2, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmses = []
for i, (train_idx, val_idx) in (enumerate(kf.split(X))):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    z_tr, z_val = z.iloc[train_idx], z.iloc[val_idx]
    lgb_train_cv = lgb.Dataset(X_tr, label=z_tr)
    model_cv = lgb.train(params,
                         lgb_train_cv,
                         num_boost_round=1000)
    z_pred_cv = model_cv.predict(X_val)
    cv_rmses.append(rsme(z_val, z_pred_cv))
    print("Fold RMSE:", cv_rmses[-1])
print("5-Fold CV RMSE (mean):", np.mean(cv_rmses))

print("Male model")
from sklearn.model_selection import train_test_split
X_train, X_test, z_train, z_test = train_test_split(Xm, zm, test_size=0.2, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmses = []
for i, (train_idx, val_idx) in (enumerate(kf.split(X))):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    z_tr, z_val = z.iloc[train_idx], z.iloc[val_idx]
    lgb_train_cv = lgb.Dataset(X_tr, label=z_tr)
    model_cv = lgb.train(params,
                         lgb_train_cv,
                         num_boost_round=1000)
    z_pred_cv = model_cv.predict(X_val)
    cv_rmses.append(rsme(z_val, z_pred_cv))
    print("Fold RMSE:", cv_rmses[-1])
print("5-Fold CV RMSE (mean):", np.mean(cv_rmses))



Fold RMSE: 0.06026363889985602
Fold RMSE: 0.06061225683382612
Fold RMSE: 0.06014281436773985
Fold RMSE: 0.06037775873673849
Fold RMSE: 0.05990351040613537
5-Fold CV RMSE (mean): 0.060259995848859174


In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error as rsme
enc = LabelEncoder()

train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
indices = train['Sex'] == 0
X = train[predictors]
z = train[target]

Xf = X[indices]
zf = z[indices]

Xm = X[~indices]
zm = z[~indices]




import lightgbm as lgb
from sklearn.model_selection import KFold
import numpy as np
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": -1,
    "verbose": -1,
    "seed": 42,
}

print("Female model")


kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmses = []
for i, (train_idx, val_idx) in (enumerate(kf.split(Xf))):
    X_tr, X_val = Xf.iloc[train_idx], Xf.iloc[val_idx]
    z_tr, z_val = zf.iloc[train_idx], zf.iloc[val_idx]
    lgb_train_cv = lgb.Dataset(X_tr, label=z_tr)
    model_cv = lgb.train(params,
                         lgb_train_cv,
                         num_boost_round=1000)
    z_pred_cv = model_cv.predict(X_val)
    cv_rmses.append(rsme(z_val, z_pred_cv))
    print("Fold RMSE:", cv_rmses[-1])
print("5-Fold CV RMSE (mean):", np.mean(cv_rmses))

print("Male model")


kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmses = []
for i, (train_idx, val_idx) in (enumerate(kf.split(Xm))):
    X_tr, X_val = Xm.iloc[train_idx], Xm.iloc[val_idx]
    z_tr, z_val = zm.iloc[train_idx], zm.iloc[val_idx]
    lgb_train_cv = lgb.Dataset(X_tr, label=z_tr)
    model_cv = lgb.train(params,
                         lgb_train_cv,
                         num_boost_round=1000)
    z_pred_cv = model_cv.predict(X_val)
    cv_rmses.append(rsme(z_val, z_pred_cv))
    print("Fold RMSE:", cv_rmses[-1])
print("5-Fold CV RMSE (mean):", np.mean(cv_rmses))

Female model
Fold RMSE: 0.04903869814694339
Fold RMSE: 0.049777323418691774
Fold RMSE: 0.0493589707782226
Fold RMSE: 0.04951170309985049
Fold RMSE: 0.05151598973449904
5-Fold CV RMSE (mean): 0.049840537035641466
Male model
Fold RMSE: 0.06962670041615204
Fold RMSE: 0.06850289382786343
Fold RMSE: 0.06870631042324044
Fold RMSE: 0.07014221869468071
Fold RMSE: 0.06871563948338268
5-Fold CV RMSE (mean): 0.06913875256906385


## Generate Predictions

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import lightgbm as lgb
# from sklearn.metrics import mean_squared_error as mse
enc = LabelEncoder()
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
indices = train['Sex'] == 0
X = train[predictors]
z = train[target]

Xf = X[indices]
zf = z[indices]


Xm = X[~indices]
zm = z[~indices]

lgb_train_f = lgb.Dataset(Xf, label=zf)
lgb_train_m = lgb.Dataset(Xm, label=zm)

model_f = lgb.train(params,
                    lgb_train_f,
                    num_boost_round=1000)
model_m = lgb.train(params,
                    lgb_train_m,
                    num_boost_round=1000)

test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])
indices_test = test['Sex']==0

test_f = test[indices_test]
test_m = test[~indices_test]

z_pred_f = model_f.predict(test_f[predictors])
z_pred_m = model_m.predict(test_m[predictors])

z_pred = np.zeros(len(test))
z_pred[indices_test] = z_pred_f
z_pred[~indices_test] = z_pred_m

# z_pred = test['z']

y_pred = np.clip(np.expm1(z_pred), 0, None)


In [14]:
z_pred

array([3.32691559, 4.70222584, 4.48474321, ..., 4.30134935, 5.13576943,
       4.34437577])

2-model lgb is already better than 1-model ensemble of lgb, xgb

# 2-model avg of LGB, XGB

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import lightgbm as lgb
import xgboost as xgb
# from sklearn.metrics import mean_squared_error as mse
enc = LabelEncoder()

# Prepare the train data
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
X = train[predictors]
z = train[target]
# Extract female indices
indices = train['Sex'] == 0
# Separate the datasets
Xf = X[indices]
zf = z[indices]

Xm = X[~indices]
zm = z[~indices]

# Prepare the test data
test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])
# Extract female indices
indices_test = test['Sex']==0
# Separate the datasets
test_f = test[indices_test]
test_m = test[~indices_test]



# Run 2-model_LGB
print("Running 2-model LGB")
lgb_train_f = lgb.Dataset(Xf, label=zf)
lgb_train_m = lgb.Dataset(Xm, label=zm)

model_f_lgb = lgb.train(params,
                    lgb_train_f,
                    num_boost_round=1000)
model_m_lgb = lgb.train(params,
                    lgb_train_m,
                    num_boost_round=1000)

z_pred_f_lgb = model_f_lgb.predict(test_f[predictors])
z_pred_m_lgb = model_m_lgb.predict(test_m[predictors])

z_pred_lgb = np.zeros(len(test))
z_pred_lgb[indices_test] = z_pred_f_lgb
z_pred_lgb[~indices_test] = z_pred_m_lgb

# Run 2-model_XGB
print("Running 2-model XGB")
xgb_train_f = xgb.DMatrix(Xf, label=zf)
xgb_train_m = xgb.DMatrix(Xm, label=zm)
model_f_xgb = xgb.train(params_xgb,
                    xgb_train_f,
                    num_boost_round=1000)
model_m_xgb = xgb.train(params_xgb,
                    xgb_train_m,
                    num_boost_round=1000)
z_pred_f_xgb = model_f_xgb.predict(xgb.DMatrix(test_f[predictors]))
z_pred_m_xgb = model_m_xgb.predict(xgb.DMatrix(test_m[predictors]))

z_pred_xgb = np.zeros(len(test))
z_pred_xgb[indices_test] = z_pred_f_xgb
z_pred_xgb[~indices_test] = z_pred_m_xgb

# Combine the predictions
z_pred = (z_pred_lgb + z_pred_xgb) / 2

y_pred = np.clip(np.expm1(z_pred), 0, None)


Running 2-model LGB
Running 2-model XGB


# Submission

In [21]:
def generate_submission(y_pred, submission_name):
    
    df_submission = pd.read_csv('sample_submission.csv')
    df_submission['Calories'] = y_pred
    df_submission.to_csv(submission_name, index=False)

submission_name = '2model_avg_xgb_lgb'
generate_submission(y_pred, f'submissions\\{submission_name}.csv')