# Import Packages

In [19]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import pandas as pd

# Load Preprocessed Dataset

In [20]:
data = pd.read_csv('new_train.csv')
data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded,Y
0,9.3,0.016047,249.8092,1999.0,2.0,1.0,2.0,0.0,4.0,9.0,8.23
1,5.92,0.019278,48.2692,2009.0,2.0,3.0,3.0,1.0,14.0,3.0,6.09
2,17.5,0.01676,141.618,1999.0,2.0,1.0,2.0,0.0,10.0,9.0,7.65
3,19.2,0.0,182.095,1998.0,1.0,3.0,1.0,1.0,6.0,0.0,6.6
4,8.93,0.0,53.8614,1987.0,3.0,3.0,2.0,0.0,9.0,1.0,6.9


# Data Splitting

In [21]:
X = data.drop(['Y'], axis=1)
y = data['Y']
print(X.shape)
print(y.shape)

(6000, 10)
(6000,)


# Model Training with XGBoost

In [None]:
reg = xgb.XGBRegressor(objective='reg:absoluteerror')

param_grid = {
    'max_depth': [1, 2, 3],          # Maximum depth of a tree
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],  # Learning rate
    'n_estimators': [240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500],  # Number of boosting rounds (trees)
    'subsample': [0.8, 1.0],         # Fraction of samples used in each boosting round
    'colsample_bytree': [0.8, 1.0]   # Fraction of features used for each tree
}

grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
grid_search.fit(X, y)
best_xgb = grid_search.best_estimator_
best_xgb

# Model Training with LightGBM

In [None]:
lgb_model = lgb.LGBMRegressor(objective='mae')

lgb_param_grid = {
    'num_leaves': [31, 50, 70, 100],
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [-1, 10, 20, 30],
    'min_child_samples': [5, 10, 20],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgb_grid = GridSearchCV(lgb_model, lgb_param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
lgb_grid.fit(X, y)

best_lgb = lgb_grid.best_estimator_
best_lgb

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 793
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 10
[LightGBM] [Info] Start training from score 7.500000


# Model Training with CatBoost

In [None]:
import os
os.makedirs("catboost_info", exist_ok=True)

cb_model = cb.CatBoostRegressor(loss_function='MAE')

cb_param_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'n_estimators': [100, 150, 200, 250, 300],
    'iterations': [100, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 10],
    'border_count': [32, 50, 100]
}

cb_grid = GridSearchCV(cb_model, cb_param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
cb_grid.fit(X, y)

best_cb = cb_grid.best_estimator_
best_cb

0:	learn: 0.7680040	total: 1.89ms	remaining: 1.89s
1:	learn: 0.7527633	total: 3.75ms	remaining: 1.87s
2:	learn: 0.7377571	total: 6.88ms	remaining: 2.29s
3:	learn: 0.7233218	total: 8.53ms	remaining: 2.12s
4:	learn: 0.7094582	total: 10.1ms	remaining: 2s
5:	learn: 0.6963980	total: 11.5ms	remaining: 1.91s
6:	learn: 0.6834313	total: 13.1ms	remaining: 1.86s
7:	learn: 0.6712091	total: 14.7ms	remaining: 1.83s
8:	learn: 0.6596491	total: 16.4ms	remaining: 1.8s
9:	learn: 0.6494249	total: 18ms	remaining: 1.78s
10:	learn: 0.6387060	total: 19.8ms	remaining: 1.77s
11:	learn: 0.6279878	total: 22.6ms	remaining: 1.86s
12:	learn: 0.6176877	total: 24.3ms	remaining: 1.85s
13:	learn: 0.6080399	total: 26.2ms	remaining: 1.84s
14:	learn: 0.5987755	total: 27.8ms	remaining: 1.82s
15:	learn: 0.5895960	total: 29.4ms	remaining: 1.8s
16:	learn: 0.5810586	total: 30.7ms	remaining: 1.78s
17:	learn: 0.5730370	total: 32.3ms	remaining: 1.76s
18:	learn: 0.5650041	total: 33.9ms	remaining: 1.75s
19:	learn: 0.5582349	total: 3

<catboost.core.CatBoostRegressor at 0x1c65a1d6b60>

# Final Predictions

In [25]:
test_data = pd.read_csv('new_test.csv')
test_data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded
0,8.42,0.021273,229.1326,1985.0,2.0,3.0,4.0,0.0,2.0,5.0
1,17.35,0.027588,86.1856,1987.0,3.0,3.0,2.0,1.0,3.0,1.0
2,9.3,0.111782,182.3292,1998.0,1.0,3.0,1.0,0.0,5.0,0.0
3,6.71,0.029606,65.4142,1999.0,2.0,1.0,2.0,1.0,0.0,9.0
4,17.1,0.129141,109.4886,2007.0,2.0,2.0,2.0,0.0,6.0,2.0


In [26]:
predictions = best_xgb.predict(test_data)
predictions

array([8.590439 , 6.9578466, 6.173207 , ..., 6.9789805, 7.4474316,
       7.0943556], dtype=float32)

In [27]:
predictions_1 = best_lgb.predict(test_data)
predictions_1

array([8.72298815, 7.16164447, 6.30007839, ..., 7.21351257, 7.44927694,
       7.16420576])

In [28]:
predictions_2 = best_cb.predict(test_data)
predictions_2

array([8.62031015, 7.14020832, 6.05781751, ..., 7.1382078 , 7.24236514,
       6.85372465])

In [29]:
final_predictions = pd.DataFrame()
final_predictions['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions['Y'] = predictions
final_predictions.to_csv("Predictions.csv", index=False)

In [30]:
final_predictions_1 = pd.DataFrame()
final_predictions_1['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions_1['Y'] = predictions_1
final_predictions_1.to_csv("Predictions_1.csv", index=False)

In [31]:
final_predictions_2 = pd.DataFrame()
final_predictions_2['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions_2['Y'] = predictions_2
final_predictions_2.to_csv("Predictions_2.csv", index=False)