# Import Packages

In [49]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import pandas as pd

# Load Preprocessed Dataset

In [50]:
data = pd.read_csv('new_train.csv')
data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded,Y
0,9.3,0.016047,249.8092,1999.0,2.0,1.0,2.0,0.0,4.0,9.0,8.23
1,5.92,0.019278,48.2692,2009.0,2.0,3.0,3.0,1.0,14.0,3.0,6.09
2,17.5,0.01676,141.618,1999.0,2.0,1.0,2.0,0.0,10.0,9.0,7.65
3,19.2,0.0,182.095,1998.0,1.0,3.0,1.0,1.0,6.0,0.0,6.6
4,8.93,0.0,53.8614,1987.0,3.0,3.0,2.0,0.0,9.0,1.0,6.9


# Data Splitting

In [51]:
X = data.drop(['Y'], axis=1)
y = data['Y']
print(X.shape)
print(y.shape)

(6000, 10)
(6000,)


# Model Training with XGBoost

In [52]:
reg = xgb.XGBRegressor(objective='reg:absoluteerror')

param_grid = {
    'max_depth': [1, 2, 3],          # Maximum depth of a tree
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],  # Learning rate
    'n_estimators': [240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500],  # Number of boosting rounds (trees)
    'subsample': [0.8, 1.0],         # Fraction of samples used in each boosting round
    'colsample_bytree': [0.8, 1.0]   # Fraction of features used for each tree
}

grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
grid_search.fit(X, y)
best_xgb_params = grid_search.best_params_
best_xgb_params

{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 1,
 'n_estimators': 260,
 'subsample': 1.0}

In [53]:
best_xgb = xgb.XGBRegressor(**best_xgb_params, objective='reg:absoluteerror')
best_xgb.fit(X, y)

# Model Training with LightGBM

In [61]:
lgb_model = lgb.LGBMRegressor(objective='mae')

lgb_param_grid = {
    'num_leaves': [3, 5, 10, 20, 30],  # Increased range for better learning capacity
    'learning_rate': [0.2, 0.1, 0.05, 0.01],  # Added smaller values for fine-tuning
    'n_estimators': [200, 250, 300, 400],  # Added higher values for longer training
    'max_depth': [1, 2, 3, 5, 7],  # Increased depth for better splits
    'min_child_samples': [1, 5, 10, 20],  # Adjusted to prevent overfitting
    'min_gain_to_split': [0.0, 0.01, 0.1],  # Ensures meaningful splits
    'colsample_bytree': [0.6, 0.8, 1.0],  # Added feature selection improvement
    'subsample': [0.6, 0.8, 1.0]  # Helps with generalization
}

lgb_grid = GridSearchCV(lgb_model, lgb_param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
lgb_grid.fit(X, y)

best_lgb_params = lgb_grid.best_params_
best_lgb_params

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 793
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 10
[LightGBM] [Info] Start training from score 7.500000


{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 1,
 'min_child_samples': 10,
 'min_gain_to_split': 0.0,
 'n_estimators': 250,
 'num_leaves': 3,
 'subsample': 0.6}

In [62]:
best_lgb = lgb.LGBMRegressor(**best_lgb_params, objective='mae')
best_lgb.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 793
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 10
[LightGBM] [Info] Start training from score 7.500000


# Final Predictions

In [63]:
test_data = pd.read_csv('new_test.csv')
test_data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded
0,8.42,0.021273,229.1326,1985.0,2.0,3.0,4.0,0.0,2.0,5.0
1,17.35,0.027588,86.1856,1987.0,3.0,3.0,2.0,1.0,3.0,1.0
2,9.3,0.111782,182.3292,1998.0,1.0,3.0,1.0,0.0,5.0,0.0
3,6.71,0.029606,65.4142,1999.0,2.0,1.0,2.0,1.0,0.0,9.0
4,17.1,0.129141,109.4886,2007.0,2.0,2.0,2.0,0.0,6.0,2.0


In [57]:
predictions = best_xgb.predict(test_data)
predictions

array([8.711283 , 7.2275033, 5.995428 , ..., 7.233546 , 7.263668 ,
       7.130743 ], dtype=float32)

In [64]:
predictions_1 = best_lgb.predict(test_data)
predictions_1



array([8.70793503, 7.22609876, 6.00213472, ..., 7.24089067, 7.26886931,
       7.14042351])

In [59]:
final_predictions = pd.DataFrame()
final_predictions['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions['Y'] = predictions
final_predictions.to_csv("Predictions.csv", index=False)

In [65]:
final_predictions_1 = pd.DataFrame()
final_predictions_1['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions_1['Y'] = predictions_1
final_predictions_1.to_csv("Predictions_1.csv", index=False)