# Import Packages

In [15]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import pandas as pd

# Load Preprocessed Dataset

In [16]:
data = pd.read_csv('new_train.csv')
data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded,Y
0,-0.784936,-0.976653,1.736314,0.139154,0.33525,-1.382381,-0.256342,-0.736758,-0.761685,1.504304,8.23
1,-1.51051,-0.913903,-1.486503,1.339098,0.33525,1.090572,1.006425,1.357297,1.612891,-0.609637,6.09
2,0.97533,-0.96281,0.006233,0.139154,0.33525,-1.382381,-0.256342,-0.736758,0.663061,1.504304,7.65
3,1.340264,-1.288324,0.653499,0.019159,1.758902,1.090572,-1.519109,1.357297,-0.28677,-1.666608,6.6
4,-0.864363,-1.288324,-1.397078,-1.30078,1.758902,1.090572,-0.256342,-0.736758,0.425603,-1.314284,6.9


# Data Splitting

In [17]:
X = data.drop(['Y'], axis=1)
y = data['Y']

print(X.shape)
print(y.shape)

(6000, 10)
(6000,)


# Model Training with XGBoost

In [18]:
reg = xgb.XGBRegressor(objective='reg:absoluteerror')

param_grid = {
    'max_depth': [1],          # Maximum depth of a tree
    'learning_rate': [0.1],  # Learning rate
    'n_estimators': [260],  # Number of boosting rounds (trees)
    'subsample': [1.0],         # Fraction of samples used in each boosting round
    'colsample_bytree': [0.8]   # Fraction of features used for each tree
}

grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
grid_search.fit(X, y)
best_xgb_params = grid_search.best_params_
best_xgb_params

{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 1,
 'n_estimators': 260,
 'subsample': 1.0}

In [19]:
best_xgb = xgb.XGBRegressor(**best_xgb_params, objective='reg:absoluteerror')
best_xgb.fit(X, y)

# Model Training with LightGBM

In [20]:
lgb_model = lgb.LGBMRegressor(objective='mae')

lgb_param_grid = {
    'num_leaves': [3],  # Increased range for better learning capacity
    'learning_rate': [0.1],  # Added smaller values for fine-tuning
    'n_estimators': [250],  # Added higher values for longer training
    'max_depth': [1],  # Increased depth for better splits
    'min_child_samples': [10],  # Adjusted to prevent overfitting
    'min_gain_to_split': [0.0],  # Ensures meaningful splits
    'colsample_bytree': [1.0],  # Added feature selection improvement
    'subsample': [0.6]  # Helps with generalization
}

lgb_grid = GridSearchCV(lgb_model, lgb_param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
lgb_grid.fit(X, y)

best_lgb_params = lgb_grid.best_params_
best_lgb_params

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 10
[LightGBM] [Info] Start training from score 7.500000


{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 1,
 'min_child_samples': 10,
 'min_gain_to_split': 0.0,
 'n_estimators': 250,
 'num_leaves': 3,
 'subsample': 0.6}

In [21]:
best_lgb = lgb.LGBMRegressor(**best_lgb_params, objective='mae')
best_lgb.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 10
[LightGBM] [Info] Start training from score 7.500000


# Final Predictions

In [22]:
test_data = pd.read_csv('new_test.csv')
test_data.head()

Unnamed: 0,X2,X4,X6,X8,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded
0,-1.051123,-0.875157,1.405675,-1.540769,0.33525,1.090572,2.269192,-0.736758,-1.2366,0.09501
1,0.94313,-0.752512,-0.880184,-1.30078,1.758902,1.090572,-0.256342,1.357297,-0.999142,-1.314284
2,-0.784936,0.882696,0.657244,0.019159,1.758902,1.090572,-1.519109,-0.736758,-0.524227,-1.666608
3,-1.340923,-0.713308,-1.212338,0.139154,0.33525,-1.382381,-0.256342,1.357297,-1.711515,1.504304
4,0.889464,1.219849,-0.507546,1.099109,-1.088402,-0.145904,-0.256342,-0.736758,-0.28677,-0.961961


In [23]:
predictions = best_xgb.predict(test_data)
predictions

array([8.715311 , 7.2280383, 5.9946766, ..., 7.240662 , 7.2533836,
       7.133437 ], dtype=float32)

In [24]:
predictions_1 = best_lgb.predict(test_data)
predictions_1



array([8.71672965, 7.20769631, 5.99516339, ..., 7.22390385, 7.26610055,
       7.12473264])

In [25]:
final_predictions = pd.DataFrame()
final_predictions['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions['Y'] = predictions
final_predictions.to_csv("Predictions.csv", index=False)

In [26]:
final_predictions_1 = pd.DataFrame()
final_predictions_1['row_id'] = [i for i in range(test_data.shape[0])]
final_predictions_1['Y'] = predictions_1
final_predictions_1.to_csv("Predictions_1.csv", index=False)