In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
import optuna

In [None]:
# Read the fold-annotated training table, the test table, and the sample
# submission, in that order.
train, test, submission = map(
    pd.read_csv,
    (
        "../input/30-days-ml-challenge-kfolds-train-dataset/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    ),
)

In [None]:
# Preview the first rows of the training table loaded from train_folds.csv.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the sample submission table.
submission.head()

In [None]:
# Categorical features are the object-dtype columns of the training table.
category_cols = [col for col in train.columns if train[col].dtype == "object"]

# Every remaining column except the label ("target") and the fold marker
# ("kfold") is treated as numerical. Iterate over train.columns instead of
# using set arithmetic: the iteration order of a set of strings can differ
# between Python sessions, which made the feature order (and therefore the
# column order fed to the model) non-reproducible.
# NOTE(review): any identifier-like column (e.g. an `id` column, if the data
# has one) ends up in numerical_cols — confirm that is intended.
numerical_cols = [
    col
    for col in train.columns
    if col not in category_cols and col not in ("target", "kfold")
]

useful_features = category_cols + numerical_cols
print(category_cols)
print(numerical_cols)
print(useful_features)

In [None]:
def find_best_params(trial):
    """Optuna objective: fit an XGBoost regressor with sampled hyperparameters.

    Reads the module-level ``xtrain``, ``ytrain``, ``xvalid`` and ``yvalid``
    variables, which ``run`` assigns for the current fold before it starts
    the study.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``reg_lambda``, ``reg_alpha``,
        ``subsample``, ``colsample_bytree`` and ``max_depth``.

    Returns
    -------
    float
        Root mean squared error of the fitted model on the validation data.
    """
    # All log-scaled parameters use suggest_float(..., log=True); the older
    # suggest_loguniform spelling is deprecated and samples the same range.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "max_depth": trial.suggest_int("max_depth", 1, 7),
    }

    # NOTE(review): the GPU settings and fit-time early stopping follow the
    # xgboost API this notebook was written against — confirm against the
    # installed xgboost version before upgrading.
    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=7000,
        **sampled,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=0)
    preds_valid = model.predict(xvalid)

    # Take the square root explicitly instead of passing squared=False, which
    # newer scikit-learn releases deprecate/remove.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    return rmse

In [None]:
def run(fold):
    """Tune, train and predict for one cross-validation fold.

    Rows of the module-level ``train`` frame whose ``kfold`` equals ``fold``
    form the validation set; all other rows form the training set. Categorical
    columns are ordinal-encoded and numerical columns are standardised, both
    fitted on the training rows only. An Optuna study then searches
    hyperparameters via ``find_best_params`` and a final model is fitted with
    the best ones.

    Side effects: assigns the module-level ``xtrain``, ``ytrain``, ``xvalid``
    and ``yvalid`` variables (``find_best_params`` reads them) and prints the
    best parameters and the fold scores.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to hold out for validation.

    Returns
    -------
    numpy.ndarray
        Predictions of the final model for the rows of ``test``.
    """
    global xtrain
    global ytrain
    global xvalid
    global yvalid

    train_rows = train[train.kfold != fold].reset_index(drop=True)
    valid_rows = train[train.kfold == fold].reset_index(drop=True)

    ytrain = train_rows.target
    yvalid = valid_rows.target

    # Take explicit copies: the frames are modified column-by-column below,
    # and assigning into a slice of another frame is the chained-assignment
    # (SettingWithCopy) hazard.
    xtrain = train_rows[useful_features].copy()
    xvalid = valid_rows[useful_features].copy()
    xtest = test[useful_features].copy()

    # Fit the encoder on the training rows only, then reuse it unchanged.
    ordinal_encoder = OrdinalEncoder()
    xtrain[category_cols] = ordinal_encoder.fit_transform(xtrain[category_cols])
    xvalid[category_cols] = ordinal_encoder.transform(xvalid[category_cols])
    xtest[category_cols] = ordinal_encoder.transform(xtest[category_cols])

    # Same for the scaler: statistics come from the training rows only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    study = optuna.create_study(direction="minimize")
    study.optimize(find_best_params, n_trials = 10, n_jobs = -1, show_progress_bar=False)

    print('_'*80)
    print(f"Best Params for {fold} Fold:\n", study.best_params)

    # Refit with the best hyperparameters found for this fold.
    model = XGBRegressor(random_state=42,
                         tree_method="gpu_hist",
                         gpu_id=0,
                         predictor="gpu_predictor",
                         n_estimators=7000,
                         **study.best_params
                        )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=0)
    preds_valid = model.predict(xvalid)
    preds_train = model.predict(xtrain)
    test_preds = model.predict(xtest)

    # Explicit square root instead of squared=False, which newer scikit-learn
    # releases deprecate/remove. The reported values are RMSE, so label them
    # as such.
    valid_rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    train_rmse = np.sqrt(mean_squared_error(ytrain, preds_train))
    print('-'*80)
    print(f"Fold = {fold}, valid RMSE = {valid_rmse}, train RMSE = {train_rmse}")
    print('_'*80)
    return test_preds

In [None]:
# Run the tune/train/predict pipeline once per fold (0-4) and collect the
# test-set predictions of each fold's model.
final_pred = []
for fold in range(5):
    fold_test_preds = run(fold)
    final_pred.append(fold_test_preds)

In [None]:
# Average the per-fold test predictions row-wise: one column per fold,
# mean across columns.
preds = np.column_stack(final_pred).mean(axis=1)

# Use bracket assignment: attribute-style assignment only writes a column
# when "target" already exists, otherwise it silently sets a plain attribute
# and the CSV would not contain the predictions.
submission["target"] = preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table.
submission

In [None]:
# Also write one submission file per fold model. Build each file from a
# modified copy (DataFrame.assign) so the `submission` frame itself is not
# overwritten with a single fold's predictions, and iterate over however many
# folds were actually collected instead of a hard-coded count.
for i, fold_preds in enumerate(final_pred):
    submission.assign(target=fold_preds).to_csv(f"submission{i}.csv", index=False)