## Preprocessing

In [None]:
import numpy as np
import pandas as pd

import optuna
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Competition data: the training rows (with `target`) and the rows to predict.
TRAIN_CSV = "../input/30-days-of-ml/train.csv"
TEST_CSV = "../input/30-days-of-ml/test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# df = pd.read_csv("../input/30days-folds/train_folds.csv")
# df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
# sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# df1 = pd.read_csv("../input/30d-ml/working_K/level1_train_pred_1.csv")
# df2 = pd.read_csv("../input/30d-ml/working_K/level1_train_pred_2.csv")
# df3 = pd.read_csv("../input/30d-ml/working_K/level1_train_pred_3.csv")


# df_test1 = pd.read_csv("level1_test_pred_1.csv")
# df_test2 = pd.read_csv("level1_test_pred_2.csv")
# df_test3 = pd.read_csv("level1_test_pred_3.csv")
# # df_test4 = pd.read_csv("level1_test_pred_4.csv")

# df = df.merge(df1, on="id", how="left")
# df = df.merge(df2, on="id", how="left")
# df = df.merge(df3, on="id", how="left")
# # df = df.merge(df4, on="id", how="left")

# df_test = df_test.merge(df_test1, on="id", how="left")
# df_test = df_test.merge(df_test2, on="id", how="left")
# df_test = df_test.merge(df_test3, on="id", how="left")

In [None]:
# Names of the categorical features, i.e. the columns stored with object dtype
column_dtypes = train.dtypes
cat_features = column_dtypes[column_dtypes == 'object'].index.tolist()
print(len(cat_features), cat_features)

In [None]:
# Separate the regression target from the features; the 'id' column is
# removed from both the training features and the test frame.
y = train['target']
X = train.drop(columns=['id', 'target'])
test.drop(columns=['id'], inplace=True)

In [None]:
# Hold out 20% of the training rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=77)
X_test = test.copy()

# Learn the ordinal encoding from the training split only, then apply the
# same mapping to the training, validation and test frames.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[cat_features])
for frame in (X_train, X_valid, X_test):
    frame[cat_features] = ordinal_encoder.transform(frame[cat_features])

## Untuned model

In [None]:
%%time
# Baseline: XGBoost with default hyperparameters, trained with the GPU
# histogram tree method and early stopping on the hold-out split.
model = XGBRegressor(random_state=2510, tree_method='gpu_hist')
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
# constructor rather than on fit() -- confirm against the installed version.
model.fit(
    X_train, y_train,
    early_stopping_rounds=10,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# Generate predictions for the hold-out split
valid_preds = model.predict(X_valid)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is not
# accepted by every scikit-learn release, while this form works on all of them.
rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
print("RMSE:", rmse)

## Optimization

In [None]:
def objective(trial):
    """Train one XGBoost model for an Optuna trial and return its validation RMSE.

    The hyperparameters are sampled from ``trial``. The model is fit on the
    module-level ``X_train``/``y_train`` with early stopping monitored on
    ``X_valid``/``y_valid``, and is scored on that same hold-out split.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameter values.

    Returns:
        The root mean squared error of the predictions on ``X_valid``.
    """
    params = {
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 500, 7000, step=500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.40, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 7),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 40),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and matches how 'learning_rate' is sampled.
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        # The trial parameter names stay 'alpha' / 'lambda' so that
        # study.best_params keeps the keys the rest of the notebook uses.
        'reg_alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'predictor': 'gpu_predictor',
    }

    # XGBoost model
    model = XGBRegressor(**params, random_state=2510, tree_method='gpu_hist')
    model.fit(
        X_train, y_train,
        early_stopping_rounds=100,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    # Generate predictions for the hold-out split
    valid_preds = model.predict(X_valid)

    # RMSE as sqrt(MSE): avoids the `squared=False` keyword, which is not
    # accepted by every scikit-learn release.
    return np.sqrt(mean_squared_error(y_valid, valid_preds))

In [None]:
# Search for the hyperparameters that minimize the objective (validation RMSE)
N_TRIALS = 500  # change to 50, 100, 500, 1000
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS)

best_trial = study.best_trial
print("Number of finished trials: ", len(study.trials))
print("Best value:", best_trial.value)
print("Best parameters:", study.best_params)

## Best hyperparameters

In [None]:
# Display the parameter values of the study's best trial
study.best_params

## Visualization

In [None]:
# Plot the optimization history of the study's trials (matplotlib backend)
optuna.visualization.matplotlib.plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances estimated from the study (matplotlib backend)
optuna.visualization.matplotlib.plot_param_importances(study)

In [None]:
# Hyperparameters for the final models. The keys are the names given to the
# trial in `objective`, so the regularization terms appear as 'alpha' and
# 'lambda' rather than 'reg_alpha' and 'reg_lambda'.
best_params = study.best_params
# A hard-coded parameter set, kept commented out as an alternative to
# taking the values from the study:
# best_params = {
#      'n_estimators': 6000,
#      'learning_rate': 0.03854075470695709,
#      'max_depth': 4,
#      'subsample': 0.7765482635167552,
#      'colsample_bytree': 0.1752978982571639,
#      'min_child_weight': 37,
#      'gamma': 3.8394925536670776e-07,
#      'alpha': 0.000697976480249658,
#      'lambda': 0.04651536374944249
# }
best_params

## Generating Test predictions

In [None]:
%%time
# Cross-validation with KFold: one model is trained per fold, and the test
# predictions of all folds are averaged into `test_predictions`.
splits = 20
kf = KFold(n_splits=splits, shuffle=True, random_state=2510)

mean_rmse = 0
test_predictions = 0

for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    # KFold yields positional indices, so rows are selected with .iloc.
    # The copies keep the encoded values from being written into views of X,
    # and the fold-local names leave the hold-out split used by `objective`
    # (X_train / X_valid / y_train / y_valid) untouched.
    X_fold_train = X.iloc[train_indices].copy()
    X_fold_valid = X.iloc[valid_indices].copy()
    y_fold_train = y.iloc[train_indices]
    y_fold_valid = y.iloc[valid_indices]

    # Ordinal-encode categorical columns with an encoder fit on this fold's
    # training rows, and apply that same encoder to the test rows so the
    # model always sees the encoding it was trained with.
    ordinal_encoder = OrdinalEncoder()
    X_fold_train[cat_features] = ordinal_encoder.fit_transform(X_fold_train[cat_features])
    X_fold_valid[cat_features] = ordinal_encoder.transform(X_fold_valid[cat_features])
    X_fold_test = test.copy()
    X_fold_test[cat_features] = ordinal_encoder.transform(X_fold_test[cat_features])

    # Final model
    model = XGBRegressor(**best_params, random_state=2510, tree_method='gpu_hist')
    model.fit(
        X_fold_train, y_fold_train,
        # Early-stopping patience is 5% of the configured number of trees
        early_stopping_rounds=best_params['n_estimators'] // 20,
        eval_set=[(X_fold_valid, y_fold_valid)],
        verbose=False,
    )

    # Generate predictions; each fold contributes 1/splits of the test prediction
    valid_preds = model.predict(X_fold_valid)
    test_predictions += model.predict(X_fold_test) / splits

    # RMSE as sqrt(MSE): avoids the `squared=False` keyword, which is not
    # accepted by every scikit-learn release.
    rmse = np.sqrt(mean_squared_error(y_fold_valid, valid_preds))
    mean_rmse += rmse / splits
    print(fold, rmse)

print("Mean RMSE:", mean_rmse)

## Submission

In [None]:
# Write the fold-averaged test predictions into the sample submission layout.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv -- confirm.
submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
# Item assignment always writes a DataFrame column; attribute assignment
# (`submission.target = ...`) only does so when the column already exists.
submission['target'] = test_predictions

submission.to_csv('submission.csv', index=False)
submission.head()