In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

I decided to join this competition just before the end date, so everything was thrown together quickly.
An Optuna study was used for hyperparameter tuning in another notebook, and that code is pasted here for reference.

In [None]:
# Load the competition train/test CSVs, using the 'id' column as the row index.
train_csv_path = "../input/tabular-playground-series-aug-2021/train.csv"
test_csv_path = "../input/tabular-playground-series-aug-2021/test.csv"

training_set = pd.read_csv(train_csv_path, index_col='id')
testing_set = pd.read_csv(test_csv_path, index_col='id')

In [None]:
# Use info() to get a quick preview of our data.
# note: there are no null values and every column is an int
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
training_set.info()

In [None]:
# Descriptive statistics of the training data, displayed as the cell output.
summary_stats = training_set.describe()
summary_stats

In [None]:
# Plot a histogram of every column to get an idea of what the different features look like.
# The trailing semicolon suppresses the array-of-Axes repr so only the figure is shown.
training_set.hist(bins=50, figsize=(20, 15));

We can see that the features have many different distributions and scales, which we will deal with later.

In [None]:
# Check whether any attributes appear correlated with each other.
# Self-correlations (the diagonal) and mirrored duplicate pairs are masked out.
corr_matrix = training_set.corr()

# Retain the strict upper triangle of the correlation matrix and make every other
# entry null so it can be dropped.
# The builtin `bool` is used because the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
upper_corr_matrix = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Flatten to a Series indexed by column pair and drop the null (masked) values
unique_corr_series = upper_corr_matrix.unstack().dropna()

# Sort the unique pairs in ascending order of correlation
sorted_matrix = unique_corr_series.sort_values()

print(sorted_matrix)

In [None]:
# Correlation of every column with the target 'loss', largest values first.
corr_matrix = training_set.corr()
target_corr = corr_matrix['loss']
target_corr.sort_values(ascending=False)

No features appear strongly correlated with one another, or with our target

In [None]:
# Separate the target from the features without mutating `training_set`.
# - `training_set['loss'].dropna(inplace=True)` acted on a column Series and never
#   removed rows from the frame, so rows with a missing target are dropped explicitly.
# - Dropping 'loss' from `training_set` in place made this cell raise KeyError when
#   re-run and broke re-running the correlation cells; deriving new objects keeps
#   the cell idempotent.
labelled_rows = training_set.dropna(subset=['loss'])
y = labelled_rows['loss']
X = labelled_rows.drop(columns=['loss'])

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Hold out 20% of the rows for validation; the split is seeded and stratified on y.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: constant-fill missing values, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# categorical_cols = [col.index for col in X_train.columns if X_train[col].dtype == "object"]
# Categorical preprocessing: constant-fill missing values, then one-hot encode,
# with handle_unknown='ignore' set on the encoder.
cat_cols = X.select_dtypes(include="object").columns
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)



In [None]:
from sklearn.compose import ColumnTransformer

# Route columns to their preprocessing pipeline by dtype:
# object columns go to the categorical pipeline, everything else to the numerical one.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

column_routes = [
    ('numerical', num_transformer, num_cols),
    ('categorical', cat_transformer, cat_cols),
]
preprocessor = ColumnTransformer(column_routes)

Here is the code for hyperparameter tuning using Optuna, pasted from another notebook

    import optuna
    def objective(trial):
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8,test_size = 0.2, random_state = 1)
        #hyperparameters to optimize with optuna
        xgb_params = {
                     "tree_method": trial.suggest_categorical('tree_method', ["gpu_hist"]),
                     "random_state": trial.suggest_categorical('random_state', [42]),
                     "n_estimators": trial.suggest_int('n_estimators', 50, 1050, 100),
                     "verbosity": trial.suggest_categorical('verbosity', [2]),
                     "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.5),
                    'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 200, 50),
                     "n_jobs": trial.suggest_categorical('n_jobs', [4]),
                     "subsample": trial.suggest_float('subsample', 0.1, 0.5),
                     "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 0.5),
                     "max_depth": trial.suggest_int("max_depth", 2, 20),
                     "booster": trial.suggest_categorical('booster', ["gbtree"]),
                     "reg_lambda": trial.suggest_float('reg_lambda', 2, 100),
                     "reg_alpha": trial.suggest_float('reg_alpha', 1, 50),
                    'gamma': trial.suggest_loguniform('gamma', 1e-4,1e4),
                    'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4,1e4)
                     }

        # Model loading 
        model = XGBRegressor(**xgb_params)
        
        #fit model
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric="rmse",
                 early_stopping_rounds=100, 
                  verbose=2)
        return rmse(y_valid, model.predict(X_valid))

    # Best parameters: {'model__early_stopping_rounds': 100, 'model__gamma': 2, 'model__max_depth': 2, 
    # 'model__min_child_weight': 10, 'model__n_estimators': 50}

    study = optuna.create_study(direction='minimize', study_name = 'XGBRegressor')
    study.optimize(objective, timeout=60*60)

    trial = study.best_trial
    print('Best root mean squared error: {}'.format(trial.value))
    print('Best trial\'s parameters: ')
    for key, value in trial.params.items():
        print('{}: {}'.format(key, value))

    #Showing optimization results
    print('Number of finished trials:', len(study.trials))
    print('Best trial parameters:', study.best_trial.params)
    print('Best score:', study.best_value)

->

In [None]:
# Best hyperparameters found through the Optuna study.
# Best root mean squared error reported for this trial: 7.860951003209514
params = dict(
    tree_method='gpu_hist',
    random_state=42,
    n_estimators=1050,
    learning_rate=0.01934606078775565,
    early_stopping_rounds=150,
    n_jobs=4,
    subsample=0.4838418864520116,
    colsample_bytree=0.344356785743399,
    max_depth=11,
    booster='gbtree',
    reg_lambda=14.726431379981303,
    reg_alpha=25.499975568916753,
    gamma=465.95365559005677,
    min_child_weight=5.956504210470665,
)

In [None]:
from xgboost import XGBRegressor
# Build the XGBRegressor from the best Optuna parameters and chain it after preprocessing.
# 'early_stopping_rounds' is left out of the constructor arguments: the pipeline is
# fitted without an eval_set, so early stopping has no validation data to monitor.
# Recent XGBoost releases raise an error in that situation, while older releases
# only warned that the parameter was unused.
model_params = {name: value for name, value in params.items()
                if name != 'early_stopping_rounds'}
xgb_model = XGBRegressor(verbosity=2, **model_params)
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', xgb_model)])


In [None]:
# Fit the full pipeline on the training split, then predict the validation split.
fitted_pipeline = xgb_pipeline.fit(X_train, y_train)
predictions = fitted_pipeline.predict(X_valid)

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the validation predictions.
validation_mse = mean_squared_error(predictions, y_valid)
rmse = np.sqrt(validation_mse)
print('RMSE:', rmse)

In [None]:
# Predict the test set and write the submission file.
# The id column is written as 'id' to match the column name the data was loaded with
# (index_col='id'); the previous header 'Id' did not match it.
final_pred = xgb_pipeline.predict(testing_set)
output = pd.DataFrame({'id': testing_set.index,
                       'loss': final_pred})
output.to_csv('submission.csv', index=False)