In [None]:
import pandas as pd
import numpy as np

# Relative path to the training CSV that the next cell reads with pd.read_csv.
train_path = r'../input/tabular-playground-series-aug-2021/train.csv'

In [None]:
# Load the training table; the first CSV column becomes the row index
# (so it is not part of the feature/target columns selected later).
train = pd.read_csv(train_path, index_col=0)
# Bare expression: the notebook renders the frame as the cell output.
train

In [None]:
# Positional split of the frame into NumPy arrays:
#   X <- every column except the last (features)
#   y <- the last column only (target)
X, y = train.iloc[:, :-1].values, train.iloc[:, -1].values

# Show both shapes as the cell output.
(X.shape, y.shape)

# XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Seed shared by the KFold splitter and the model in this section.
rs = 69420

%matplotlib inline

In [None]:
# 10-fold cross-validation of XGBRegressor on (X, y).
# Each fold's fitted model is appended to `models` and its hold-out RMSE to `scores`.
kf = KFold(n_splits=10, shuffle=True, random_state=rs)
models = []  # fitted regressors, in fold order
scores = []  # hold-out RMSE per fold, in fold order

# enumerate(..., start=1) provides the 1-based fold number used in the log lines.
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print('='*25, f"Training Fold {i}", '='*25)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Remove tree_method, single_precision_histogram and deterministic_histogram
    # if you do not have a GPU.
    # NOTE(review): these GPU options and passing early_stopping_rounds to fit()
    # are tied to the xgboost version this notebook was written against --
    # confirm against the installed release before upgrading.
    clf = XGBRegressor(tree_method='gpu_hist',
                       single_precision_histogram=True,
                       deterministic_histogram=False,
                       random_state=rs)

    # The hold-out fold doubles as the early-stopping eval set, so the RMSE
    # reported below is measured on the same data that picked the stopping point.
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)

    y_pred = clf.predict(X_test)

    # RMSE computed as sqrt(MSE): this avoids the `squared=False` keyword, which
    # newer scikit-learn releases deprecate and then reject.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    scores.append(rmse)
    models.append(clf)
    print(f"Fold {i} RMSE: {rmse}")

print(f"Mean RMSE: {round(np.mean(scores), 5)}")

In [None]:
# Plot each fold's hold-out RMSE against its 1-based fold number so the x-axis
# agrees with the "Fold N" labels printed during training (plt.plot(scores)
# alone would label the folds 0..N-1).
fold_numbers = list(range(1, len(scores) + 1))
plt.plot(fold_numbers, scores)
plt.xticks(fold_numbers)  # one integer tick per fold
plt.title("RMSE Per Fold for XGBoost")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.show()

# Repeat the cross-validation summary next to the figure.
print(f"Mean RMSE: {round(np.mean(scores), 5)}")

# LightGBM

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Re-declares the same seed value as the XGBoost section; passed to KFold and the model below.
rs = 69420

%matplotlib inline

In [None]:
# 10-fold cross-validation of LGBMRegressor on (X, y).
# Each fold's fitted model is appended to `models` and its hold-out RMSE to `scores`
# (both lists are re-created here, replacing any earlier contents).
kf = KFold(n_splits=10, shuffle=True, random_state=rs)
models = []  # fitted regressors, in fold order
scores = []  # hold-out RMSE per fold, in fold order

# enumerate(..., start=1) provides the 1-based fold number used in the log lines.
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print('='*25, f"Training Fold {i}", '='*25)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Remove the device parameter if you do not have a GPU.
    clf = LGBMRegressor(device='gpu',
                        random_state=rs)

    # The hold-out fold doubles as the early-stopping eval set, so the RMSE
    # reported below is measured on the same data that picked the stopping point.
    # NOTE(review): passing early_stopping_rounds/verbose to fit() depends on the
    # installed lightgbm version -- confirm before upgrading.
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)

    y_pred = clf.predict(X_test)

    # RMSE computed as sqrt(MSE): this avoids the `squared=False` keyword, which
    # newer scikit-learn releases deprecate and then reject.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    scores.append(rmse)
    models.append(clf)
    print(f"Fold {i} RMSE: {rmse}")

print(f"Mean RMSE: {round(np.mean(scores), 5)}")

In [None]:
# Plot each fold's hold-out RMSE against its 1-based fold number so the x-axis
# agrees with the "Fold N" labels printed during training (plt.plot(scores)
# alone would label the folds 0..N-1).
fold_numbers = list(range(1, len(scores) + 1))
plt.plot(fold_numbers, scores)
plt.xticks(fold_numbers)  # one integer tick per fold
plt.title("RMSE Per Fold for Lightgbm")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.show()

# Repeat the cross-validation summary next to the figure.
print(f"Mean RMSE: {round(np.mean(scores), 5)}")

# CatBoost

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Re-declares the same seed value as the earlier sections; passed to KFold and the model below.
rs = 69420

%matplotlib inline

In [None]:
# 10-fold cross-validation of CatBoostRegressor on (X, y).
# Each fold's fitted model is appended to `models` and its hold-out RMSE to `scores`
# (both lists are re-created here, replacing any earlier contents).
kf = KFold(n_splits=10, shuffle=True, random_state=rs)
models = []  # fitted regressors, in fold order
scores = []  # hold-out RMSE per fold, in fold order

# enumerate(..., start=1) provides the 1-based fold number used in the log lines.
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print('='*25, f"Training Fold {i}", '='*25)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Remove the task_type parameter if you do not have a GPU.
    clf = CatBoostRegressor(task_type='GPU',
                            random_state=rs)

    # The hold-out fold doubles as the early-stopping eval set, so the RMSE
    # reported below is measured on the same data that picked the stopping point.
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)

    y_pred = clf.predict(X_test)

    # RMSE computed as sqrt(MSE): this avoids the `squared=False` keyword, which
    # newer scikit-learn releases deprecate and then reject.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    scores.append(rmse)
    models.append(clf)
    print(f"Fold {i} RMSE: {rmse}")

print(f"Mean RMSE: {round(np.mean(scores), 5)}")

In [None]:
# Plot each fold's hold-out RMSE against its 1-based fold number so the x-axis
# agrees with the "Fold N" labels printed during training (plt.plot(scores)
# alone would label the folds 0..N-1).
fold_numbers = list(range(1, len(scores) + 1))
plt.plot(fold_numbers, scores)
plt.xticks(fold_numbers)  # one integer tick per fold
plt.title("RMSE Per Fold for Catboost")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.show()

# Repeat the cross-validation summary next to the figure.
print(f"Mean RMSE: {round(np.mean(scores), 5)}")

# Final Scores

Mean RMSE over the 10 cross-validation folds (lower is better):

- XGBoost: 7.88824
- LightGBM: 7.86141
- CatBoost: 7.90038