# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection
import lightgbm as lgbm
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import optuna
import tqdm
import warnings
warnings.filterwarnings("ignore")

# **Reading the Training Data and Extracting Useful Features**

In [None]:
# Load the training table; the `loss` column is used as the regression target.
train_data = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')

# Model inputs are all columns other than the row identifier and the target.
useful_features = list(
    filter(lambda name: name not in ('id', 'loss'), train_data.columns)
)

y = train_data['loss']
X = train_data[useful_features]

# Optional preprocessing is intentionally left disabled in this notebook:
# StandardScaler on the features, PCA(n_components = 40), and a
# train_test_split(X, y, random_state=1) hold-out. Models below are fitted on
# the raw feature columns with K-fold cross-validation instead.

# Preview the first rows of the feature matrix (last expression -> cell output).
X.head()

# **Reading the Test Data**

In [None]:
# Load the test table and keep exactly the feature columns selected for
# training, in the same order, so fitted models can predict on it directly.
test_data = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
test_X = test_data.loc[:, useful_features]

# Preview the first rows of the test feature matrix.
test_X.head()

In [None]:
# Sanity check: shapes of the training features, the target and the test features.
tuple(frame.shape for frame in (X, y, test_X))

# **K-Fold CV and Implementation of the CatBoost Regression Algorithm**

In [None]:
# K-fold cross-validation of a default CatBoostRegressor.
#
# Results left in the notebook namespace:
#   oof_preds - out-of-fold predictions for every training row
#   preds_cat - test-set predictions averaged over all folds
#   model_fi  - feature importances averaged over all folds
#   mean_rmse - mean of the per-fold validation RMSEs

#Setting the kfold parameters
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
# Take the averaging divisor from the splitter itself so it can never drift
# away from the n_splits value configured above.
n_splits = kf.get_n_splits()

oof_preds = np.zeros((X.shape[0],))
preds_cat = 0
model_fi = 0
mean_rmse = 0

for num, (train_id, valid_id) in enumerate(kf.split(X)):
    # kf.split yields positional row numbers, so rows are selected with .iloc.
    # (.loc is label-based and only coincides with positions while X and y
    # carry the default 0..n-1 index.)
    X_train, X_valid = X.iloc[train_id], X.iloc[valid_id]
    y_train, y_valid = y.iloc[train_id], y.iloc[valid_id]

    cat_model = CatBoostRegressor(random_state=42)
    # NOTE(review): two evaluation sets are supplied (train and validation);
    # confirm which of them drives early stopping in the installed CatBoost.
    cat_model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  early_stopping_rounds = 100)

    #Mean of the predictions
    preds_cat += cat_model.predict(test_X).reshape(-1,) / n_splits

    #Mean of feature importance
    model_fi += cat_model.feature_importances_ / n_splits

    #Out of Fold predictions
    oof_preds[valid_id] = cat_model.predict(X_valid)
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_id]))
    print(f"Fold {num} | RMSE: {fold_rmse}")

    mean_rmse += fold_rmse / n_splits

# The value reported here is the average of the per-fold RMSEs, not the RMSE
# computed once over all out-of-fold predictions.
print(f"\nOverall RMSE: {mean_rmse}")

# **Setting the XGBoost Hyperparameters**

In [None]:
# Keyword arguments unpacked into XGBRegressor(**xgb_params) by the XGBoost
# cross-validation cell. Keys are grouped by purpose; the key/value pairs are
# unchanged.
# NOTE(review): the high-precision values look like the output of an automated
# hyperparameter search (optuna is imported above) - confirm their provenance.
xgb_params = {
    # Learning task and booster type
    'objective': 'reg:squarederror',
    'booster': 'gbtree',

    # Boosting schedule
    'n_estimators': 9489,
    'learning_rate': 0.17024722721525629,

    # Tree shape / split constraints
    'max_depth': 3,
    'min_child_weight': 155,
    'gamma': 2,

    # Regularisation terms ('lambda' must stay a string key: it is a Python keyword)
    'lambda': 67.79737006663706,
    'alpha': 40.12405005448161,

    # Row and column subsampling
    'subsample': 0.9556736521337416,
    'colsample_bytree': 0.061613774851329205,
    'sampling_method': 'uniform',

    # Reproducibility and parallelism
    'random_state': 42,
    'n_jobs': 4,
}

# **K Fold CV and Implementation of XGBoost Regression Algorithm** 

In [None]:
# K-fold cross-validation of an XGBRegressor configured from xgb_params.
#
# Results left in the notebook namespace:
#   oof_preds - out-of-fold predictions for every training row
#   preds_xgb - test-set predictions averaged over all folds
#   model_fi  - feature importances averaged over all folds
#   mean_rmse - mean of the per-fold validation RMSEs
# Note: kf, oof_preds, model_fi and mean_rmse are also assigned by the
# CatBoost cell; running this cell replaces those values.

#Setting the kfold parameters
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
# Take the averaging divisor from the splitter itself so it can never drift
# away from the n_splits value configured above.
n_splits = kf.get_n_splits()

oof_preds = np.zeros((X.shape[0],))
preds_xgb = 0
model_fi = 0
mean_rmse = 0

for num, (train_id, valid_id) in enumerate(kf.split(X)):
    # kf.split yields positional row numbers, so rows are selected with .iloc.
    # (.loc is label-based and only coincides with positions while X and y
    # carry the default 0..n-1 index.)
    X_train, X_valid = X.iloc[train_id], X.iloc[valid_id]
    y_train, y_valid = y.iloc[train_id], y.iloc[valid_id]

    # tree_method='gpu_hist' requests GPU training, so this cell presumably
    # needs a GPU-enabled runtime - TODO confirm.
    xgb_model = XGBRegressor(**xgb_params, tree_method='gpu_hist')
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the estimator constructor rather than on fit();
    # confirm against the installed version before upgrading.
    xgb_model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_metric = "rmse",
                  early_stopping_rounds = 100)

    #Mean of the predictions
    preds_xgb += xgb_model.predict(test_X).reshape(-1,) / n_splits

    #Mean of feature importance
    model_fi += xgb_model.feature_importances_ / n_splits

    #Out of Fold predictions
    oof_preds[valid_id] = xgb_model.predict(X_valid)
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_id]))
    print(f"Fold {num} | RMSE: {fold_rmse}")

    mean_rmse += fold_rmse / n_splits

# The value reported here is the average of the per-fold RMSEs, not the RMSE
# computed once over all out-of-fold predictions.
print(f"\nOverall RMSE: {mean_rmse}")

# **Ensembling of XGBoost and CatBoost Regression Predictions**

In [None]:
# Weighted blend of the fold-averaged test predictions:
# 30% CatBoost + 70% XGBoost.
test_preds = 0.3 * preds_cat + 0.7 * preds_xgb

# **Saving the Final Submission File**

In [None]:
# Pair each test row id with its blended prediction and write the submission
# CSV without the DataFrame index column.
output = pd.DataFrame({'id': test_data['id'], 'loss': test_preds})
output.to_csv('submission_ensemble.csv', index=False)

In [None]:
# Score 7.88397 (presumably the submission RMSE - confirm) for the blend 0.7 * XGB + 0.3 * CAT