


<div style="display:fill;
           background-color:#3b48c710;
           letter-spacing:0.5px;border-bottom: 2px solid white;">
<img src="https://images.unsplash.com/photo-1582623838120-455da222cdc7?q=80&h=500&w=2000&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D">
    
<H1 style="padding: 10px; color:white; font-weight:600;font-family: 'Garamond', 'Lucida Sans', sans-serif; text-align: center; font-size: 42px;">Regression with an Abalone Dataset</H1>
</div>


In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import os
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("dark") # Theme for plots as Dark
sns.set_palette("viridis")
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, mean_squared_log_error
from sklearn.svm import OneClassSVM
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor, HistGradientBoostingRegressor, IsolationForest
import optuna
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from catboost import Pool, CatBoostRegressor, cv
import sys
from tqdm import tqdm

<div style="background-color: #8b888711; padding: 16px; border-radius: 12px; border: 2px solid white;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: white; font-weight: bold; font-size: 42px;">Dataset Overview</h1>

</div>

In [None]:
# Shared schema: eight predictors followed by the "Rings" target.
columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']

# Competition files are indexed by their "id" column.
train_data = pd.read_csv("/kaggle/input/playground-series-s4e4/train.csv", index_col="id")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e4/test.csv", index_col="id")

# NOTE(review): read with pandas' default header inference, so the first line of
# abalone.data is consumed as a header row -- confirm the file really has one.
orig_data = pd.read_csv("/kaggle/input/ps-4-e-2-abalone-dataset-from-uci/abalone.data")

# Give all three frames the same column names (the test set has no target).
orig_data.columns = columns
train_data.columns = columns
test_data.columns = columns[:-1]

# Stack the original data under the competition rows and renumber from 0.
train_data = pd.concat([train_data, orig_data], ignore_index=True)

In [None]:
# Preview the first rows of the combined training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

<div style="background-color: #8b888711; padding: 16px; border-radius: 12px; border: 2px solid white;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: white; font-weight: bold; font-size: 42px;">Data Preprocessing & Feature Engineering</h1>

</div>

In [None]:
# Learn the Sex -> integer mapping from the training labels only, then apply
# the same mapping to both frames.
le = LabelEncoder().fit(train_data["Sex"])
train_data["Sex"] = le.transform(train_data["Sex"])
test_data["Sex"] = le.transform(test_data["Sex"])

# Bound Height to [0.01, 0.5] in both frames.
train_data["Height"] = train_data["Height"].clip(lower=0.01, upper=0.5)
test_data["Height"] = test_data["Height"].clip(lower=0.01, upper=0.5)

In [None]:
# isoForest = IsolationForest(n_estimators=1000 ,bootstrap=True)
# outliers = isoForest.fit(combined_data.drop(["Sex","Rings"],axis=1))
# outliers_train = isoForest.predict(train_data.drop(["Sex","Rings"],axis=1))
# outliers_test = isoForest.predict(test_data.drop(["Sex"],axis=1))

In [None]:
# print("> Shape before Outlier Removal",train_data.shape)

# THRESHOLD = 2.5
# for i in train_data.select_dtypes("float").columns:
#     if i == "Rings":
#         continue
#     Q1 = combined_data[i].quantile(0.25)
#     Q3 = combined_data[i].quantile(0.75)
#     IQR = Q3-Q1
#     upper_limit = Q3+THRESHOLD*IQR
#     lower_limit = Q1-THRESHOLD*IQR
# #     train_data[i] = train_data[i].clip(upper = upper_limit,lower = lower_limit)
#     test_data[i] = test_data[i].clip(upper = upper_limit,lower = lower_limit)
#     train_data = train_data.query(f"{i}<={upper_limit} & {i}>={lower_limit}")
    
# print("> Shape after Outlier Removal",train_data.shape)
# print("No change bcoz only Extreme Outliers are clipped")

In [None]:
# combined_data = pd.concat([train_data,test_data])

# cols = list(train_data.columns)
# cols.remove("Sex")
# cols.remove("Rings")

# quanTransformer = QuantileTransformer(n_quantiles=2000,output_distribution="normal")
# quanTransformer.fit(combined_data[cols])

# qtransTrain = quanTransformer.transform(train_data[cols])+6
# qtransTrain = pd.DataFrame(qtransTrain,columns=cols)
# qtransTrain["Sex"] = train_data["Sex"]
# qtransTrain["Rings"] = train_data["Rings"]
# train_data = qtransTrain

# qtransTest = quanTransformer.transform(test_data[cols])+6
# qtransTest = pd.DataFrame(qtransTest,columns=cols)
# qtransTest["Sex"] = test_data["Sex"]
# test_data = qtransTest

In [None]:
# (numerator, denominator) weight columns whose log-ratio becomes a feature.
allCombinations = [('Shucked_weight', 'Viscera_weight'), ('Shell_weight', 'Shucked_weight'), ('Viscera_weight', 'Shell_weight'), ('Shucked_weight', 'Whole_weight'), ('Viscera_weight', 'Whole_weight'), ('Shell_weight', 'Whole_weight')]


def add_engineered_features(frame, weight_pairs=allCombinations):
    """Add the log weight-ratio, volume and density columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain Length, Diameter, Height, Whole_weight and every column
        named in ``weight_pairs``.
    weight_pairs : iterable of (str, str)
        (numerator, denominator) column names; each name is expected to end
        with the 7-character suffix "_weight", which is stripped when building
        the new column name.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with the new columns appended.
    """
    suffix_len = len("_weight")
    for num, den in weight_pairs:
        ratio_name = f"{num[:-suffix_len]}/{den[:-suffix_len]}_log"
        frame[ratio_name] = np.log(frame[num] / frame[den])

    # Product of the three size measurements, its log, and weight per unit of it.
    frame["Volume"] = frame["Length"] * frame["Diameter"] * frame["Height"]
    frame["Volume_log"] = np.log(frame["Volume"])
    frame["Density"] = frame["Whole_weight"] / frame["Volume"]
    return frame


train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

In [None]:
# Summary statistics of the training frame after feature engineering.
train_data.describe()

<div style="background-color: #8b888711; padding: 16px; border-radius: 12px; border: 2px solid white;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: white; font-weight: bold; font-size: 42px;">Exploratory Data Analysis</h1>

</div>

In [None]:
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr_matrix = train_data.corr()

# Non-zero on and above the diagonal, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(20, 12))
sns.heatmap(corr_matrix, cmap="viridis_r", annot=True, mask=mask, vmin=-1, vmax=1);

In [None]:
# One row of plots per column of train_data: the grid size is derived from the
# frame instead of being hard-coded, so adding or removing a feature no longer
# breaks the cell. 2.5 inches per row reproduces the original 12x45 figure
# when there are 18 columns.
n_features = len(train_data.columns)
fig, axes = plt.subplots(
    n_features, 2,
    figsize=(12, 2.5 * n_features),
    width_ratios=(2, 1),
    squeeze=False,  # keep axes 2-D even if there is a single column
)

for ind, col in enumerate(train_data.columns):
    left_ax, right_ax = axes[ind]
    if col == "Sex":
        # Encoded category: target distribution per class, then class counts.
        sns.violinplot(x=col, y='Rings', data=train_data, ax=left_ax)
        sns.countplot(data=train_data, x=col, hue=col, ax=right_ax)
    else:
        # Numeric column: histogram with KDE, then a box plot.
        sns.histplot(train_data[col], kde=True, bins=40, ax=left_ax)
        sns.boxplot(train_data[col], ax=right_ax)
    # As in the original layout, the column name titles the right-hand panel.
    right_ax.set_title(col)

fig.tight_layout()

<div style="background-color: #8b888711; padding: 16px; border-radius: 12px; border: 2px solid white;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: white; font-weight: bold; font-size: 42px;">Training Models</h1>

</div>

In [None]:
# np.random.seed() returns None, so `seed = np.random.seed(6)` left `seed` as
# None and every later `random_state=seed` was effectively unseeded.
# Keep the integer, and still seed NumPy's global generator with it.
seed = 6
np.random.seed(seed)

# Feature matrix and target used by all the models below.
X = train_data.drop(columns=["Rings"])
y = train_data["Rings"]

In [None]:
# Display the test frame after feature engineering.
test_data

<h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #BBBBBB; font-weight: bold; font-size: 36px;">
   4.1 Baseline Models
</h1>
<hr style="color: #BBBBBB;">

In [None]:
# Baseline LightGBM with library defaults, scored by 4-fold CV.
lgbmmodel = LGBMRegressor(random_state=seed, verbose=-1)
lgbm_neg_msle = cross_val_score(lgbmmodel, X, y, cv=4, scoring='neg_mean_squared_log_error')
# The scorer returns negated MSLE per fold; flip the sign of the mean and take the root.
print("CV RMSLE score of LGBM is ", np.sqrt(-lgbm_neg_msle.mean()))

In [None]:
# Baseline XGBoost with library defaults, scored by 4-fold CV.
xgbmodel = XGBRegressor(random_state=seed)
xgb_neg_msle = cross_val_score(xgbmodel, X, y, cv=4, scoring='neg_mean_squared_log_error')
# The scorer returns negated MSLE per fold; flip the sign of the mean and take the root.
print("CV RMSLE score of XGB is ", np.sqrt(-xgb_neg_msle.mean()))

In [None]:
# Baseline CatBoost with library defaults (silent), scored by 4-fold CV.
catmodel = CatBoostRegressor(random_state=seed, verbose=0)
cat_neg_msle = cross_val_score(catmodel, X, y, cv=4, scoring='neg_mean_squared_log_error')
# The scorer returns negated MSLE per fold; flip the sign of the mean and take the root.
print("CV RMSLE score of CAT is ", np.sqrt(-cat_neg_msle.mean()))

<h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #BBBBBB; font-weight: bold; font-size: 36px;">
   4.2 Generating Extra Training Data from Test Data
</h1>
<hr style="color: #BBBBBB;">

In [None]:
# Pseudo-labelling: repeatedly fit the three baselines, label the test rows on
# which they agree and whose mean prediction sits close to an integer, and add
# those rows to the training set.
cols = test_data.columns

# A mean prediction is kept only if its fractional part is within this
# distance of a whole number.
MEAN_THRESHOLD = 0.2

for fold in range(8):
    print(f"> Generating Fold {fold+1}")
    print(f"  Initial Size = {train_data.shape[0]}",end=" | ")

    # Refit every baseline on the current (possibly already augmented) data.
    lgbmmodel.fit(X,y)
    xgbmodel.fit(X,y)
    catmodel.fit(X,y)

    # Score the test features with each model and attach the predictions.
    test_features = test_data[cols]
    extra_train = test_data.assign(
        LGBM=lgbmmodel.predict(test_features),
        XGB=xgbmodel.predict(test_features),
        CAT=catmodel.predict(test_features),
    )
    model_preds = extra_train[["LGBM","XGB","CAT"]]
    extra_train["STD"] = np.std(model_preds, axis=1)
    extra_train["MEAN"] = np.mean(model_preds, axis=1)

    # Keep the 60% of rows with the lowest disagreement between the models.
    STD_THRESHOLD = extra_train["STD"].quantile(0.6)
    extra_train = extra_train.loc[extra_train["STD"] <= STD_THRESHOLD]

    # Of those, keep rows whose mean lies just above, then just below, an
    # integer (the two groups are concatenated in that order).
    frac_part = extra_train["MEAN"] % 1
    extra_train = pd.concat([
        extra_train.loc[frac_part < MEAN_THRESHOLD],
        extra_train.loc[frac_part > (1 - MEAN_THRESHOLD)],
    ])
    extra_train["Rings"] = np.round(extra_train["MEAN"])

    # Append the pseudo-labelled rows. Duplicates are judged on all columns,
    # Rings included, so a test row that receives a different rounded label in
    # a later iteration is added again.
    train_data = (
        pd.concat([train_data, extra_train[train_data.columns]])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(f"Final Size = {train_data.shape[0]}\n")

    y = train_data["Rings"]
    X = train_data.drop(columns=["Rings"])

In [None]:
# Shuffle all rows (the original index labels are kept), then rebuild the
# target and feature views from the shuffled frame.
train_data = train_data.sample(frac=1.0)
y = train_data["Rings"]
X = train_data.drop(columns=["Rings"])

In [None]:
# Re-score the same three baseline estimators on the augmented training set.
for message, model in (
    ("New CV RMSLE score of LGBM is ", lgbmmodel),
    ("New CV RMSLE score of XGB is ", xgbmodel),
    ("New CV RMSLE score of CAT is ", catmodel),
):
    neg_msle = cross_val_score(model, X, y, cv=4, scoring='neg_mean_squared_log_error')
    print(message, np.sqrt(-neg_msle.mean()))

<h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #BBBBBB; font-weight: bold; font-size: 36px;">
   4.3 Optuna-Tuning Models
</h1>
<hr style="color: #BBBBBB;">

In [None]:
# # LGBM 
# def objective(trial):
#     lgbm_params = {
#         "random_state": seed,
#         'n_estimators' : 5000,        
#         "max_depth":trial.suggest_int('max_depth',5,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-3, 0.1, log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight', 0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
#         "subsample" : trial.suggest_float('subsample', 0.2, 1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves', 8, 64),
#         'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
#         'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
#         "metric": trial.suggest_categorical("metric", ["rmse","huber","quantile"]),
#         "boosting_type": "gbdt",    
#         "objective":'regression',
#         "device": "gpu",
#         "verbose": -1,
#         "early_stopping_rounds" : 1000
#     }
#     score = []
#     for i,(tr,val) in tqdm(enumerate(RepeatedStratifiedKFold(n_splits=4, n_repeats=1,random_state=seed).split(X,y)),total = 4):
#         X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]

#         lgbmmodel = LGBMRegressor(**lgbm_params)
#         lgbmmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)], eval_names=["valid"],eval_metric=['MSLE'])
#         msle = mean_squared_log_error(y_test, lgbmmodel.predict(X_test))
#         rmsle = np.sqrt(msle)
#         score.append(rmsle)
#     print(f" > RMSLE of LGBM =", score, file = sys.stderr)
#     return np.mean(score)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=5000)

In [None]:
# LightGBM configuration used for the final cross-validated fit.
lgbm_params = {
    # --- fixed settings ---
    "n_estimators": 15000,
    "random_state": seed,
    "boosting_type": "gbdt",
    "objective": "regression",
    "device": "gpu",
    "verbose": -1,
    "early_stopping_rounds": 4000,
    # --- tuned values; presumably the best trial of the commented-out Optuna
    # study above (TODO confirm) ---
    "max_depth": 9,
    "learning_rate": 0.0754689136929529,
    "min_child_weight": 2.9774820924588674,
    "min_child_samples": 172,
    "subsample": 0.749283862376052,
    "subsample_freq": 0,
    "colsample_bytree": 0.5668465666039963,
    "num_leaves": 18,
    "lambda_l1": 4.011146777594568e-05,
    "lambda_l2": 0.18342984449081373,
    "metric": "huber",
}

In [None]:
# # XGB 
# def objective(trial):
#     xgb_params = {
#         'n_estimators' : 5000,
#         'max_depth':  trial.suggest_int('max_depth',3,8),
#         "max_bin": trial.suggest_int('max_bin',128,512),
#         'subsample': trial.suggest_float('subsample', 0.2, 1),
#         'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
#         'gamma': trial.suggest_float("gamma", 1e-4, 1.0,log = True),
#         'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
#         'min_child_weight': trial.suggest_float('min_child_weight', 2,4),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-3, 0.2,log=True),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
#         "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
#         "grow_policy" : trial.suggest_categorical("grow_policy",["depthwise","lossguide"]),
#         "objective" : trial.suggest_categorical("objective",["reg:quantileerror","reg:squaredlogerror","reg:squarederror"]),
#         "tree_method" : "gpu_hist",
#         "early_stopping_rounds" : 1000,
#         "random_state" : seed,
#         "eval_metric": "rmsle",
#         "verbosity" :  0,
#     }
#     if xgb_params["objective"] == "reg:quantileerror":
#         xgb_params["quantile_alpha"] = trial.suggest_float('quantile_alpha', 0.1, 1.0, log=True)

#     score = []
#     for i,(tr,val) in tqdm(enumerate(RepeatedStratifiedKFold(n_splits=4, n_repeats=1,random_state=seed).split(X,y)),total = 4):
#         X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]

#         xgbmodel = XGBRegressor(**xgb_params)
#         xgbmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)],verbose=0,
#                      callbacks=[EarlyStopping(rounds = xgb_params["early_stopping_rounds"],save_best=True)])

#         msle = mean_squared_log_error(y_test, xgbmodel.predict(X_test))
#         rmsle = np.sqrt(msle)
#         score.append(rmsle)
#     print(f" > RMSLE of XGB =", score, file = sys.stderr)
#     return np.mean(score)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=5000)

In [None]:
# XGBoost configuration used for the final cross-validated fit.
xgb_params = {
    "n_estimators": 15000,
    # --- tuned values; presumably the best trial of the commented-out Optuna
    # study above (TODO confirm) ---
    "max_depth": 7,
    "max_bin": 461,
    "subsample": 0.8670561876025071,
    "alpha": 0.5241810316500617,
    "gamma": 0.00496109442006053,
    "lambda": 7.078493302377542e-07,
    "min_child_weight": 2.5614435070408734,
    "learning_rate": 0.001632097029810397,
    "colsample_bytree": 0.5467512442909263,
    "colsample_bylevel": 0.903483301595809,
    "colsample_bynode": 0.5754876788766643,
    "grow_policy": "depthwise",
    "objective": "reg:squarederror",
    # --- fixed settings ---
    "tree_method": "gpu_hist",
    "early_stopping_rounds": 4000,
    "random_state": seed,
    "eval_metric": "rmsle",
    "verbosity": 0,
}

In [None]:
# def objective(trial):
#     cat_params = {
#         "iterations": 5000,
#         "verbose": False,
#         'depth': trial.suggest_int('depth', 6, 20), 
#         'max_bin': trial.suggest_int("max_bin", 20, 256), 
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.5, 8.0), 
#         "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 100),         
#         'random_strength': trial.suggest_float('random_strength', 0.5, 5.0), 
#         "learning_rate": trial.suggest_float('learning_rate', 1e-2, 0.2, log=True), 
#         "max_leaves": trial.suggest_int('max_leaves', 8, 256), 
# #         "sampling_unit": trial.suggest_categorical("sampling_unit",["Object","Group"]),
#         "eval_metric": trial.suggest_categorical("eval_metric",["RMSE","Quantile","MSLE"]),
#         "loss_function": trial.suggest_categorical("loss_function",["RMSE","Quantile"]),
# #         "grow_policy": trial.suggest_categorical('grow_policy', ['Lossguide', 'SymmetricTree']), 
# #         "sampling_frequency": trial.suggest_categorical('sampling_frequency', ['PerTree', 'PerTreeLevel']), 
# #         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "Poisson"]), # Poisson only on GPU
#         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bernoulli", "Poisson"]), # Poisson only on GPU
#         "task_type": "GPU",
#         "od_type": "Iter",
#         "random_state": seed,
#         "early_stopping_rounds": 1000,
#         "grow_policy": 'Lossguide' 
        
#     }
#     #     cat_params = {
# #         'depth': trial.suggest_int('depth', 3, 12), 
# #         'max_bin': trial.suggest_int("max_bin", 20, 500), 
# #         "max_leaves": trial.suggest_int('max_leaves', 4, 48), 
# #         "learning_rate": trial.suggest_float('learning_rate', 1e-4, 0.2), 
# #         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0), 
# #         'random_strength': trial.suggest_float('random_strength', 0.1, 10.0), 
# #         "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 100), 
# # #         "grow_policy": trial.suggest_categorical('grow_policy', ['Lossguide', 'SymmetricTree']), 
# # #         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]), 
# #         "grow_policy": 'Lossguide',
# #         "bootstrap_type": "Bernoulli", 
# #         "subsample": trial.suggest_float("subsample", 0.2, 1)
# #     }
    
#     if cat_params["bootstrap_type"] == "Bayesian":
#         cat_params["bagging_temperature"] = trial.suggest_float('bagging_temperature', 0.1, 100.0)
#     else:
#         cat_params["subsample"] = trial.suggest_float("subsample", 0.2, 1.0)
            
# #     if cat_params["grow_policy"] == "Lossguide":
        
#     score = []
#     for i,(tr,val) in tqdm(enumerate(RepeatedStratifiedKFold(n_splits=5, n_repeats=1,random_state=seed).split(X,y)),total = 5):
#         X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]
        
#         train_dataset = Pool(data=X.iloc[tr,:],label=y.iloc[tr])
#         eval_dataset = Pool(data=X.iloc[val,:],label=y.iloc[val])
    
#         catmodel = CatBoostRegressor(**cat_params)
#         catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
        
#         msle = mean_squared_log_error(y.iloc[val], catmodel.predict(X.iloc[val,:]))
#         rmsle = np.sqrt(msle)
#         score.append(rmsle)

#     print(f" > RMSLE of CAT =", score, file = sys.stderr)
#     return np.mean(score)
    
    
# study = optuna.create_study(direction='minimize') 
# study.optimize(objective, n_trials=100,timeout=8000)  

In [None]:
# CatBoost configuration used for the final cross-validated fit.
cat_params = {
    "iterations": 15000,
    "verbose": False,
    # --- tuned values; presumably the best trial of the commented-out Optuna
    # study above (TODO confirm) ---
    "depth": 9,
    "max_bin": 256,
    "l2_leaf_reg": 7.790295416219466,
    "min_data_in_leaf": 75,
    "random_strength": 4.976636729079349,
    "learning_rate": 0.010899898540303064,
    "max_leaves": 195,
    # The evaluation metric ("Quantile") differs from the training loss ("RMSE").
    "eval_metric": "Quantile",
    "loss_function": "RMSE",
    "bootstrap_type": "Bernoulli",
    # --- fixed settings ---
    "grow_policy": "Lossguide",
    "task_type": "GPU",
    "random_state": seed,
    "early_stopping_rounds": 4000,
}

<h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #BBBBBB; font-weight: bold; font-size: 36px;">4.4 Tuning Ensembling Weights</h1>
<hr style="color: #BBBBBB;">

In [None]:
# def objective(trial):

#     xgb_wt =  trial.suggest_float('xgb_wt',0,10)
#     lgbm_wt = trial.suggest_float('lgbm_wt',0,10)
#     cat_wt = trial.suggest_float('cat_wt',0,10)
#     RMSLE = []

#     for i,(tr,val) in tqdm(enumerate(RepeatedStratifiedKFold(n_splits=4, n_repeats=1,random_state=seed).split(X,y)),total = 4):

#         X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]
        
#         print(f"\nLGBM_{i+1}",end=" | ", file = sys.stderr)
#         lgbmmodel = LGBMRegressor(**lgbm_params)
#         lgbmmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)], eval_names=["valid"],eval_metric=['MSLE'])

#         print(f"CAT_{i+1}",end=" | ", file = sys.stderr)
#         train_dataset = Pool(data=X.iloc[tr,:],label=y.iloc[tr])
#         eval_dataset = Pool(data=X.iloc[val,:],label=y.iloc[val])
#         catmodel = CatBoostRegressor(**cat_params)
#         catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)

#         print(f"XGB_{i+1}", end = "", file = sys.stderr)
#         xgbmodel = XGBRegressor(**xgb_params)
#         xgbmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)],verbose = 0,callbacks=[EarlyStopping(rounds = 2500,save_best=True)])

#         xgb_preds = xgbmodel.predict(X_test)
#         lgbm_preds = lgbmmodel.predict(X_test)
#         cat_preds = catmodel.predict(X_test)

#         preds = ((xgb_wt*xgb_preds)+(lgbm_wt*lgbm_preds)+(cat_wt*cat_preds))/(xgb_wt+cat_wt+lgbm_wt)
#         msle = mean_squared_log_error(y_test, preds)

#         RMSLE.append(np.sqrt(msle))
#     return np.mean(RMSLE)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=200,timeout=5000)

In [None]:
# Relative blend weights of the three models; presumably the best values from
# the commented-out Optuna weight study above (TODO confirm).
xgb_wt, lgbm_wt, cat_wt = 4.9723780027756765, 5.131118018542441, 5.944096910451613

# Normaliser that turns the weighted sum of predictions into a weighted mean.
total_wt = xgb_wt + lgbm_wt + cat_wt

<h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #BBBBBB; font-weight: bold; font-size: 36px;">4.5 Out-of-Fold Predictions LGBM + CatBoost + XGB</h1>
<hr style="color: #BBBBBB;">

In [None]:
# Submission frame: one row per test id, with a zeroed accumulator for the
# weighted predictions that the training loop adds to.
submission = pd.DataFrame({"id": test_data.index, "Rings": 0})

In [None]:
# Train LGBM, CatBoost and XGBoost on every fold, score each on the held-out
# part, and accumulate their weighted test predictions in submission["Rings"].
SPLITS = 5
REPEATS = 1
lgbm_score, cat_score, xgb_score = [], [], []

fold_splitter = RepeatedStratifiedKFold(n_splits=SPLITS, n_repeats=REPEATS, random_state=seed)

for i, (tr, val) in enumerate(fold_splitter.split(X, y)):

    print("-"*30,f"FOLD {i+1}/{SPLITS*REPEATS}","-"*30)
    X_train, y_train = X.iloc[tr, :], y.iloc[tr]
    X_test, y_test = X.iloc[val, :], y.iloc[val]

    # ---- LightGBM -------------------------------------------------------
    print("\n->","LGBM:")
    lgbmmodel = LGBMRegressor(**lgbm_params)
    lgbmmodel.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_names=["valid"], eval_metric=['MSLE'])
    rmsle = np.sqrt(mean_squared_log_error(y_test, lgbmmodel.predict(X_test)))
    lgbm_score.append(rmsle)
    print(f"Fold {i+1} RMSLE of LGBM =", rmsle,"\n")
    submission["Rings"] += lgbm_wt*lgbmmodel.predict(test_data)

    # ---- CatBoost -------------------------------------------------------
    print("\n->","CAT:")
    train_dataset = Pool(data=X_train, label=y_train)
    eval_dataset = Pool(data=X_test, label=y_test)

    catmodel = CatBoostRegressor(**cat_params)
    catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
    rmsle = np.sqrt(mean_squared_log_error(y_test, catmodel.predict(X_test)))
    cat_score.append(rmsle)
    print(f"Fold {i+1} RMSLE of CAT =", rmsle,"\n")
    submission["Rings"] += cat_wt*catmodel.predict(test_data)

    # ---- XGBoost --------------------------------------------------------
    print("\n->","XGB:")
    xgbmodel = XGBRegressor(**xgb_params)
    # An explicit EarlyStopping callback with save_best=True is passed in
    # addition to the early_stopping_rounds entry in xgb_params.
    xgbmodel.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=0,
        callbacks=[EarlyStopping(rounds=4000, save_best=True)],
    )
    rmsle = np.sqrt(mean_squared_log_error(y_test, xgbmodel.predict(X_test)))
    xgb_score.append(rmsle)
    print(f"Fold {i+1} RMSLE of XGB =", rmsle,"\n")
    submission["Rings"] += xgb_wt*xgbmodel.predict(test_data)

# Mean per-fold RMSLE of each model.
print("\n\n","-"*50,sep="")
print("CV score of LGBM is ", np.mean(lgbm_score))
print("CV score of CAT is ", np.mean(cat_score))
print("CV score of XGB is ", np.mean(xgb_score))

<div style="background-color: #8b888711; padding: 16px; border-radius: 12px; border: 2px solid white;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: white; font-weight: bold; font-size: 42px;"> Blending Output</h1>

</div>

In [None]:
# Turn the accumulated weighted sum into a weighted average over all folds and models.
# NOTE: not idempotent -- re-running this cell divides the predictions again.
submission["Rings"] = submission["Rings"].div(SPLITS * REPEATS * total_wt)

In [None]:
# Predictions from two public notebooks, blended with this notebook's output below.
public_work1 = pd.read_csv("/kaggle/input/ps4e4-prediction-generalization-regression/submission.csv")
public_work2 = pd.read_csv("/kaggle/input/random-search-neural-network-abalone/submission.csv")

# Align the public predictions on "id" rather than on row position: a plain
# column assignment would silently pair the wrong rows if either file were
# ordered differently from test_data.
submission["PB1"] = submission["id"].map(public_work1.set_index("id")["Rings"])
submission["PB2"] = submission["id"].map(public_work2.set_index("id")["Rings"])

# An id missing from a public file would put NaN into the final blend, so fail loudly.
assert submission[["PB1", "PB2"]].notna().all().all(), "public submissions do not cover every test id"

# This notebook's own prediction rounded to the nearest whole number of rings.
submission["Round"] = np.round(submission["Rings"])

# Visual check of how the four prediction columns relate to each other.
sns.pairplot(submission.drop(["id"],axis=1));

In [None]:
# rounded_rings = []
# for i in range(len(submission)):
#     r = submission.iloc[i]["Rings"]
#     if (r%1<=0.2 or r%1>=0.8):
#         rounded_rings.append(round(r))
#     else:
#         rounded_rings.append(r)
        
# submission["Rings"] = rounded_rings

In [None]:
# Final blend: 40% own ensemble, 10% its rounded value, 25% each public submission.
submission["Rings"] = (
    submission["Rings"].mul(0.4)
    .add(submission["Round"].mul(0.1))
    .add(submission["PB1"].mul(0.25))
    .add(submission["PB2"].mul(0.25))
)

# Write only the two columns required in the submission file.
submission.loc[:, ["id", "Rings"]].to_csv("submission.csv", header=True, index=False)