In [None]:
# Loading libraries

import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Matplotlib defaults for this notebook: draw a grid on every axes.
parameters = {
    'axes.grid': True,
}
plt.rcParams.update(parameters)

from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

In [None]:
# Loading the train and test sets from the competition input folder

train_csv_path = '../input/tabular-playground-series-aug-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-aug-2021/test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# First five rows of the train set as loaded
df_train.head(n = 5)

In [None]:
# Removing 'id' column
# The frame is reassigned instead of modified in place, and a missing 'id' is
# tolerated, so re-running this cell is a no-op rather than a KeyError.

df_train = df_train.drop(columns = 'id', errors = 'ignore')

In [None]:
# Checking 'nulls': True when no column of the train set has a missing value

train_null_counts = df_train.isnull().sum()
train_null_counts.max() == 0

In [None]:
# First five rows of the test set as loaded
df_test.head(n = 5)

In [None]:
# Removing 'id' column
# The frame is reassigned instead of modified in place, and a missing 'id' is
# tolerated, so re-running this cell is a no-op rather than a KeyError.

df_test = df_test.drop(columns = 'id', errors = 'ignore')

In [None]:
# Checking 'nulls': True when no column of the test set has a missing value

test_null_counts = df_test.isnull().sum()
test_null_counts.max() == 0

In [None]:
# Plotting some graphs of random features in train set
# Features are drawn without replacement (no duplicated panels) from a seeded
# generator (same panels on every run); the target column is excluded by name.

N_ROWS, N_COLS = 3, 5

feature_cols = df_train.columns.difference(['loss'], sort = False)
rng = np.random.default_rng(15)
picked_cols = rng.choice(feature_cols,
                         size = min(N_ROWS * N_COLS, feature_cols.size),
                         replace = False)

fig, axes = plt.subplots(N_ROWS, N_COLS, figsize = (15, 10))
for ax, col in zip(axes.ravel(), picked_cols):
    sns.kdeplot(x = df_train[col], ax = ax)
    ax.set_title(col)

# Hide panels that stay empty when there are fewer features than grid cells.
for ax in axes.ravel()[len(picked_cols):]:
    ax.set_visible(False)

fig.tight_layout()
print(f'{len(picked_cols)} graphs of random features in train set')
plt.show()

In [None]:
# Plotting graph of target

fig, ax = plt.subplots(figsize = (15, 5))
sns.histplot(x = df_train['loss'], kde = True, ax = ax)
ax.set_title('Distribution of target (loss)')
# Render the figure explicitly so the cell does not echo the title's Text repr.
plt.show()

In [None]:
# Train features only: every column except the 'loss' target
df_train_no_target = df_train.drop(columns = 'loss')

In [None]:
# Standardization

scaler = StandardScaler()

# The scaler learns its mean / standard deviation from the train features only.
df_train_no_target_scal = pd.DataFrame(scaler.fit_transform(df_train_no_target), columns = df_train_no_target.columns)

# The test set is transformed with the statistics learned from train. Refitting
# on the test set would scale the two sets differently, so the models would be
# asked to predict on features shifted relative to what they were trained on.
# NOTE: df_test is overwritten with its scaled version, so run this cell once
# per load of the test set.
df_test = pd.DataFrame(scaler.transform(df_test), columns = df_test.columns)

In [None]:
# First five rows of the standardized train features
df_train_no_target_scal.head(n = 5)

In [None]:
# First five rows of the test set after scaling
df_test.head(n = 5)

In [None]:
# Model inputs: standardized features and the 'loss' target
X, y = df_train_no_target_scal, df_train['loss']

In [None]:
# Splitting the data into train and hold-out test parts (70% / 30%)

split_options = {'test_size': 0.3, 'random_state': 15}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Parameters tuned with Optuna (see my other notebooks)

# LightGBM regressor settings
lgb_params_by_optuna = {
    'subsample': 0.7999999999999999,
    'colsample_bytree': 0.7,
    'max_depth': 20,
    'reg_alpha': 60,
    'reg_lambda': 40,
    'learning_rate': 0.111233326381852,
    'n_estimators': 100,
    'n_jobs': -1,
    'device': 'gpu',
}

# XGBoost regressor settings
xgb_params_by_optuna = {
    # objective and GPU execution
    'objective': 'reg:tweedie',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'sampling_method': 'gradient_based',
    'n_jobs': -1,
    'max_bin': 512,
    'tweedie_variance_power': 1.0617,
    # boosting size and step
    'n_estimators': 4672,
    'max_depth': 6,
    'eta': 0.009492993593819712,
    # row / column subsampling
    'subsample': 0.4,
    'colsample_bytree': 0.99,
    'colsample_bylevel': 0.81,
    'colsample_bynode': 0.71,
    # regularization
    'min_child_weight': 0.3318029517878049,
    'reg_alpha': 19.745301715782514,
    'reg_lambda': 2810.458713160978,
    'max_delta_step': 149.30736703545205,
    'gamma': 0.08133291267581277,
    'base_score': 0.4767143717295308,
}

In [None]:
# LightGBM: fit on the train part, then predict both parts of the split
lgb_model = lgb.LGBMRegressor(**lgb_params_by_optuna).fit(X_train, y_train)

preds_lgb_train = lgb_model.predict(X_train)
preds_lgb_test = lgb_model.predict(X_test)

In [None]:
# RMSE score of the LightGBM model on both parts of the split

lgb_rmse_test = np.sqrt(mean_squared_error(y_test, preds_lgb_test))
lgb_rmse_train = np.sqrt(mean_squared_error(y_train, preds_lgb_train))

print(f" Test RMSE score:     {lgb_rmse_test}")
print(f" Train RMSE score:    {lgb_rmse_train}")

In [None]:
def check_model(model, n_splits = 10, random_state = 15):
    """Print the mean and standard deviation of a model's K-fold RMSE.

    The notebook-level feature frame ``X`` and target series ``y`` are split
    into shuffled folds; the model is fitted on the training part of each fold
    and scored (RMSE) on the held-out part.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on every fold, so after the call it is left fitted on
        the training part of the last fold.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default 15
        Seed for the fold shuffling. A fixed seed makes the scores repeatable
        and puts every model passed to this function on the same folds; pass
        ``None`` for a different random partition on each call.

    Returns
    -------
    None
        The summary is printed.
    """
    cv = KFold(n_splits = n_splits, shuffle = True, random_state = random_state)

    scores = []
    for fit_idx, valid_idx in cv.split(X):
        model.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        fold_preds = model.predict(X.iloc[valid_idx])
        scores.append(np.sqrt(mean_squared_error(y.iloc[valid_idx], fold_preds)))

    print('************************************')
    print(f"Mean RMSE score:       {np.mean(scores)}")
    print(f"Std RMSE:              {np.std(scores)}")

In [None]:
# Mean RMSE score of the LightGBM model over the cross-validation folds

check_model(lgb_model)

In [None]:
# XGBoost: fit on the train part, then predict both parts of the split
xgb_model = xgb.XGBRegressor(**xgb_params_by_optuna).fit(X_train, y_train)

preds_xgb_train = xgb_model.predict(X_train)
preds_xgb_test = xgb_model.predict(X_test)

In [None]:
# RMSE score of the XGBoost model on both parts of the split

xgb_rmse_test = np.sqrt(mean_squared_error(y_test, preds_xgb_test))
xgb_rmse_train = np.sqrt(mean_squared_error(y_train, preds_xgb_train))

print(f" Test RMSE score:     {xgb_rmse_test}")
print(f" Train RMSE score:    {xgb_rmse_train}")

In [None]:
# Mean RMSE score of the XGBoost model over the cross-validation folds

check_model(xgb_model)

In [None]:
# Weighted ensemble of the two boosters (LightGBM 0.3, XGBoost 0.7).
# It is only constructed here, not fitted in this cell.
ensemble_members = [('lgbm', lgb_model), ('xgb', xgb_model)]
ensemble_weights = [0.3, 0.7]

voting_model = VotingRegressor(estimators = ensemble_members, weights = ensemble_weights)

In [None]:
# Mean RMSE score of the voting ensemble over the cross-validation folds

check_model(voting_model)

In [None]:
# Side-by-side preview of each model's predictions on the competition test set
sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv').drop(columns = 'loss')

preview_columns = {
    'lgb_preds': lgb_model,
    'xgb_preds': xgb_model,
    'voting_preds': voting_model,
}
for column_name, fitted_model in preview_columns.items():
    sub[column_name] = fitted_model.predict(df_test)

sub.sample(5)

In [None]:
# Saving predictions

sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# One submission file per model, written in this order; the 'loss' column is
# overwritten before each file is saved. Each model is used in whatever fitted
# state the preceding cells left it in.
submission_files = [
    (lgb_model, 'lgb_7.8555.csv'),
    (xgb_model, 'xgb_7.8352.csv'),
    (voting_model, 'voting_7.835.csv'),
]
for fitted_model, csv_name in submission_files:
    sub['loss'] = fitted_model.predict(df_test)
    sub.to_csv(csv_name, index = False)