In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LinearRegression

from catboost import CatBoostRegressor
import catboost

from xgboost import XGBRegressor

from lightgbm import LGBMRegressor

from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, one per line.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set, the test set and the sample submission (in that
# order) from the competition's input directory.
train, test, samp_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv",
    )
)

In [None]:
# Display the (rows, columns) shape of the training frame as a quick sanity check.
train.shape

In [None]:
# Feature matrix: every training column except the row identifier and the target.
X = train.drop(columns=['id', 'loss'])
# Regression target.
y = train.loc[:, 'loss']

In [None]:
# Standardise the features.
# The scaler is fitted on the training features ONLY and the same fitted
# transformation is then applied to the test features. Fitting a second,
# independent scaler on the test set (as was done before) standardises the
# test data with different mean/std values than the ones the models were
# trained on, so the two feature spaces no longer line up.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
Test_scaled = scaler.transform(test.drop(['id'], axis = 1))

#### Check the percentage of variance explained depending on the number of components
##### Spoiler: as linear as possible :)

In [None]:
# For each candidate number of principal components, fit a PCA on the scaled
# training features and report the total explained-variance ratio.
# Note: the printed value is the summed ratio (a fraction between 0 and 1),
# not a value multiplied by 100.
number = [10, 30, 60, 80, 100]
report = 'Number of components {}, Percentage of variance explained {}'
for n_components in number:
    pca = PCA(n_components=n_components).fit(X_scaled)
    explained = sum(pca.explained_variance_ratio_)
    print(report.format(n_components, explained))

#### Split the data into train/test sets (uncomment if needed)

In [None]:
# Hold-out split used for fitting and evaluating the stacked model.
#
# NOTE(review): only rows 29000-29999 (1,000 samples) of the training data are
# used here, and the stacked model fitted on `X_train` is also the one that
# later predicts the competition test set. This looks like a quick-experiment
# setting -- widen SAMPLE_ROWS (e.g. slice(None)) for a real run.
SAMPLE_ROWS = slice(29000, 30000)

# Fixed seed so the shuffle -- and therefore the reported metrics -- are
# reproducible across "Restart & Run All".
SPLIT_SEED = 0

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled[SAMPLE_ROWS],
    y[SAMPLE_ROWS],
    shuffle=True,
    random_state=SPLIT_SEED,
)

#### Run grid search and measure the time

In [None]:
%%time
# CatBoost base learner for the stack.
# Hyper-parameters are set by hand here; a disabled grid-search sketch is kept
# at the bottom of the cell for reference.
cat_params = dict(
    loss_function='RMSE',
    learning_rate=0.01,
    depth=4,
    n_estimators=350,
    logging_level='Silent',
)
Cat = CatBoostRegressor(**cat_params)

# --- disabled grid search (kept for reference) ------------------------------
# NOTE(review): the snippet refers to `model`, which is not defined in the
# visible cells -- presumably it should be `Cat`; confirm before re-enabling.
#
# cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )
#
# grid = {'learning_rate': [0.1],
#         'depth': [4, 6, 10],
#         'n_estimators': [500, 1500, 2000]}
#
# grid_search_result = model.grid_search(grid,
#                                        X=X_train,
#                                        y=Y_train,
#                                        cv = cv_split,
#                                        plot=True)
#
# model.fit(X_train, Y_train,
#           verbose=False)

In [None]:
%%time
# LightGBM base learner for the stack.
#
# `metric` was previously 'auc', which is a classification metric and does not
# match `objective='regression'`. It is now 'rmse', consistent with the RMSE
# loss used by the CatBoost learner. No evaluation set is passed to `fit` in
# the visible cells, so this presumably only affects evaluation reporting, not
# the fitted trees -- TODO confirm.
LGBM = LGBMRegressor(boosting_type='gbdt',
                     objective='regression',
                     metric='rmse',
                     n_estimators=310,
                     max_depth=8,
                     learning_rate=0.001,
                     n_jobs=-1)

# --- disabled grid search (kept for reference) ------------------------------
# NOTE(review): GridSearchCV is not among the imports visible at the top of the
# notebook; add `from sklearn.model_selection import GridSearchCV` before
# re-enabling.
#
# cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )
#
# grid_n_estimator = [310]
# grid_learn = [.001]
#
# LGBM = LGBMRegressor(boosting_type='gbdt',
#                      objective='regression',
#                      metric='rmse',
#                      n_jobs = -1)
# G_LGBM = GridSearchCV(LGBM, param_grid= {'learning_rate': grid_learn,
#                                          'n_estimators': grid_n_estimator,
#                                          'max_depth': [8]},
#                    cv=cv_split)
#
# G_LGBM.fit(X_train, Y_train)
# print('Best Parameters: ', G_LGBM.best_params_)
# pred = G_LGBM.predict(X_test)

In [None]:
%%time
# XGBoost base learner for the stack, with hand-set hyper-parameters.
xgb_params = {
    'n_estimators': 600,
    'max_depth': 4,
    'learning_rate': 0.013,
    'n_jobs': -1,
}
XGB = XGBRegressor(**xgb_params)

# --- disabled grid search (kept for reference) ------------------------------
# NOTE(review): GridSearchCV is not among the imports visible at the top of the
# notebook; import it before re-enabling.
#
# cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )
#
# grid_n_estimator = [400, 500, 600]
# grid_learn = [.001]
#
# XGB = XGBRegressor()
# G_XGB = GridSearchCV(XGB, param_grid= {'learning_rate': grid_learn,
#                                        'max_depth': [4],
#                                        'n_estimators': grid_n_estimator},
#                      cv = cv_split)
#
# G_XGB.fit(X_train, Y_train)
# print('Best Parameters: ', G_XGB.best_params_)
# pred = G_XGB.predict(X_test)

In [None]:
# Stack the three boosted-tree regressors; a plain linear regression combines
# their predictions.
estimators = [
    ('XGB', XGB),
    ('Cat', Cat),
    ('LGBM', LGBM),
]

# Optional custom CV splitter for the stack (disabled; the StackingRegressor
# below is built without an explicit `cv`):
# cv_split = ShuffleSplit(n_splits = 10,
#                         test_size = .3,
#                         train_size = .6,
#                         random_state = 0 )
stacked = StackingRegressor(
    estimators,
    LinearRegression(),
    verbose=4,
    # cv=cv_split,
)

# Fit on the training part of the hold-out split and predict the held-out part.
pred = stacked.fit(X_train, Y_train).predict(X_test)

#### Check out some metrics

In [None]:
# Report hold-out quality of the stacked model: R^2, mean absolute error and
# median absolute error, each computed on (Y_test, pred).
holdout_metrics = (
    ('r2 score: ', r2_score),
    ('mean_absolute_error: ', mean_absolute_error),
    ('median_absolute_error: ', median_absolute_error),
)
for label, metric_fn in holdout_metrics:
    print(label, metric_fn(Y_test, pred))

In [None]:
# Predict the target for the scaled competition test features.
# Note: this rebinds `pred`, which previously held the hold-out predictions
# for X_test; re-running the metrics cell after this one would compare Y_test
# against the wrong array.
pred = stacked.predict(Test_scaled)
# Sanity check before building the submission: shape of the id column, shape of
# the predictions, and the column names expected by the sample submission.
test['id'].values.shape, pred.shape, samp_sub.columns.to_list()

In [None]:
# Build the submission frame: the test ids next to the predictions, with the
# column names taken from the sample submission.
# Both pieces are aligned on their index when concatenated.
submission_columns = samp_sub.columns.to_list()
result = pd.concat([test['id'], pd.Series(pred)], axis=1)
result.columns = submission_columns

In [None]:
# Write the submission to the current working directory, without the index column.
result.to_csv("Simple_Caboost.csv", index=False)