In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings

#graphs and plots
import matplotlib.pyplot as plt
import seaborn as sns

#model building + optimization
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV
import time
import sys

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only Kaggle input directory and print the full
# path of every file found, so the available datasets are visible up front.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LGBM + Bayesian Optimization
My first run of this did not go to plan, so I am running it again after going over the code more closely. I still don't think this method is that good, so any comments and criticisms for improvement are welcome.


## Data Importing

In [None]:
# Load the three competition files, each indexed by its 'id' column:
# the labelled training set, the unlabelled test set and the sample submission.
train, test, sample = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)
# Preview the first rows of the test set.
test.head()

# LGBM Baseline

The baseline score with no optimization was 7.95860.
We will now use Bayesian optimization to try to improve on that in a short amount of time.
### Data preprocessing
Since we know from EDA (see <a href="https://www.kaggle.com/subinium/tps-aug-simple-eda">some awesome EDA not by me</a>) that there are no missing data points, and we are assuming there are no categorical variables (though quite a few features are non-float), the data preprocessing is quite simple.

In [None]:
# Separate the 'loss' target from the feature columns.
y = train['loss']
X = train.drop(columns=['loss'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head()

## Parameter Optimization
This method was inspired by: <a href="https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9">Hyperparameters Optimization for LightGBM, CatBoost and XGBoost Regressors using Bayesian Optimization.</a>

In [None]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=42, n_estimators = 10000, output_process=False):
    """Tune LightGBM regression hyperparameters with Bayesian optimization.

    Each candidate parameter set is scored by cross-validated RMSE via
    ``lgb.cv``. Because ``BayesianOptimization.maximize`` searches for the
    largest objective value while RMSE must be minimized, the objective
    returns the *negated* best mean RMSE.

    Parameters
    ----------
    X : features passed to ``lgb.Dataset``.
    y : regression target passed as the dataset label.
    init_round : number of random exploration points evaluated first.
    opt_round : number of Bayesian optimization steps run afterwards.
    n_folds : number of cross-validation folds.
    random_seed : seed used for the CV fold split and for the optimizer.
    n_estimators : accepted for backward compatibility; currently unused
        (the boosting-round cap is fixed at 5000 in the objective).
    output_process : accepted for backward compatibility; currently unused.

    Returns
    -------
    dict
        Best parameters found, as returned by the optimizer. ``num_leaves``
        and ``max_depth`` are raw floats; the objective rounds them to int
        before training, so callers should do the same.
    """
    # Keep the raw data so the same Dataset can be reused across CV calls.
    train_data = lgb.Dataset(X, label=y, free_raw_data=False)

    def lgb_eval(learning_rate, num_leaves, feature_fraction, bagging_fraction, max_depth):
        """Objective: negated best mean CV RMSE for one candidate (higher is better)."""
        params = {
            'application': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'num_iterations': 5000,
            'early_stopping_rounds': 500,
        }
        # Clamp fractions to [0, 1] and round the integer-valued parameters,
        # since the optimizer proposes continuous values.
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        # NOTE(review): LightGBM documents that bagging_fraction only takes
        # effect when bagging_freq > 0; no bagging_freq is set here, so this
        # parameter may currently have no influence -- confirm.
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))

        # Stratified folds are a classification concept; use plain K-fold
        # splitting for this regression target.
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval=200, metrics=['rmse'])
        # Lower RMSE is better, but the optimizer maximizes: negate the best
        # (smallest) mean RMSE so that maximizing it minimizes the error.
        return -min(cv_result['rmse-mean'])

    # Search space: (lower, upper) bounds for each tuned parameter.
    search_space = {
        'learning_rate': (0.01, 0.5),
        'num_leaves': (10, 200),
        'feature_fraction': (0.1, 1.0),
        'bagging_fraction': (0.1, 1.0),
        'max_depth': (1, 30),
    }
    lgbBO = BayesianOptimization(lgb_eval, search_space, random_state=random_seed)

    # init_points: number of random exploration steps; random exploration
    #              helps by diversifying the points the optimizer has seen.
    # n_iter:      number of Bayesian optimization steps; more steps make it
    #              more likely that a good optimum is found.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    return lgbBO.max['params']

# Short tuning run on the training split: 5 random exploration points
# followed by 10 Bayesian optimization steps, scored with 3-fold CV.
opt_params = bayes_parameter_opt_lgb(
    X_train,
    y_train,
    init_round=5,
    opt_round=10,
    n_folds=3,
    random_seed=42,
    n_estimators=10000,
)

### Optimal parameters

In [None]:
# We have the optimal parameters, so now we need to do a CV test with the optimal parameters
opt_params

In [None]:
# Fixed training settings: RMSE-scored GBDT regression, capped at 5000
# boosting rounds with early stopping after 500 rounds.
op = {
    'task': 'train',
    'application': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'num_iterations': 5000,
    'early_stopping_rounds': 500,
}

# Tuned hyperparameter values, hard-coded here as literals.
op.update({
    'bagging_fraction': 0.3253848734026177,
    'feature_fraction': 0.30860836505782663,
    'learning_rate': 0.08689261851358254,
    'max_depth': 2,
    'num_leaves': 179,
})


## Modeling
Using the optimal parameters detailed above, we train our LGB model, then use it to predict on the test set and create the submission.

In [None]:
# Wrap the training and validation splits as LightGBM datasets; the
# validation set is built with the training set as its reference.
trn_data = lgb.Dataset(data=X_train, label=y_train)
val_data = lgb.Dataset(data=X_val, label=y_val, reference=trn_data)

# Train with the parameters in `op`, evaluating on the validation set and
# logging every 50 rounds.
regressor = lgb.train(
    params=op,
    train_set=trn_data,
    valid_sets=val_data,
    verbose_eval=50,
)

In [None]:
# Predict the target for every row of the test set with the trained model.
y_pred = regressor.predict(test)

In [None]:
# Build the submission frame: one 'loss' prediction per test-set id.
sub = pd.DataFrame({'loss': y_pred}, index=test.index)
sub.head()

In [None]:
# Summary statistics of the predicted 'loss' values, as a quick sanity check.
sub.describe()

In [None]:
# Write the predictions to 'submission.csv'; the frame's index (the test ids) is written alongside 'loss'.
sub.to_csv('submission.csv')