### This notebook tests automated machine learning libraries
#####  1. Feature Engineering  : featuretools
#####  2. Hyperparameter Tuning : optuna
#####  3. Base algorithms : XGBRegressor
#####    ※ featuretools does not work in the Kaggle environment. To test this code, download it and run it on a local PC.
#####    : it fails with a woodwork-related error

### 1. Load libraries and set config

In [None]:
import pandas as pd
import numpy as np
import warnings
import joblib
warnings.filterwarnings(action='ignore')

import featuretools as ft

from featuretools.selection import (
    remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)


from sklearn.model_selection import GridSearchCV,KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error

from xgboost import XGBRegressor

import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
## Config
# Number of principal components kept by the PCA step (n_components).
PCA_N_COMP = 50
# Number of KFold splits used inside the Optuna objective (n_splits).
K_FOLD_SPLIT = 4
# Number of Optuna trials passed to study.optimize (n_trials).
STUDY_TRIAL = 30
# Value passed as pct_corr_threshold to remove_highly_correlated_features.
CORR_THRESHOLD = 0.85

### 2. Convert CSV -> Parquet format to reduce memory usage

In [None]:
# One-off conversion of the competition CSV into a local Parquet file.
df = pd.read_csv('../input/ubiquant-market-prediction/train.csv')
df.to_parquet('./train.parquet')
# The CSV-backed frame is not used again in this notebook; release it so it
# does not stay in memory next to the frame reloaded from ./train.parquet.
del df

In [None]:
%%time
# Reload the training data from the local Parquet file.
train_df = pd.read_parquet('./train.parquet')
# Trailing expression: the notebook displays the first rows as the cell output.
train_df.head()

In [None]:
%%time
# Feature matrix: every column whose name starts with the 'f_' prefix
# (a prefix test instead of an unanchored regex/substring match).
feature_mask = train_df.columns.str.startswith('f_')
# reset_index() returns a new frame carrying the row number in an 'index'
# column (used as the featuretools dataframe index later on). This avoids
# calling reset_index(inplace=True) on a frame derived from a selection.
train_x = train_df.loc[:, feature_mask].reset_index()
train_y = train_df['target']
# Only train_x / train_y are used from here on.
del train_df

### 3. Automated feature engineering using 'featuretools'

In [None]:
# Register the feature frame as the single dataframe of a featuretools
# EntitySet, keyed by its 'index' column.
es = ft.EntitySet(id='Ubiquant')
es.add_dataframe(
    dataframe_name='train_x',
    dataframe=train_x,
    index='index',
)
# Deep Feature Synthesis limited to depth 1, with the 'negate' transform
# primitive and no aggregation primitives.
fm, features = ft.dfs(
    entityset=es,
    target_dataframe_name='train_x',
    trans_primitives=['negate'],
    agg_primitives=[],
    max_depth=1,
)

##### Remove highly correlated features

In [None]:
# Filter the feature matrix with featuretools' correlation-based selector,
# passing CORR_THRESHOLD as pct_corr_threshold; the call returns the reduced
# matrix together with the matching feature definitions.
new_fm, new_features = remove_highly_correlated_features(fm, features=features, pct_corr_threshold=CORR_THRESHOLD)
# Trailing expression: displayed as the cell output.
new_fm.head()

In [None]:
# Persist the surviving column names; the submission cell reads this file
# back to select the same columns from the test data.
pd.DataFrame(new_fm.columns).to_csv('column.csv')
# Rebind instead of copying: new_fm is deleted right after, so a .copy() only
# doubled the peak memory needed for the whole feature matrix.
train_x = new_fm
del new_fm

In [None]:
# Fit a PCA with PCA_N_COMP components on the feature matrix, save the fitted
# transformer, then project the features onto the components.
pc = PCA(n_components=PCA_N_COMP).fit(train_x)
joblib.dump(pc, './pca_model.pkl')
train_x = pc.transform(train_x)

# Fit a RobustScaler on the PCA output, save it, and scale the components.
rc = RobustScaler().fit(train_x)
joblib.dump(rc, './robust_model.pkl')
train_x = pd.DataFrame(rc.transform(train_x))
train_x.head()

### 4. Hyperparameter tuning using 'optuna'

In [None]:
def objectiveXGB(trial, X, y):
    """Optuna objective: K-fold cross-validated RMSE of an XGBRegressor.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample hyperparameters and to drive the pruning callback.
    X : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Targets, aligned positionally with ``X``.

    Returns
    -------
    float
        Mean validation RMSE over the ``K_FOLD_SPLIT`` folds. The value is
        returned unrounded so that trials with close scores can still be
        ranked; rounding is applied to the printed output only.
    """
    params = {
        'n_estimators': 1000,
        # suggest_float samples a continuous value from the given range; it is
        # the replacement for the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.01),

        # suggest_categorical only proposes the listed values.
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),

        # suggest_int proposes integer values within the integer range.
        'max_depth': trial.suggest_int('max_depth', 3, 11),

        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),

        # log=True samples on a log scale, which suits ranges spanning several
        # orders of magnitude (replacement for the deprecated suggest_loguniform).
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100, log=True),
        'n_jobs': -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
    }

    split = KFold(n_splits=K_FOLD_SPLIT)
    train_scores = []
    test_scores = []
    for train_idx, val_idx in split.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # A fresh estimator per fold keeps the folds independent of each other.
        model = XGBRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                  eval_metric=['rmse'],
                  early_stopping_rounds=30, verbose=0,
                  # The pruning callback lets Optuna stop an unpromising trial
                  # during boosting, based on the validation RMSE.
                  # NOTE(review): every fold reports to the same trial starting
                  # from the first boosting round again - confirm how Optuna
                  # treats these repeated step numbers.
                  callbacks=[optuna.integration.XGBoostPruningCallback(trial, observation_key="validation_0-rmse")]
                  )

        # Keep full precision here; rounding happens only when printing.
        train_scores.append(np.sqrt(mean_squared_error(y_tr, model.predict(X_tr))))
        test_scores.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

    print(f'train score : {[np.round(score, 4) for score in train_scores]}')
    print(f'test score : {[np.round(score, 4) for score in test_scores]}')
    train_score = np.mean(train_scores)
    test_score = np.mean(test_scores)

    print(f'TRAIN RMSE : {np.round(train_score, 4)} || TEST RMSE : {np.round(test_score, 4)}')

    # The returned value is what the study optimizes (here: minimizes).
    return test_score

In [None]:
# direction: whether the objective score should be maximized or minimized.
study = optuna.create_study(direction='minimize')

# n_trials: number of trials to run (when omitted, optimization repeats
# indefinitely until a keyboard interrupt).
study.optimize(
    lambda t: objectiveXGB(t, train_x, train_y),
    n_trials=STUDY_TRIAL,
)
print(
    'Best trial: score {},\nparams {}'.format(
        study.best_trial.value,
        study.best_trial.params,
    )
)

In [None]:
trial = study.best_trial
trial_params = trial.params
print(f'Best Trial : score {trial.value}, \nparams {trial_params}')
# trial.params only holds the values sampled by Optuna. The fixed settings
# used in objectiveXGB must be repeated here: without n_estimators=1000 the
# final model falls back to XGBoost's default number of trees and is trained
# with a different configuration than the one that was tuned.
XGBR_model = XGBRegressor(
    **trial_params,
    n_estimators=1000,
    n_jobs=-1,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)
XGBR_model.fit(train_x, train_y)
joblib.dump(XGBR_model, './XGBR_model.pkl')

### 5. Load the saved pkl files and make the submission

In [None]:
# Reload the saved artifacts from the working directory: the selected column
# list, the PCA, the RobustScaler and the XGBoost model.
col = pd.read_csv('./column.csv')
pc = joblib.load('./pca_model.pkl')
rc = joblib.load('./robust_model.pkl')
xgbr_model = joblib.load('./XGBR_model.pkl')

In [None]:
# The competition API module is used below but is never imported anywhere in
# this notebook; import it here so the cell does not fail with NameError.
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()

# The feature names are read from the second column (position 1) of the
# loaded column file. Resolve the list once instead of on every iteration.
feature_cols = col.iloc[:, 1].tolist()

for (test_df, sample_prediction_df) in iter_test:
    # Select the stored feature columns, then apply PCA and RobustScaler.
    test_x = test_df.loc[:, feature_cols]
    test_x = pc.transform(test_x)
    test_x = rc.transform(test_x)
    test_x = pd.DataFrame(data=test_x)
    sample_prediction_df['target'] = xgbr_model.predict(test_x)
    env.predict(sample_prediction_df)