# Import packages and data sets

In [None]:
!pip install lightgbm
!pip install bayesian-optimization

In [None]:
import glob
import pandas as pd
#from math import cos, sin
import numpy as np
import lightgbm as lgbm
import seaborn as sns
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
import os
import sys


# Data locations. The defaults are the original absolute paths; set the
# THESIS_RAW_DIR / THESIS_DATA_DIR environment variables to run the notebook
# on another machine without editing this cell.
# os.path.join(path, "") guarantees a trailing separator, which matters because
# later cells build file names by plain concatenation (e.g. data_dir+'df_train').
raw_dir = os.path.join(
    os.environ.get(
        "THESIS_RAW_DIR",
        "/home/ec2-user/pwp-summer-2019/master_thesis_nhh_2019/raw_data/",
    ),
    "",
)
data_dir = os.path.join(
    os.environ.get(
        "THESIS_DATA_DIR",
        "/home/ec2-user/pwp-summer-2019/master_thesis_nhh_2019/processed_data/",
    ),
    "",
)

# Let pandas display up to 999 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 999)

In [None]:
from Functions import (feature_engineering)

In [None]:
import joblib

# Load the train / validation / test frames that were pickled into data_dir.
df_train = pd.read_pickle(data_dir+'df_train')
df_val = pd.read_pickle(data_dir+'df_val')
df_test = pd.read_pickle(data_dir+'df_test')

formation_dictionary = joblib.load(data_dir+'formation_dictionary.pkl')

# Combined train+validation frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and likewise keeps each row's index.
df_train_val = pd.concat([df_train, df_val])

# Feature engineering and outlier removal

In [None]:
# Keyword arguments forwarded to feature_engineering(...).
# 'outlier_values' holds the 99.95th percentile of each listed column, computed
# over the combined train+validation frame.
# NOTE(review): presumably these are the cut-offs used by remove_outliers();
# confirm against Functions.py.
params_features = dict(
    outlier_values={
        log_name: df_train_val[log_name].quantile(0.9995)
        for log_name in ('gr', 'rmed', 'rdep')
    },
    above_below_variables=['gr', 'rdep', 'rmed'],  # ,'dt','nphi','rhob'],
    y_variable='formation_2',
    num_shifts=1,
    cols_to_remove=[
        'depth', 'dts', 'hgr', 'hnphi',
        'hrdep', 'hrhob', 'hrmed', 'hrsh', 'rsh', 'field', 'main_area', 'md',
    ],
    thresh=7,
    var1_ratio='gr',
)

### For home-made stratified split

In [None]:
def _apply_feature_pipeline(frame):
    """Run the four feature_engineering steps on `frame`, in order.

    Returns the feature_engineering object so callers can read its `.df`.
    """
    engineer = feature_engineering(frame, **params_features)
    engineer.remove_outliers()
    engineer.above_below()
    engineer.cleaning()
    engineer.xyz()
    return engineer


# Training frame: process it and remember the resulting column layout.
train_class = _apply_feature_pipeline(df_train)
df_train = train_class.df
columns_class = df_train.columns

# Validation frame: same steps, then select the training columns in the
# training order so both frames share one layout.
val_class = _apply_feature_pipeline(df_val)
df_val = val_class.df[columns_class]

### For sklearn (randomized) stratified split

In [None]:
# Apply the same feature-engineering steps to the combined train+validation
# frame; the result feeds the sklearn-based split further down.
df_class = feature_engineering(df_train_val, **params_features)

for step_name in ('remove_outliers', 'above_below', 'cleaning', 'xyz'):
    # Each step is a no-argument method that is called for its effect on df_class.
    getattr(df_class, step_name)()

df = df_class.df

# Split into train_valid/test

### For home-made stratified split

In [None]:
# Columns excluded from the feature matrix (this includes the target, 'formation_2').
col = ['formation', 'title', 'formation_2', 'group']

# Features / target for the training part of the home-made split.
X_train, Y_train = df_train.drop(columns=col), df_train['formation_2']

# Features / target for the validation part of the home-made split.
X_valid, Y_valid = df_val.drop(columns=col), df_val['formation_2']

# Names of the columns that remain as model inputs.
features_list = X_train.columns

In [None]:
# Stack training and validation rows into one set.
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is the
# supported equivalent and preserves the original index of every row.
X_train_valid = pd.concat([X_train, X_valid])
Y_train_valid = pd.concat([Y_train, Y_valid])

### For sklearn (randomized) stratified split

In [None]:
# Columns excluded from the feature matrix (this includes the target, 'formation_2').
col = ['formation', 'title', 'formation_2', 'group']  # ,'depth','group'

# Feature matrix and target taken from the combined, feature-engineered frame.
X, y = df.drop(columns=col), df['formation_2']

In [None]:
# Hold out 10% of the rows as a test set, stratified on the target so the class
# proportions are kept; the fixed seed makes the split repeatable.
X_train_and_valid, X_test, Y_train_and_valid, Y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

In [None]:
# Split the remaining rows again: 33% validation, the rest training, stratified
# on the target.
# NOTE: this rebinds X_train / X_valid / Y_train / Y_valid, replacing the values
# produced by the home-made split cell earlier in the notebook.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_and_valid,
    Y_train_and_valid,
    test_size=0.33,
    random_state=42,
    stratify=Y_train_and_valid,
)


# Bayesian hyperparameter optimization

In [None]:
import pandas as pd;
import numpy as np;
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

def lgb_evaluate(
                learningRate,
                nEstimators,
                maxDepth,
                numLeaves,
                featureFraction,
                minDataInLeaf,
                train_x=None,
                train_y=None,
                ):
    """Score one LightGBM hyperparameter setting with 5-fold cross-validation.

    The first six arguments arrive as floats from the optimizer; the
    integer-valued ones are truncated with int() before being passed on.

    Parameters
    ----------
    learningRate, nEstimators, maxDepth, numLeaves, featureFraction, minDataInLeaf
        Hyperparameter values proposed by the optimizer.
    train_x, train_y : optional
        Features and target to cross-validate on. When omitted, the
        notebook-level X_train_valid / Y_train_valid are used, so existing
        calls that pass only the six hyperparameters keep working.

    Returns
    -------
    float
        Mean micro-averaged F1 over the 5 folds (the value to maximize).
    """
    if train_x is None:
        train_x = X_train_valid
    if train_y is None:
        train_y = Y_train_valid

    # eval_metric is an argument of fit(), not of the constructor, so it is not
    # passed here; the training metric is already set through `metric`.
    clf = lgb.LGBMClassifier(
        n_estimators=int(nEstimators),
        num_leaves=int(numLeaves),
        max_depth=int(maxDepth),
        verbose=-1,
        learning_rate=float(learningRate),
        feature_fraction=float(featureFraction),
        min_data_in_leaf=int(minDataInLeaf),
        objective='multiclass',
        metric='multi_logloss')

    scores = cross_val_score(clf, train_x, train_y, cv=5, scoring='f1_micro')
    mean_score = np.mean(scores)
    print(mean_score)

    return mean_score


def bayesOpt(train_x, train_y, init_points=4, n_iter=8, random_state=None):
    """Run Bayesian optimization of the LightGBM hyperparameters.

    Parameters
    ----------
    train_x, train_y
        Features and target that every candidate is cross-validated on.
        (Previously these arguments were accepted but never used.)
    init_points : int, default 4
        Number of random exploration steps before the guided search.
    n_iter : int, default 8
        Number of guided optimization steps.
    random_state : int or None, default None
        Seed forwarded to BayesianOptimization; set it for repeatable runs.

    Returns
    -------
    BayesianOptimization
        The optimizer after maximize() has run.
    """
    def objective(**hyperparams):
        # Bind the data passed to bayesOpt so the search does not depend on
        # whatever globals happen to exist in the kernel.
        return lgb_evaluate(train_x=train_x, train_y=train_y, **hyperparams)

    lgbBO = BayesianOptimization(
        objective,
        {
            'learningRate': (.05, .5),
            'nEstimators': (10, 150),
            'numLeaves': (5, 250),
            'maxDepth': (2, 90),
            'featureFraction': (.50, 1),
            'minDataInLeaf': (100, 1000),
        },
        random_state=random_state,
    )

    lgbBO.maximize(init_points=init_points, n_iter=n_iter)

    return lgbBO


# The original call passed X_train / Y_train, but the objective actually scored
# on X_train_valid / Y_train_valid. Passing those frames explicitly keeps the
# evaluated data unchanged while making the data flow visible.
# random_state=42 matches the seed used for the sklearn splits in this notebook.
bayes_result = bayesOpt(X_train_valid, Y_train_valid, random_state=42)

In [None]:
# One row per evaluated point, with columns 'target' and 'params' (a dict).
results = pd.DataFrame(bayes_result.res)

# Expand the params dicts so every hyperparameter gets its own, correctly named
# column. Previously a column called 'learningRate' was filled with the
# 'maxDepth' values, which mislabeled the plot axis.
results = results.join(pd.DataFrame(results['params'].tolist(), index=results.index))

# Hyperparameter to plot against the cross-validated score; this is the same
# quantity the original scatter plot showed.
param_name = 'maxDepth'
results.plot.scatter(x=param_name, y='target')

# Best point found so far.
bayes_result.max


In [None]:
# Replace the nEstimators bounds with (30, 100), then continue optimizing with
# the existing optimizer object: 15 more random points followed by 30 guided
# iterations.
narrowed_bounds = {"nEstimators": (30, 100)}
bayes_result.set_bounds(new_bounds=narrowed_bounds)

bayes_result.maximize(init_points=15, n_iter=30)