<a href="https://colab.research.google.com/github/shin0105/4YP/blob/master/experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install GPyOpt
!pip install GPy

In [2]:
#import packages

#basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

#Bayesian optimization packages
import GPy
import GPyOpt

#ML Algorithm packages
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import preprocessing

In [3]:
#dataset = UCI Car Evaluation
names=['buying','maint','doors','persons','lug_boot','safety','class']
df = pd.read_csv('car.data', names = names)
df.head()

#Convert categorical labels to numbers
le = preprocessing.LabelEncoder()

for col in names: 
  df[col] = le.fit_transform(df[col])

In [4]:
X = df.drop(['class'], axis=1)
y = df['class']

#splitting train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [5]:
#K-fold validation
def kfold_score(clf,X,y,n_fold):
    X,y = X.values,y.values
    kfold = KFold(n_splits=n_fold, shuffle=True, random_state=1)
    accuracy_list = []

    for train_index, test_index in kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        preds = clf.predict(x_test_fold)
        accuracy_test = accuracy_score(preds,y_test_fold)
        accuracy_list.append(accuracy_test)
    return np.array(accuracy_list).mean()

In [53]:
#Random Forest classification

#max_features: 1 to around sqrt(# of features of dataset)
bounds = [
        {'name': 'max_features', 'type': 'continuous', 'domain': (1, 5)},
        {'name': 'n_estimators', 'type': 'continuous', 'domain': (1, 200)}
      ]

space = GPyOpt.Design_space(space = bounds) 

rf1_max_features=[]
rf1_n_estimators=[]
rf1_score=[]

def rf_score(parameters):
  params = {
        'max_features':int(parameters[0][0]),
        'n_estimators':int(parameters[0][1])
    }
  
  clf = RandomForestClassifier(max_features=params['max_features'],n_estimators=params['n_estimators'])
  clf.fit(X_train,y_train)
  score = kfold_score(clf,X_train, y_train,10)

  #storing results
  rf1_max_features.append(params['max_features'])
  rf1_n_estimators.append(params['n_estimators'])
  rf1_score.append(score)
  return score

In [54]:
#RF Classification - Matern52 kernel
kernel = GPy.kern.Matern52(input_dim=1, variance=1.0, lengthscale=1.0)
optimizer = GPyOpt.methods.BayesianOptimization(f=rf_score, 
                                                domain=bounds,
                                                acquisition_type ='EI',
                                                initial_design_numdata = 5,
                                                model_type='GP',
                                                maximize=True,
                                                kernel=kernel
                                                )

optimizer.run_optimization(max_iter=20)

#maximum accuracy score
acc_max=optimizer.fx_opt
#best parameters
params_best=optimizer.x_opt
print(acc_max)
print(params_best)

-0.9834710743801652
[ 5.         97.36508418]


In [60]:
bounds1= space.get_bounds()
X1 = np.linspace(bounds1[0][0], bounds1[0][1], 200)
X2 = np.linspace(bounds1[1][0], bounds1[1][1], 200)
x1, x2 = np.meshgrid(X1, X2)
X = np.hstack((x1.reshape(200*200,1),x2.reshape(200*200,1)))

In [64]:
m, v = optimizer.model.predict(X)

AttributeError: ignored

In [83]:
X=(optimizer.X)
print(X)
m, v = optimizer.model.predict(X)

[[  1.66181612  20.62368551]
 [  1.99669116   5.80426655]
 [  4.88728102  16.36337444]
 [  2.70349033 118.02055098]
 [  4.02390616 137.95201187]
 [  4.3990618  126.33000533]
 [  3.15829129   7.3861127 ]
 [  4.62087726  19.5407243 ]
 [  4.23941994  39.16014738]
 [  3.76936496 135.32705523]
 [  1.          96.77453376]
 [  4.40058699  16.553209  ]
 [  3.94426345  43.69158861]
 [  5.         178.20595912]
 [  5.          90.65520039]
 [  5.         157.39729367]
 [  5.          56.92101166]
 [  5.          97.36508418]
 [  5.         157.91517826]
 [  5.          80.39790375]
 [  5.          59.69559484]
 [  5.         176.61290655]
 [  5.          79.76731875]
 [  5.          54.69531692]
 [  5.          20.217961  ]]


In [88]:
optimizer.acquisition._compute_acq(X)

array([[5.18401043e-35],
       [4.31483135e-70],
       [3.28102407e-02],
       [3.33732723e-03],
       [2.39204694e-02],
       [2.40615063e-02],
       [1.63459459e-09],
       [9.25354382e-03],
       [2.79617058e-02],
       [2.40809274e-02],
       [9.35395628e-18],
       [2.39831059e-02],
       [2.17587350e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02],
       [6.39811494e-02]])

In [None]:
mll=-0.5*(optimizer.Y - m).T*()

In [95]:
X1=X[0]
X1 = X1.reshape(len(X1),1)
print(X1)

[[ 1.66181612]
 [20.62368551]]


In [98]:
kernel.K(X)

array([[2.04177090e+00, 1.52444227e+00, 1.76378284e-04, 2.92027685e-01,
        3.98841285e-03, 1.04810433e-03, 7.36907527e-02, 4.68884332e-04,
        1.85833095e-03, 9.67204873e-03, 7.91028095e-01, 1.04235798e-03,
        5.27291469e-03, 1.16226029e-04, 1.16226029e-04, 1.16226029e-04,
        1.16226029e-04, 1.16226029e-04, 1.16226029e-04, 1.16226029e-04,
        1.16226029e-04, 1.16226029e-04, 1.16226029e-04, 1.16226029e-04,
        1.16226029e-04],
       [1.52444227e+00, 2.04177090e+00, 6.01641378e-04, 7.10324236e-01,
        1.27387410e-02, 3.46093179e-03, 2.06250900e-01, 1.57325713e-03,
        6.05663192e-03, 3.00180918e-02, 3.31564521e-01, 3.44236355e-03,
        1.67012803e-02, 3.98934599e-04, 3.98934599e-04, 3.98934599e-04,
        3.98934599e-04, 3.98934599e-04, 3.98934599e-04, 3.98934599e-04,
        3.98934599e-04, 3.98934599e-04, 3.98934599e-04, 3.98934599e-04,
        3.98934599e-04],
       [1.76378284e-04, 6.01641378e-04, 2.04177090e+00, 7.43297665e-03,
        4.7694

In [124]:
log_mll = -0.5*np.matmul(np.matmul((optimizer.Y - m).T,np.linalg.inv(kernel.K(X)+np.identity(25))),(optimizer.Y - m)) \
          -0.5 * np.log(np.linalg.det(kernel.K(X)+np.identity(25))) - np.pi * 0.5 * np.log(2*np.pi)