# Read

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')

# Scailing
scaler=MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity']=scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity']=scaler.transform(test[['fixed acidity']])

#Encoding
encoder=OneHotEncoder()
encoder.fit(train[['type']])
onehot=encoder.transform(train[['type']])
onehot=onehot.toarray()
onehot=pd.DataFrame(onehot)
onehot.columns=encoder.get_feature_names()
train=pd.concat([train,onehot],axis=1)
train=train.drop(columns=['type'])

onehot=encoder.transform(test[['type']])
onehot=onehot.toarray()
onehot=pd.DataFrame(onehot)
onehot.columns=encoder.get_feature_names()
test=pd.concat([test,onehot],axis=1)
test=test.drop(columns=['type'])

test.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Scaled fixed acidity,x0_red,x0_white
0,0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,0.429752,0.0,1.0
1,1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,0.785124,1.0,0.0
2,2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,0.22314,0.0,1.0
3,3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,0.280992,0.0,1.0
4,4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,0.247934,0.0,1.0


# Bayesian Optimization

In [7]:
from bayes_opt import BayesianOptimization

In [8]:
X=train.drop(columns=['index','quality'])
y=train['quality']

In [9]:
rf_parameter_bounds={
    'max_depth':(1,3),
    'n_estimators':(30,100),
    }

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def rf_bo(max_depth,n_estimators):
    rf_params={
              'max_depth':int(round(max_depth)),
              'n_estimators':int(round(n_estimators)),
              }
    rf=RandomForestClassifier(**rf_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2, )
    
    rf.fit(X_train,y_train)
    score=accuracy_score(y_valid,rf.predict(X_valid))
    return score

In [18]:
B0_rf=BayesianOptimization(f=rf_bo,pbounds=rf_parameter_bounds,random_state=0)

In [19]:
B0_rf.maximize(init_points=5,n_iter=5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.5336  [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [0m 2       [0m | [0m 0.4909  [0m | [0m 2.206   [0m | [0m 68.14   [0m |
| [0m 3       [0m | [0m 0.5255  [0m | [0m 1.847   [0m | [0m 75.21   [0m |
| [0m 4       [0m | [0m 0.5136  [0m | [0m 1.875   [0m | [0m 92.42   [0m |
| [95m 5       [0m | [95m 0.5345  [0m | [95m 2.927   [0m | [95m 56.84   [0m |
| [0m 6       [0m | [0m 0.5218  [0m | [0m 2.851   [0m | [0m 52.82   [0m |
| [0m 7       [0m | [0m 0.4591  [0m | [0m 1.012   [0m | [0m 84.05   [0m |
| [0m 8       [0m | [0m 0.5264  [0m | [0m 2.663   [0m | [0m 74.79   [0m |
| [95m 9       [0m | [95m 0.5555  [0m | [95m 3.0     [0m | [95m 77.88   [0m |
| [0m 10      [0m | [0m 0.4591  [0m | [0m 1.0     [0m | [0m 60.52   [0m |


In [20]:
max_params=B0_rf.max['params']

max_params['max_depth']=int(max_params['max_depth'])
max_params['n_estimators']=int(max_params['n_estimators'])
print(max_params)

{'max_depth': 3, 'n_estimators': 77}


In [21]:
B0_tuned_rf=RandomForestClassifier(**max_params)