In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn import preprocessing
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import time
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

**Installing hpsklearn**

In [0]:
pip install gitpython

In [0]:
import os
from git.repo.base import Repo

# Clone hyperopt-sklearn into a local working copy (path is relative to the
# current working directory).
# The clone is skipped when a checkout is already present, so restarting the
# kernel and re-running the notebook does not fail on an existing target directory.
# NOTE(review): the default branch is cloned unpinned; the hpsklearn imports used
# later depend on whatever API that revision exposes — consider pinning a commit.
hpsklearn_repo_url = "https://github.com/hyperopt/hyperopt-sklearn"
hpsklearn_clone_dir = "hyperopt/hyperopt-sklearn.git"
if not os.path.isdir(os.path.join(hpsklearn_clone_dir, ".git")):
    Repo.clone_from(hpsklearn_repo_url, hpsklearn_clone_dir)

In [0]:
cd hyperopt/hyperopt-sklearn.git/

In [0]:
pip install -e .

In [0]:
from hpsklearn import HyperoptEstimator, svc_rbf, normalizer, standard_scaler,any_preprocessing, random_forest, extra_trees, xgboost_classification
import hpsklearn

In [0]:
# Emit pandas' version report (pandas, its dependencies and the platform) into
# the cell output.
pd.show_versions()

**Importing Data and Pre-Processing**

In [0]:
# URL of the CSV to load: a raw file served from GitHub's `master` branch.
data_git = 'https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/Base%20Learner%20Optimization/Base%20Learners%20Optimization.csv'
# Read the CSV straight from the URL into a DataFrame.
sdss_1 = pd.read_csv(data_git)

In [0]:
# Work on an independent copy: plain assignment would only alias sdss_1, so any
# in-place change made to sdss_opt would silently alter the loaded frame too.
sdss_opt = sdss_1.copy()

In [0]:
# Columns removed before modelling.
unwanted_columns = ['camcol','run','rerun','objid','specobjid']
# Derive the trimmed frame from the loaded data instead of dropping in place:
# the cell can then be re-run without a KeyError for already-removed columns,
# and sdss_1 keeps every original column.
sdss_opt = sdss_1.drop(columns=unwanted_columns)
sdss_opt.head(10)

In [0]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Label encoder for the target column.
enc = LabelEncoder()
# Feature matrix: the twelve columns used as model inputs.
X_opt = sdss_opt.loc[:, [
    'ra', 'dec',
    'u', 'g', 'r', 'i', 'z',
    'redshift', 'plate', 'mjd', 'fiberid', 'field',
]]
# Target vector: the 'class' column encoded by `enc`.
y_opt = enc.fit_transform(sdss_opt.loc[:, 'class'])
X_opt.head()

In [0]:
# Yeo-Johnson power transform with standardisation of the transformed output.
transform_opt = PowerTransformer(
    method='yeo-johnson',
    standardize=True,
)

In [0]:
# Scale each sample (row) of X_opt to unit L2 norm; the result replaces X_opt.
X_opt = preprocessing.normalize(X_opt, norm='l2', axis=1)

In [0]:
# Fit transform_opt on the current X_opt and replace X_opt with the transformed values.
# NOTE(review): this rebinds X_opt, so running the cell a second time transforms
# data that has already been transformed.
# NOTE(review): the transform is fitted before the train/validation split further
# down, so validation rows contribute to the fitted parameters — confirm this is intended.
X_opt = transform_opt.fit_transform(X_opt)

In [0]:
# Random under-sampling of (X_opt, y_opt): fixed seed, sampling without replacement.
under_sampler_opt = RandomUnderSampler(
    replacement=False,
    random_state=18120199,
)
X_undersam, y_undersam = under_sampler_opt.fit_resample(X_opt, y_opt)

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the under-sampled data as a validation set (fixed seed).
X_train, X_val, Y_train, Y_val = train_test_split(
    X_undersam,
    y_undersam,
    test_size=0.2,
    random_state=18120199,
)

**SVM Optimization**

In [0]:
#Defining hyper-parameter space for SVM
# hyperopt signatures used here:
#   hp.uniform(label, low, high)
#   hp.quniform(label, low, high, q)  -> round(uniform(low, high) / q) * q
#   hp.choice(label, options)
opt_svm_para = {
    # Uniform draw between 0 and 10.
    'C': hp.uniform('C', 0, 10),
    # Draws on a 0.2-wide grid between 0.0 and 1.0.
    # NOTE(review): 0.0 is a possible draw; confirm the consumer accepts gamma == 0.
    'gamma': hp.quniform('gamma', 0.0, 1.0, 0.2),
    'decision_function_shape': hp.choice('decision_function_shape', ['ovr', 'ovo']),
}


In [0]:
#Defining the model to be tuned
# NOTE(review): svc_rbf receives the *string* 'opt_svm_para', not the
# opt_svm_para dict, so that dict is never referenced by this estimator.
# Presumably the string is only used as a label; if the custom search space is
# meant to drive the tuning it must be passed to the component explicitly —
# confirm against the hpsklearn API.
svm_opt_min = HyperoptEstimator(classifier=svc_rbf('opt_svm_para'),
                          preprocessing=[],  # empty list of preprocessing steps
                          algo=tpe.suggest,  # TPE suggestion algorithm from hyperopt
                          max_evals=30
                          )

In [0]:
#Running bayesian optimization for tuning SVM
# time.time() is sampled immediately before and after fit(), bracketing the
# search on the training split.
start_svm_opt=time.time()
svm_opt_min.fit(X_train,Y_train)
end_svm_opt=time.time()

In [0]:
#Model performance with the best performing hyper-parameters
# Evaluated on the validation split (X_val, Y_val), which was not passed to fit().
print(svm_opt_min.score(X_val, Y_val))

In [0]:
#Best performing hyper-parameters
# Prints whatever best_model() returns for the fitted SVM estimator.
print(svm_opt_min.best_model())

In [0]:
#Time taken by SVM
# Elapsed seconds between the two time.time() stamps taken around the SVM fit.
# The stamps are named start_svm_opt / end_svm_opt; the previous names
# (start_svm / end_svm) were never defined and raised a NameError.
svm_time = end_svm_opt - start_svm_opt
print(svm_time)

**QDA Optimization**

In [0]:
from hpsklearn import quadratic_discriminant_analysis

In [0]:
#Defining hyper-parameter space for QDA
# Both entries are log-normal draws, hp.lognormal(label, mu, sigma).
# NOTE(review): a log-normal draw is unbounded above; confirm the estimator
# accepts reg_param values greater than 1 and tol values of this magnitude.
qda_optim_para = dict(
    tol=hp.lognormal('tol', 0, 1),
    reg_param=hp.lognormal('reg_param', 0.0, 1),
)

In [0]:
#Defining the model to be tuned
# NOTE(review): the component receives the *string* 'qda_optim_para', not the
# qda_optim_para dict, so that dict is never referenced by this estimator.
# Presumably the string is only used as a label — confirm against the hpsklearn API.
qda_opt_min = HyperoptEstimator(classifier=quadratic_discriminant_analysis('qda_optim_para'),
                          preprocessing=[],  # empty list of preprocessing steps
                          algo=tpe.suggest,  # TPE suggestion algorithm from hyperopt
                          max_evals=30)

In [0]:
#Tuning QDA
# time.time() is sampled immediately before and after fit(), bracketing the
# search on the training split.
start_qda=time.time()
qda_opt_min.fit(X_train,Y_train)
end_qda=time.time()

In [0]:
#Model performance with the best performing hyper-parameters
# Evaluated on the validation split (X_val, Y_val), which was not passed to fit().
print(qda_opt_min.score(X_val, Y_val))

In [0]:
#Best performing hyper-parameters
# Prints whatever best_model() returns for the fitted QDA estimator.
print(qda_opt_min.best_model())

In [0]:
#Time taken by QDA
# Elapsed seconds between the two time.time() stamps taken around the QDA fit.
qda_time=end_qda-start_qda
print(qda_time)

**Random Forest Optimization**

In [0]:
#Defining search space for random forests
# Every entry uses hp.choice(label, options): one value is picked from the options.
randomforest_optim_para = {
    # Options start at 2: scikit-learn rejects an integer min_samples_split below 2.
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'max_depth': hp.choice('max_depth', range(1, 200)),
    'max_features': hp.choice('max_features', ["auto", "sqrt", "log2"]),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'n_estimators': hp.choice('n_estimators', range(1, 1000)),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
}

In [0]:
#Defining the model to be tuned
# NOTE(review): the component receives the *string* 'randomforest_optim_para',
# not the randomforest_optim_para dict, so that dict is never referenced by this
# estimator. Presumably the string is only used as a label — confirm against the
# hpsklearn API.
rf_opt_min = HyperoptEstimator(classifier=random_forest('randomforest_optim_para'),
                          preprocessing=[],  # empty list of preprocessing steps
                          algo=tpe.suggest,  # TPE suggestion algorithm from hyperopt
                          max_evals=30)

In [0]:
#Tuning random forests
# time.time() is sampled immediately before and after fit(), bracketing the
# search on the training split.
rf_start=time.time()
rf_opt_min.fit(X_train,Y_train)
rf_end=time.time()

In [0]:
#Model performance with the best performing hyper-parameters
# Evaluated on the validation split (X_val, Y_val), which was not passed to fit().
print(rf_opt_min.score(X_val, Y_val))

In [0]:
#Best performing hyper-parameters
# Prints whatever best_model() returns for the fitted random-forest estimator.
print(rf_opt_min.best_model())

In [0]:
# Elapsed seconds between the two time.time() stamps taken around the random-forest fit.
time_rf=rf_end-rf_start
print(time_rf)

**Extra Trees Optimization**

In [0]:
#Extra trees search space
# hp.choice(label, options) picks one value from the listed options.
extratree_optim_para = {
    # Options start at 2: scikit-learn rejects an integer min_samples_split below 2.
    'min_samples_split': hp.choice('min_samples_split', range(2, 15)),
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'max_features': hp.choice('max_features', ["auto", "sqrt", "log2"]),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    # hp.uniform takes (label, low, high) and yields floats, so it cannot be
    # given a range; hp.choice over the range keeps n_estimators an integer
    # on the intended 100, 150, ..., 950 grid.
    'n_estimators': hp.choice('n_estimators', range(100, 1000, 50)),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    # Fixed value, not searched.
    'bootstrap': True,
}

In [0]:
#Defining the model to be tuned
# NOTE(review): the component receives the *string* 'extratree_optim_para', not
# the extratree_optim_para dict, so that dict is never referenced by this
# estimator. Presumably the string is only used as a label — confirm against the
# hpsklearn API.
et_opt_min = HyperoptEstimator(classifier=extra_trees('extratree_optim_para'),
                          preprocessing=[],  # empty list of preprocessing steps
                          algo=tpe.suggest,  # TPE suggestion algorithm from hyperopt
                          max_evals=30)

In [0]:
#Tuning Extra trees
# time.time() is sampled immediately before and after fit(), bracketing the
# search on the training split.
et_start=time.time()
et_opt_min.fit(X_train,Y_train)
et_end=time.time()

In [0]:
#Model performance with the best performing hyper-parameters
# Evaluated on the validation split (X_val, Y_val), which was not passed to fit().
print(et_opt_min.score(X_val, Y_val))

In [0]:
#Best performing hyper-parameters
# Prints whatever best_model() returns for the fitted extra-trees estimator.
print(et_opt_min.best_model())

In [0]:
# Elapsed seconds between the two time.time() stamps taken around the extra-trees fit.
time_et=et_end-et_start
print(time_et)

**XGBoost Optimization**

In [0]:
#Defining search space for XGBoost
import numpy as np
# hyperopt signatures used here:
#   hp.choice(label, options)
#   hp.uniform(label, low, high)
#   hp.quniform(label, low, high, q)  -> round(uniform(low, high) / q) * q
xgb_optim_para = {
    # NOTE(review): the options start at 0, i.e. zero boosting rounds is a
    # possible draw — confirm this is intended.
    'n_estimators': hp.choice('n_estimators', np.arange(0, 500, dtype=int)),
    # 0.05-wide grid between 0.05 and 0.4 (quniform needs low, high and q as
    # separate numbers, not an array).
    'learning_rate': hp.quniform('learning_rate', 0.05, 0.4, 0.05),
    'max_depth': hp.choice('max_depth', np.arange(1, 15)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 10, 2)),
    'subsample': hp.choice('subsample', np.arange(0.2, 0.9, 0.05)),
    # Rounded to multiples of 0.005.
    'gamma': hp.quniform('gamma', 0.001, 0.01, 0.005),
    # Rounded to multiples of 0.07.
    'colsample_bytree': hp.quniform('colsample_bytree', 0.3, 0.9, 0.07),
    # No step size was given for this parameter, so it is drawn with
    # hp.uniform; hp.quniform cannot be called without q.
    'reg_alpha': hp.uniform('reg_alpha', 0.001, 0.1),
}

In [0]:
#Defining the model to be tuned
# NOTE(review): the component receives the *string* 'xgb_optim_para', not the
# xgb_optim_para dict, so that dict is never referenced by this estimator.
# Presumably the string is only used as a label — confirm against the hpsklearn API.
xgb_opt_min = HyperoptEstimator(classifier=xgboost_classification('xgb_optim_para'),
                          preprocessing=[],  # empty list of preprocessing steps
                          algo=tpe.suggest,  # TPE suggestion algorithm from hyperopt
                          max_evals=20)

In [0]:
#Fitting the Bayesian optimization model defined above
# time.time() is sampled immediately before and after fit(), bracketing the
# search on the training split.
xgb_start=time.time()
xgb_opt_min.fit(X_train,Y_train)
xgb_end=time.time()

In [0]:
#Model performance with the best performing hyper-parameters
# Evaluated on the validation split (X_val, Y_val), which was not passed to fit().
print(xgb_opt_min.score(X_val, Y_val))

In [0]:
#Best performing hyper-parameters
# Prints whatever best_model() returns for the fitted XGBoost estimator.
print(xgb_opt_min.best_model())

In [0]:
# Elapsed seconds between the two time.time() stamps taken around the XGBoost fit.
time_xgb=xgb_end-xgb_start
print(time_xgb)