## Importing the relevant packages

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import pandas as pd
from sklearn.svm import SVC,SVR

# Project-local wildcard imports (kept as-is; later cells rely on names they export).
from MFTreeSearchCV.MFTreeSearchCV import *
from mf.mf_func import *

# Explicit imports placed after the wildcard imports so that these bindings
# take precedence over any same-named wildcard exports.
import time

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Just listing the contents of the main code directory

In [2]:
# List the files in the MFTreeSearchCV/ directory using the IPython %ls magic.
%ls MFTreeSearchCV/

converters.py  MFHOO.py           MFTreeSearchCV.py
__init__.py    MFTreeFunction.py  __pycache__/


## Fetching some common data-sets 
- The 20 Newsgroups dataset will be used in this example

In [3]:
# Only the 20 Newsgroups loader is needed by the cells shown in this notebook.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# importing or calling it makes this cell fail on current releases. The Boston
# data it loaded was not read by any later cell shown here, so it is dropped.
from sklearn.datasets import load_digits, fetch_20newsgroups

In [4]:
# Download the 20 Newsgroups corpus with subset='all' and keep its class labels.
newsgroups_train = fetch_20newsgroups(subset='all')
labels = newsgroups_train.target

# Encode the raw documents as TF-IDF vectors. The matrix is intentionally not
# densified (no `.todense()`): it has a very large number of columns.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(newsgroups_train.data)

## Creating the features X and the target y
- Note that there are 15076 samples in the train set
- This will be the number of samples to be used at the highest fidelity in the next cell

In [5]:
X, y = features, labels

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Displayed as the cell output: (number of training samples, number of features).
X_train.shape

(15076, 173762)

## Setting up the estimator and parameters
- As a small example we will simply tune logistic regression
- param_dict specifies that we will be tuning 'C' and 'penalty'
- 'C' is a real-valued parameter to be tuned in the range [1e-5,1e5], and the search is done on a log scale
- 'penalty' is of course a categorical parameter
- n_jobs is the number of threads used during CV
- cv = 3 implies 3-fold cross-validation
- scoring is set as 'accuracy'
- fidelity range is [500,15076], that is, at the highest level we can train on the whole training set, while at the cheapest level we can train using only 500 samples chosen at random

In [6]:
# Base estimator whose hyper-parameters are tuned.
estimator = LogisticRegression()

# Parameter space: one entry per tuned hyper-parameter, each giving its range,
# the scale on which it is searched, and its type (real-valued or categorical).
# NOTE(review): the 'l1' penalty needs a solver that supports it -- confirm that
# the default solver of the installed scikit-learn version does.
param_dict = {
    'C': {'range': [1e-5, 1e5], 'scale': 'log', 'type': 'real'},
    'penalty': {'range': ['l1', 'l2'], 'scale': 'linear', 'type': 'cat'},
}

# Fidelity is the number of training samples used: the lowest fidelity uses 500
# samples and the highest uses the whole training split. The upper bound is
# derived from X_train (15076 rows for the split above) instead of being
# hard-coded, so it cannot drift out of sync with the data.
fidelity_range = [500, X_train.shape[0]]

n_jobs = 3  # number of jobs
cv = 3  # cv level (number of cross-validation folds)
fixed_params = {}
scoring = 'accuracy'

## Budget
- We set the total budget as 100 secs
- This may be only 3-4 times the budget required to do one single training and CV using the whole data-set

In [7]:
# Baseline: time one plain fit (no cross-validation) on the training split.
# time.perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments.
t1 = time.perf_counter()
estimator = estimator.fit(X_train,y_train)
t2 = time.perf_counter()
print('Time without CV: ', t2 - t1)

# Total search budget in seconds (kept outside the timed region above).
total_budget = 100



Time without CV:  11.246359825134277


## Creating an instance of the class

In [8]:
# Assemble the multi-fidelity search from the configuration defined above.
model = MFTreeSearchCV(
    estimator=estimator,
    param_dict=param_dict,
    scoring=scoring,
    fidelity_range=fidelity_range,
    unit_cost=None,  # None: the unit cost is set automatically during fit
    cv=cv,
    n_jobs=n_jobs,
    total_budget=total_budget,
    debug=True,  # debug mode prints progress information while fitting
    fixed_params=fixed_params,
)

## running in debug mode will display certain outputs

## Fitting the model or choosing the best parameter
- Note that refit = True, which means that at the end the model is retrained using the best parameters found

In [9]:
# Run the hyper-parameter search on the training split. The returned object is
# used below for prediction and for reading best_params_ / cv_results_.
m = model.fit(X_train,y_train)

Setting unit cost automatically as None was supplied
Unit Cost:  12.181065082550049
Auto Init: 
C: 0.20333240858400015
nu: 0.20333240858400015
Budget Remaining: 83.1957745203057
Number of MFHOO Instances: 3
Budget per MFHOO Instance:15.550859757551853
Running SOO number: 1 rho: 0.95 nu: 0.20333240858400015
Done!
Running SOO number: 2 rho: 0.9259454627568515 nu: 0.20333240858400015
Done!
Updating C
C: 0.28466537201760017
nu_max: 0.28466537201760017
Running SOO number: 3 rho: 0.8573749999999999 nu: 0.28466537201760017
Done!
Updating C
C: 0.3985315208246402
nu_max: 0.3985315208246402




## Predicting using the best model, then scoring it and then displaying the best_params_

In [24]:
# Predict labels for the held-out test split with the fitted search object.
y_pred = m.predict(X_test)

In [25]:
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy itself is symmetric, so the displayed value is unchanged; the order is
# fixed so the call stays correct if it is swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)

0.926525198938992

In [26]:
# Hyper-parameter setting selected by the search (displayed as the cell output).
m.best_params_

{'C': 316.2277660168377, 'penalty': 'l2'}

In [27]:
# Table of evaluated parameter settings and their scores (displayed as the cell output).
m.cv_results_

Unnamed: 0,params,score
0,"{'C': 316.2277660168377, 'penalty': 'l2'}",0.94641
1,"{'C': 316.2277660168377, 'penalty': 'l2'}",0.948593
