In [37]:
##Validation set

#Until now, we have used test_set to evaluate the model.
#However, if we repeatedly score on test_set while searching for the best model, we end up with a model that is fitted to test_set.
#So from now on we use a validation set. We split it off from the train set, the same way test_set was split off.
#Example
    #Separate 20% of the data as test_set, and another 20% as the validation set.
    #Find the optimal hyperparameters using train_set(60%) and the validation set(20%).
    #Once they are found, give the final score using train_set + validation set(80%) and test_set(20%).

    

import pandas as pd

# Download the wine dataset (CSV) into a DataFrame.
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Features: the three selected columns as a 2-dimensional array
# (one row per sample).
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()

# Labels: the 'class' column as a 1-dimensional array.
target = wine['class'].to_numpy()

from sklearn.model_selection import train_test_split

# First split: hold out 20% of the whole dataset as the test set and keep
# the remaining 80% for training.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Second split: take 20% of the remaining training data (which is itself
# 80% of all data) as the validation set; the rest is the sub-training set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

# Show how many samples ended up in the sub-training and validation sets.
print(sub_input.shape, val_input.shape)

from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyperparameters, fitted on the
# sub-training set only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Score on the data the tree was fitted on, then on the held-out
# validation data.
sub_accuracy = dt.score(sub_input, sub_target)
val_accuracy = dt.score(val_input, val_target)
print(sub_accuracy)
print(val_accuracy)
# The training score is far higher than the validation score, so the tree is
# overfitting the training data. The hyperparameters have to be tuned to find
# a better model.




## Cross validation
# Divide the train set into k folds.
# Each fold is used as validation data in turn; the remaining k-1 folds are
# used as the training data for that round.
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores)
# cross_validate() returns a dictionary (key-value form). By default it does
# 5-fold cross validation; k can be changed with the cv parameter.
#   fit_time    : time taken to train the model on each fold
#   score_time  : time taken to score each fold
#   test_score  : validation score for each fold (scored on the validation fold)
#   train_score : train score for each fold (scored on the training folds);
#                 only present when return_train_score=True
# The final score is the average of test_score. Despite the name, test_score
# is the score on the validation fold, not on the test set.
import numpy as np

fold_scores = scores['test_score']
print('final score of model : ', np.mean(fold_scores))

# Passing a default StratifiedKFold() splitter explicitly gives the same
# 5-fold cross validation as the cross_validate() call without cv.
from sklearn.model_selection import StratifiedKFold

default_splitter = StratifiedKFold()
scores = cross_validate(dt, train_input, train_target, cv=default_splitter)
print(np.mean(scores['test_score']))

# 10-fold cross validation, shuffling the samples before splitting them into
# folds (random_state fixes the shuffle).
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))


## Grid search
from sklearn.model_selection import GridSearchCV

# Search space as a dictionary: each key is the name of a hyperparameter to
# search, each value is the list of candidate values to try for it.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(base_tree, params, n_jobs=-1)
gs.fit(train_input, train_target)

# scikit-learn's grid search automatically retrains on the whole training set
# with the best parameter combination it found, and stores that model in
# best_estimator_.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

# The best hyperparameters found by the grid search are in best_params_.
print(gs.best_params_)

# The average cross-validation score of each hyperparameter candidate is
# stored under the 'mean_test_score' key of the gs.cv_results_ dictionary.
mean_scores = gs.cv_results_['mean_test_score']
print(mean_scores)

# Look the best candidate up by hand: the index of the highest mean score
# selects the matching entry of cv_results_['params'].
best_index = np.argmax(mean_scores)
print(gs.cv_results_['params'][best_index])


# Grid search over a more complex space: find the best combination of three
# hyperparameters at once. Every combination of the candidate values below is
# cross-validated.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20, 1),
    'min_samples_split': range(2, 200, 10),
}

base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(base_tree, params, n_jobs=-1)
gs.fit(train_input, train_target)

# Best combination found, followed by its mean cross-validation score.
print(gs.best_params_)
mean_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_scores))



## Random Search
from scipy.stats import uniform, randint
# uniform - draws real (floating-point) values at random
# randint - draws integer values at random
#           (seems to behave like rand() in C/C++? - unverified guess)

# NOTE: the bare expressions below only sample values; the results are not
# printed or stored, so nothing is shown when this runs as a plain script.

# Integers drawn from 0 up to (but not including) 10.
rgen = randint(low=0, high=10)
rgen.rvs(size=10)

# Count how often each integer appears in 1000 draws.
np.unique(rgen.rvs(size=1000), return_counts=True)

# Real values drawn from the interval [0, 1].
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)


# Describe the search space as distributions to sample from, instead of
# explicit candidate lists.
# NOTE(review): scipy's uniform takes (loc, scale) and samples from
# [loc, loc + scale], so min_impurity_decrease is drawn from [0.0001, 0.0011],
# not [0.0001, 0.001] - confirm this is the intended range.
# randint(low, high) samples integers from low up to (not including) high.
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

from sklearn.model_selection import RandomizedSearchCV

# Sample 100 parameter settings from the distributions above and
# cross-validate each one.
base_tree = DecisionTreeClassifier(random_state=42)
gs = RandomizedSearchCV(
    base_tree,
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

# Best sampled setting, followed by its mean cross-validation score.
print(gs.best_params_)
mean_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_scores))

# Final evaluation: score the refitted best model on the test set, which was
# held out from all of the searching above.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

(4157, 3) (1040, 3)
0.9971133028626413
0.864423076923077
{'fit_time': array([0.00984097, 0.00789809, 0.00699997, 0.00680614, 0.00644493]), 'score_time': array([0.00064492, 0.00052094, 0.00043917, 0.00044298, 0.00042892]), 'test_score': array([0.87019231, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
final score of model :  0.8554925223957948
0.8554925223957948
0.8581873425226026
0.9615162593804117
{'min_impurity_decrease': 0.0001}
[0.86800067 0.86453617 0.86492226 0.86780891 0.86761605]
{'min_impurity_decrease': 0.0001}
{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}
0.8683865773302731
{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}
0.8695428296438884
0.86
