# Bagging

In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import graphviz # pip install 



In [2]:
DATASET_DIR = os.path.join(os.path.dirname(os.getcwd()), 'datasets')

In [3]:
red_file = os.path.join(DATASET_DIR, 'winequality-red.csv')
white_file = os.path.join(DATASET_DIR, 'winequality-white.csv')

In [4]:
wine_red = pd.read_csv(red_file, sep=';')
wine_white = pd.read_csv(white_file, sep=';')

In [5]:
wine_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
wine_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [7]:
len(wine_red), len(wine_white)

(1599, 4898)

In [8]:
wine_red.head()

wine_white.head()

len(wine_red), len(wine_white)

(1599, 4898)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### 1. Feature_n: 2(volatile acidity, sulphates), Binary Problem

In [10]:
new_red = wine_red.copy()
new_red['is_red'] = 1
new_white = wine_white.copy()
new_white['is_red'] = 0

wine = new_red.append(new_white, ignore_index=True)


In [11]:
wine.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1


In [12]:
wine.tail(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
6495,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0
6496,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,0


In [15]:
X = wine.drop('quality',axis=1)
y = wine['quality']

##### split train&test

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=wine['is_red'], random_state=42)

In [63]:
from sklearn.ensemble import RandomForestClassifier
"""
RandomForestClassifier(n_estimators=’warn’, criterion=’gini’, max_depth=None, 
                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                        min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, 
                        random_state=None, verbose=0, warm_start=False, class_weight=None)
                        
# n_estimators : integer, optional (default=10)
- The number of trees in the forest.
# 예측기의 개수=만들어지는 tree의 개수

# criterion : string, optional (default=”gini”)
- The function to measure the quality of a split. 
- Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
Note: this parameter is tree-specific

# max-depth: 
# max_features: 
# max_leaf_nodes:
# ... (decision Tree 상속)

# bootstrap : boolean, optional (default=True)
        Whether bootstrap samples are used when building trees. If False, the
        whole datset is used to build each tree.
        
# oob_score : bool (default=False)
        Whether to use out-of-bag samples to estimate
        the generalization accuracy.
"""
clf =  RandomForestClassifier(10, criterion='entropy', random_state=0)
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [64]:
clf.score(train_X, train_y)

0.9874794745484401

In [65]:
clf.score(test_X, test_y)
# max_features 가 있어야 score가 높아진다.

0.656

##### attribute

In [66]:
# list of DecisionTreeClassifier
clf.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=209652396, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=398764591, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
            

In [67]:
clf.predict_proba(test_X)

array([[0. , 0. , 0.4, 0.5, 0.1, 0. ],
       [0.2, 0. , 0.5, 0.2, 0. , 0.1],
       [0. , 0. , 0.5, 0.3, 0.2, 0. ],
       ...,
       [0. , 0. , 0.1, 0.3, 0.4, 0.2],
       [0. , 0.1, 0.9, 0. , 0. , 0. ],
       [0. , 0. , 0. , 1. , 0. , 0. ]])

In [68]:
clf.classes_

array([3, 4, 5, 6, 7, 8], dtype=int64)

In [69]:
# The number of features when fit is performed.
clf.n_features_

12

In [70]:
clf.feature_importances_
# 다 더하면 1이다.

array([0.07111784, 0.09794398, 0.0796715 , 0.08526031, 0.08706186,
       0.09345741, 0.08783301, 0.10536196, 0.07875339, 0.08203306,
       0.12765338, 0.0038523 ])

In [71]:
# Score of the training dataset obtained using an out-of-bag estimate.
clf.oob_score

False

In [72]:
clf.n_features_

12

In [73]:
clf.max_features

'auto'

In [74]:
clf.predict(test_X)

array([6, 5, 5, ..., 7, 5, 6], dtype=int64)

In [75]:
clf.predict_proba(test_X)

array([[0. , 0. , 0.4, 0.5, 0.1, 0. ],
       [0.2, 0. , 0.5, 0.2, 0. , 0.1],
       [0. , 0. , 0.5, 0.3, 0.2, 0. ],
       ...,
       [0. , 0. , 0.1, 0.3, 0.4, 0.2],
       [0. , 0.1, 0.9, 0. , 0. , 0. ],
       [0. , 0. , 0. , 1. , 0. , 0. ]])

In [76]:
clf.score(train_X, train_y)

0.9874794745484401

In [77]:
clf.score(test_X, test_y)

0.656

# Quality

In [78]:
new_red = wine_red.copy()
new_white = wine_white.copy()


In [79]:
X = new_red.drop('quality', axis=1)
y = new_red['quality']

In [80]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y, random_state=42, test_size=.2)


In [81]:
train_y.head()

1542    6
1558    5
344     6
924     5
971     6
Name: quality, dtype: int64

In [82]:
clf =  RandomForestClassifier(5000, criterion='entropy', random_state=0, max_features='sqrt')
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [83]:
clf.score(train_X, train_y)

1.0

In [84]:
clf.score(test_X, test_y)

0.6875

# Grid Search

In [85]:
"""
GridSearchCV(estimator, param_grid, scoring=None, n_jobs=None, iid=’warn’, refit=True, cv=’warn’, 
             verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise-deprecating’, return_train_score=False)

Exhaustive search over specified parameter values for an estimator.

Important members are fit, predict.

GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.


"""

""

''

In [97]:
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [98]:
# feature_len/3
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [200, 500, 800, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [10, 12, 14, 16, 18, 20],
    'criterion' :['gini', 'entropy']
}
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
CV_clf.fit(train_X, train_y)

# 모든 grid에 대하여 다 실행

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 500, 800, 1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 12, 14, 16, 18, 20], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [101]:
CV_clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [200, 400, 600, 800],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [20, 25, 30, 40],
    'criterion' :['gini', 'entropy']
}
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
CV_clf.fit(train_X, train_y)

# 모든 grid에 대하여 다 실행