## Random Forest 
- 여러 의사결정트리를 구성하여 이를 종합하는 앙상블 기법 
- 의사결정 나무 수십 ~ 수백개가 예측한 분류 혹은 회귀값을 평균낸 모델 
- 과정 
    - 데이터 부트스트래핑 과정을 통해 N개의 샘플링 데이터 셋 생성 
    - 샘플링 데이터 셋에서 의사결정 트리를 부석하여 그 결과를 종합하여 앙상블 모델을 만들고 오분류율 평가

### Classification

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
data = pd.read_csv("../Data/breast-cancer-wisconsin.csv")
X = data.iloc[:, 1:10]
y = data[['Class']]

from sklearn.model_selection import * 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 410)

from sklearn.preprocessing import * 
minmax = MinMaxScaler()
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [4]:
import sklearn
help(sklearn)

Help on package sklearn:

NAME
    sklearn

DESCRIPTION
    Machine learning module for Python
    
    sklearn is a Python module integrating classical machine
    learning algorithms in the tightly-knit world of scientific Python
    packages (numpy, scipy, matplotlib).
    
    It aims to provide simple and efficient solutions to learning problems
    that are accessible to everybody and reusable in various contexts:
    machine-learning as a versatile tool for science and engineering.
    
    See http://scikit-learn.org for complete documentation.

PACKAGE CONTENTS
    __check_build (package)
    _build_utils (package)
    _config
    _distributor_init
    _isotonic
    _loss (package)
    _min_dependencies
    base
    calibration
    cluster (package)
    compose (package)
    conftest
    covariance (package)
    cross_decomposition (package)
    datasets (package)
    decomposition (package)
    discriminant_analysis
    dummy
    ensemble (package)
    exceptions
    experime

In [6]:
from sklearn.ensemble import * 
dir(sklearn.ensemble)

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gb_losses',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

In [7]:
model = RandomForestClassifier()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [8]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9883040935672515

In [9]:
from sklearn.metrics import * 
con_train = confusion_matrix(y_train, pred_train)
print(con_train)

[[333   0]
 [  0 179]]


In [10]:
report_train = classification_report(y_train, pred_train)
print(report_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [11]:
con_test = confusion_matrix(y_test, pred_test)
print(con_test)

[[109   2]
 [  0  60]]


In [12]:
report_test = classification_report(y_test, pred_test)
print(report_test)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       111
           1       0.97      1.00      0.98        60

    accuracy                           0.99       171
   macro avg       0.98      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171



In [13]:
help(RandomForestClassifier())

Help on RandomForestClassifier in module sklearn.ensemble._forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------

In [14]:
paramGrid = {"n_estimators" : range(100, 1000, 100), 
             "max_features" : ['auto', 'sqrt', 'log2']}
gridSearch = GridSearchCV(RandomForestClassifier(), paramGrid, cv = 5)
gridSearch.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(100, 1000, 100)})

In [15]:
print("Best param : {}".format(gridSearch.best_params_))
print("Best score : {}".format(gridSearch.best_score_))
print("Test score : {}".format(gridSearch.score(X_scaled_test, y_test)))

Best param : {'max_features': 'sqrt', 'n_estimators': 100}
Best score : 0.9707214924804873
Test score : 0.9883040935672515


In [16]:
from scipy.stats import * 
paramRandom = {"n_estimators" : randint(low= 100, high = 1000),
               "max_features" : ['auto', 'sqrt', 'log2']}
randomSearch = RandomizedSearchCV(RandomForestClassifier(), paramRandom, n_iter = 100, cv = 5)
randomSearch.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x14c667490>})

In [17]:
print("Best param : {}".format(randomSearch.best_params_))
print("Best score : {}".format(randomSearch.best_score_))
print("Test score : {}".format(randomSearch.score(X_scaled_test, y_test)))

Best param : {'max_features': 'log2', 'n_estimators': 336}
Best score : 0.972663240053303
Test score : 0.9766081871345029


### Regression

In [18]:
import pandas as pd 
data2 = pd.read_csv("../Data/house_price.csv", encoding="utf-8")
X = data2.iloc[:, 1:5]
y = data2[['house_value']]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=410)
minmax = MinMaxScaler()
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [20]:
from sklearn.ensemble import * 
dir(sklearn.ensemble)

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gb_losses',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

In [21]:
model = RandomForestRegressor()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

0.9392428660160834

In [22]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)
# OverFitting

0.5792173108856868

In [23]:
import numpy as np 
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print("학습 데이터 RMSE : ", RMSE_train)
print("테스트 데이터 RMSE : ", RMSE_test)

학습 데이터 RMSE :  23384.795352747453
테스트 데이터 RMSE :  63081.339892094955


In [24]:
help(RandomForestRegressor)

Help on class RandomForestRegressor in module sklearn.ensemble._forest:

class RandomForestRegressor(ForestRegressor)
 |  RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest regressor.
 |  
 |  A random forest is a meta estimator that fits a number of classifying
 |  decision trees on various sub-samples of the dataset and uses averaging
 |  to improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimato

In [25]:
ParamGrid = {"n_estimators" : range(100, 1000, 100), 
             "max_features" : ['auto', 'sqrt', 'log2']}
gridSearch = GridSearchCV(RandomForestRegressor(), paramGrid, cv = 5)
gridSearch.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(100, 1000, 100)})

In [26]:
print("Best param : {}".format(gridSearch.best_params_))
print("Best score : {}".format(gridSearch.best_score_))
print("Test score : {}".format(gridSearch.score(X_scaled_test, y_test)))

Best param : {'max_features': 'log2', 'n_estimators': 700}
Best score : 0.5758078512309016
Test score : 0.5895707027079429


In [27]:
from scipy.stats import * 
ParamRandom = {"n_estimators" : randint(low = 100, high = 1000)}
randomSearch = RandomizedSearchCV(RandomForestRegressor(), ParamRandom, n_iter = 10, cv = 5)
randomSearch.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x14c68dd30>})

In [29]:
print("Best Param : {}".format(randomSearch.best_params_))
print("Best score : {}".format(randomSearch.best_score_))
print("Test score : {}".format(randomSearch.score(X_scaled_test, y_test)))

Best Param : {'n_estimators': 989}
Best score : 0.5654750077850753
Test score : 0.5841947494357425
