## RandomForestClassifier 디폴트 파라미터 (scikit-learn 1.0 이전 버전 기준)
RandomForestClassifier(
 n_estimators=100,
 criterion='gini',
 max_depth=None,
 min_samples_split=2,
 min_samples_leaf=1,
 min_weight_fraction_leaf=0.0,
 max_features='auto',
 max_leaf_nodes=None,
 min_impurity_decrease=0.0,
 min_impurity_split=None,
 bootstrap=True,
 oob_score=False,
 n_jobs=None,
 random_state=None,
 verbose=0,
 warm_start=False,
 class_weight=None,
 ccp_alpha=0.0,
 max_samples=None,
)

데이터 출처 : https://www.kaggle.com/iabhishekofficial/mobile-price-classification?select=train.csv

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load mobile_train.csv into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='mobile_train.csv')
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
## Inspect the target variable: number of rows per price_range value
df['price_range'].value_counts()

0    500
1    500
2    500
3    500
Name: price_range, dtype: int64

In [6]:
## Split the data into train and test sets
# Passing the target column to `stratify` makes the split preserve
# the class proportions of price_range in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='price_range'),
    df['price_range'],
    test_size=0.2,
    random_state=0,
    stratify=df['price_range'],
)


In [7]:
# Count the test-set rows for each price_range value
y_test.value_counts()

0    100
1    100
2    100
3    100
Name: price_range, dtype: int64

In [18]:
## Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Baseline forest: 1000 trees, all other hyperparameters left at their defaults.
# n_jobs=-1 uses every available core instead of a worker count hard-coded for
# one machine; random_state=0 keeps the fitted forest reproducible.
clf = RandomForestClassifier(n_estimators=1000,
                             n_jobs=-1,
                             random_state=0)
clf.fit(X_train, y_train)  # fit() trains in place and returns the same estimator

# Predict the class of each held-out test row
RF_pred = clf.predict(X_test)

In [19]:
# Check accuracy on the test set
print('accuracy', metrics.accuracy_score(y_true=y_test.to_numpy(), y_pred=RF_pred))

accuracy 0.905


## GridSearchCV 사용
- GridSearchCV를 사용하여 랜덤 포레스트의 하이퍼파라미터를 튜닝해보자

In [22]:
from sklearn.model_selection import GridSearchCV

# Candidate values to search: 4 n_estimators x 5 max_depth = 20 combinations
param = {
    'n_estimators': [100, 500, 1000, 2000],
    'max_depth': [11, 12, 13, 14, 15],
}

# Base estimator; the seed is fixed so each candidate is fitted reproducibly
RF_model = RandomForestClassifier(random_state=0)

# 5-fold cross-validated grid search, scored by accuracy (stated explicitly
# rather than relying on the estimator's default scorer).
# n_jobs=-1 spreads the fits over all available cores instead of a hard-coded 10.
grid_cv = GridSearchCV(RF_model, param_grid=param, scoring='accuracy',
                       cv=5, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train.values)

# NOTE: this rebinds RF_pred, replacing the baseline model's predictions
# from the earlier cell with the tuned model's predictions.
RF_pred = grid_cv.predict(X_test)
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
# best_score_ is the mean cross-validated accuracy of the best candidate,
# measured on the training folds; it is not a test-set score.
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

# Check accuracy on the held-out test set
print('accuracy', metrics.accuracy_score(np.array(y_test), RF_pred) )

Fitting 5 folds for each of 20 candidates, totalling 100 fits
최적 하이퍼 파라미터: 
 {'max_depth': 14, 'n_estimators': 2000}
최고 예측 정확도: 0.8675
accuracy 0.895
