
#### _ML продвинутые методы / ДЗ №1 / Практическая часть / Задача 1_

## 1. _RandomForest vs DecisionTree_ 

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('wine_data.csv', header=None)
print(data.shape)
data.head()

(178, 14)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
data[0].value_counts(normalize=True, dropna=False)

2    0.398876
1    0.331461
3    0.269663
Name: 0, dtype: float64

In [5]:
y = data[0].values
X = data.drop(0, axis=1).values
print(X.shape, y.shape)

(178, 13) (178,)


In [6]:
np.unique(y, return_counts=True)

(array([1, 2, 3], dtype=int64), array([59, 71, 48], dtype=int64))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=223)

print(X_train.shape, X_test.shape)

(142, 13) (36, 13)


### 1.1 _DecisionTreeClassifier_ 

In [8]:
%%time

dtc = DecisionTreeClassifier()

dtc_grid_params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : list(range(2, 10)),
    'max_features' : [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0],
    'min_samples_leaf' : list(range(1, 15))
}

gs_dtc = GridSearchCV(dtc, dtc_grid_params, scoring='r2', cv=4, iid=True, n_jobs=-1)
gs_dtc.fit(X_train, y_train)

best_dtc = gs_dtc.best_estimator_

Wall time: 11.7 s


In [9]:
best_dtc

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=0.9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
def print_scores(clf_name, y_test, y_pred):
    print('[{}] R2 (test): {:.3f}'.format(clf_name, r2_score(y_test, y_pred)))
    print('[{}] accuracy (test): {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))
    print()
    print( classification_report(y_test, y_pred) )
    print('*' * 60)

In [11]:
y_dtc_pred = best_dtc.predict(X_test) 

In [12]:
print('[{}] R2 best (train): {:.3f}'.format('DecisionTreeClassifier', gs_dtc.best_score_))
print_scores('DecisionTreeClassifier', y_test, y_dtc_pred)

[DecisionTreeClassifier] R2 best (train): 0.953
[DecisionTreeClassifier] R2 (test): 0.711
[DecisionTreeClassifier] accuracy (test): 0.917

              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       0.93      0.93      0.93        15
           3       0.88      0.78      0.82         9

   micro avg       0.92      0.92      0.92        36
   macro avg       0.91      0.90      0.91        36
weighted avg       0.92      0.92      0.91        36

************************************************************


### 1.2 _RandomForestClassifier_

Параметры для решающего дерева возьмём равными найденным выше для `DecisionTreeClassifier`.

In [13]:
%%time

rfc_params = {k:v for (k,v) in best_dtc.get_params().items() 
                        if k in ['criterion', 'max_depth','max_features', 
                                 'min_samples_leaf', 'min_samples_split']}

rfc = RandomForestClassifier(n_estimators=500, **rfc_params)

print( cross_val_score(rfc, X_train, y_train, cv=4, scoring='r2').mean() )

0.9771573604060914
Wall time: 4.02 s


In [14]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features=0.9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
y_rfc_pred = rfc.predict(X_test) 

In [16]:
print_scores('RandomForestClassifier', y_test, y_rfc_pred)

[RandomForestClassifier] R2 (test): 0.904
[RandomForestClassifier] accuracy (test): 0.944

              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       1.00      0.87      0.93        15
           3       0.90      1.00      0.95         9

   micro avg       0.94      0.94      0.94        36
   macro avg       0.94      0.96      0.95        36
weighted avg       0.95      0.94      0.94        36

************************************************************


### 1.3 _Feature selection_

In [17]:
from sklearn.feature_selection import SelectFromModel

Уменьшим число признаков с помощью уже найденной модели:

In [18]:
X_reduced = SelectFromModel(rfc, prefit=True).transform(X)

X.shape, X_reduced.shape

((178, 13), (178, 4))

In [19]:
X_train_reduced, X_test_reduced = train_test_split(X_reduced, test_size=0.2, random_state=223)

print(X_train_reduced.shape, X_test_reduced.shape)

(142, 4) (36, 4)


In [20]:
%%time

rfc_reduced = RandomForestClassifier()

rfc_reduced_grid_params = {
    'n_estimators' : [ 500 ],
    'criterion' : [ 'entropy' ],
    'max_depth' : list(range(2, 11, 2)),
    'min_samples_leaf' : list(range(1, 16, 2))
}

gs_rfc_reduced = GridSearchCV(rfc_reduced, rfc_reduced_grid_params, scoring='r2', iid=True, cv=4, n_jobs=-1)
gs_rfc_reduced.fit(X_train_reduced, y_train)

best_rfc_reduced = gs_rfc_reduced.best_estimator_

Wall time: 59.4 s


In [21]:
best_rfc_reduced

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
y_rfc_pred_reduced =  best_rfc_reduced.predict(X_test_reduced) 

In [23]:
print('[{}] R2 best (train): {:.3f}'.format('RandomForestClassifier(reduced)', gs_rfc_reduced.best_score_))
print_scores('RandomForestClassifier(reduced)', y_test, y_rfc_pred_reduced)

[RandomForestClassifier(reduced)] R2 best (train): 0.942
[RandomForestClassifier(reduced)] R2 (test): 0.855
[RandomForestClassifier(reduced)] accuracy (test): 0.917

              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       0.93      0.87      0.90        15
           3       0.89      0.89      0.89         9

   micro avg       0.92      0.92      0.92        36
   macro avg       0.91      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

************************************************************
