# Decision Trees & Random Forests

Практический ноутбук по деревьям решений и композициям таких алгоритмов.

На этом занятии мы будем работать с тем же набором данных, что и на занятии про kNN.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Pokemon table: the first CSV column is read as the index and then
# dropped in favour of a fresh default integer index.
data = (
    pd.read_csv('../data/Pokemon.csv', index_col=0)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the loaded table.
data.head()

## Train Test Split

Преобразовывать категориальные данные мы уже научились. Поэтому пока отложим эти признаки и поработаем только с числовыми.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Names of the columns stored as float64 or int64, in their original order;
# columns of any other dtype are left out. select_dtypes does the dtype
# filtering in pandas instead of comparing each column's dtype by hand.
num_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [None]:
# Hold out 20% of the rows for testing. Features are the numeric columns,
# the target is the 'Legendary' column; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data[num_cols],
    data['Legendary'],
    test_size=0.2,
    random_state=42,
)

## Decision Trees

Начнем с деревьев решений.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Single decision tree with library-default hyperparameters; only the seed is
# fixed, for reproducibility.
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the tree on the training split.
dtree.fit(X_train, y_train)

## Предсказание и оценка

In [None]:
# Predicted class labels for the held-out test rows.
predictions = dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-class precision / recall / F1 on the test split.
# NOTE(review): y_test*1 multiplies the target by 1 — presumably to turn
# boolean labels into 0/1; the later report for dtree2 passes y_test as-is,
# so confirm whether this conversion is still needed.
print(classification_report(y_test*1, predictions))

In [None]:
# Confusion matrix of the true test labels against the tree's predicted labels.
print(confusion_matrix(y_test, predictions))

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
# Both metrics are computed here from the hard class labels returned by
# dtree.predict, not from predicted probabilities.
print('Roc-auc:', roc_auc_score(y_test, predictions))
print('Accuracy:', accuracy_score(y_test, predictions))

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# dtree.classes_; ROC-AUC uses it as a continuous ranking score.
predictions = dtree.predict_proba(X_test)[:, 1]
print('Roc-auc:', roc_auc_score(y_test, predictions))
# accuracy_score compares class labels, so the probabilities are thresholded
# first. Passing raw probabilities only works while every value is exactly
# 0.0 or 1.0 and raises on a mix of binary and continuous targets otherwise.
# A strict "> 0.5" keeps a 0.5 tie on the first class, as predict does.
print('Accuracy:', accuracy_score(y_test, predictions > 0.5))

In [None]:
# Second tree: an internal node is split only if it holds at least 10 samples.
# fit() returns the estimator itself, so construction and training are chained.
dtree2 = DecisionTreeClassifier(min_samples_split=10, random_state=42).fit(X_train, y_train)

# Metrics computed from hard class labels.
predictions2 = dtree2.predict(X_test)
print(classification_report(y_test, predictions2))
print(confusion_matrix(y_test, predictions2))
print('Roc-auc:', roc_auc_score(y_test, predictions2))
print('Accuracy:', accuracy_score(y_test, predictions2))

# ROC-AUC recomputed from predicted probabilities (column 1 of predict_proba);
# predictions2 is overwritten with those probabilities.
predictions2 = dtree2.predict_proba(X_test)[:, 1]
print('Roc-auc:', roc_auc_score(y_test, predictions2))

## Визуализация деревьев

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Write the first fitted tree (dtree) to a Graphviz .dot file, labelling the
# split features with the numeric column names; filled=True colours the nodes.
export_graphviz(dtree, feature_names=num_cols, out_file='small_tree3.dot', filled=True)

Полученный `.dot`-файл можно визуализировать, например, здесь: http://www.webgraphviz.com/

## Random Forests

Сравним результаты одного дерева и композиции деревьев.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees with a fixed seed, trained on the same training
# split as the single decision tree.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predicted class labels from the forest for the test rows.
rfc_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix of the true test labels against the forest's predicted labels.
print(confusion_matrix(y_test, rfc_pred))

In [None]:
# Per-class precision / recall / F1 for the forest on the test split.
print(classification_report(y_test, rfc_pred))

In [None]:
# ROC-AUC computed first from the hard class labels held in rfc_pred ...
print(roc_auc_score(y_test, rfc_pred))
# ... then from column 1 of predict_proba (the second class in rfc.classes_);
# rfc_pred is overwritten with those probabilities.
rfc_pred = rfc.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, rfc_pred))

In [None]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [None]:
# 3-fold cross-validation with shuffling; the seed keeps the fold assignment
# reproducible.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Exhaustive search over max_features x max_depth for a 100-tree forest,
# ranking candidates by cross-validated ROC-AUC.
gs = GridSearchCV(RandomForestClassifier(n_estimators=100, random_state=42),
                  param_grid={'max_features': [None, 'log2', 'sqrt'],
                              'max_depth': [4, 6, 8]},
                  cv=cv,
                  scoring='roc_auc')
# Tune on the training split only: fitting on the full table would let the
# held-out test rows influence the selected hyperparameters and make the
# later test-set ROC-AUC optimistically biased.
gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score (ROC-AUC, per the scoring argument) of the best
# parameter combination.
gs.best_score_

In [None]:
# Estimator configured with the best-scoring parameter combination.
gs.best_estimator_

In [None]:
# Parameter setting that gave the best cross-validated score.
gs.best_params_

In [None]:
# Forest retrained on the training split with max_depth / max_features fixed
# by hand.
# NOTE(review): these values are presumably copied from the grid-search
# result — confirm they still match gs.best_params_.
rfc = RandomForestClassifier(
    max_depth=8,
    max_features='log2',
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the test split by column 1 of predict_proba (the second class in
# rfc.classes_) and report ROC-AUC.
rfc_pred = rfc.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, rfc_pred))