Задание 1

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
import os

# Show which entries are available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Read the comma-separated wine-quality table and preview the first 20 rows,
# transposed so that every column of the table is shown as a row.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(csv_path, sep=',')
df.head(20).transpose()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded table.
df.info()

Задача заключается в определении качества красного вина на основе имеющихся у нас признаков. Целевой признак (target) — это quality.

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the target column from the feature columns.
X = df.drop(columns='quality')
y = df['quality']

# Standardize every feature; the scaler is fitted on all rows of X here.
scaler = StandardScaler()
X_new = scaler.fit_transform(X)

# Preview the first five rows and first five columns of the scaled matrix.
print(X_new[:5, :5])

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw (unscaled) features first, so that the scaling statistics are
# estimated from the training rows only and nothing about the validation rows
# leaks into preprocessing. The row partition depends only on the number of
# samples and random_state, so it is the same partition as before.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=626)

# Fit a fresh scaler on the training part and apply it to both parts.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_valid = split_scaler.transform(X_valid_raw)

In [None]:
# Baseline decision tree limited to depth 3, trained on the training split.
tree = DecisionTreeClassifier(max_depth=3, random_state=626)
tree.fit(X_train, y_train)

In [None]:
# Predict on the validation split and show the tree's score on the same split.
y_pred = tree.predict(X_valid)
tree.score(X_valid, y_valid)

In [None]:
# Accuracy of the stored validation predictions against the true labels.
accuracy_score(y_valid, y_pred)

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted baseline tree to 'tree.dot' in Graphviz DOT format.
export_graphviz(tree, out_file='tree.dot', feature_names=list(X.columns))

# Echo the file through a context manager so the handle is closed
# deterministically instead of being left for the garbage collector.
with open('tree.dot') as dot_file:
    print(dot_file.read())

Задание 2

In [None]:
# Candidate values 50..75 for the min_samples_split parameter.
tree_params = dict(min_samples_split=np.arange(50, 76))

# Shuffled 5-fold splitter with a fixed seed; later searches in this notebook reuse it.
kf = KFold(n_splits=5, shuffle=True, random_state=626)

# Accuracy-scored grid search built on the baseline tree estimator.
tree_grid = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=kf)
tree_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring min_samples_split from the search.
tree_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
tree_grid.best_score_

In [None]:
# Candidate values 50..75 for min_samples_leaf.
tree2_params = dict(min_samples_leaf=np.arange(50, 76))

# Base tree with min_samples_split pinned to 59.
# NOTE(review): 59 is presumably the best value reported by tree_grid -- confirm.
tree2 = DecisionTreeClassifier(random_state=626, min_samples_split=59)

tree2_grid = GridSearchCV(estimator=tree2, param_grid=tree2_params, scoring='accuracy', cv=kf)
tree2_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring min_samples_leaf from the search.
tree2_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
tree2_grid.best_score_

In [None]:
# Candidate depths 2..10.
tree3_params = dict(max_depth=np.arange(2, 11))

# Base tree with min_samples_split=59 and min_samples_leaf=61 held fixed.
# NOTE(review): both values are presumably the winners of the two previous searches -- confirm.
tree3 = DecisionTreeClassifier(random_state=626, min_samples_split=59, min_samples_leaf=61)

tree3_grid = GridSearchCV(estimator=tree3, param_grid=tree3_params, scoring='accuracy', cv=kf)
tree3_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring max_depth from the search.
tree3_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
tree3_grid.best_score_

In [None]:
# Candidate values 1..11 for max_features.
tree4_params = dict(max_features=np.arange(1, 12))

# Base tree with depth, leaf size and split size held fixed.
# NOTE(review): the fixed values are presumably taken from the earlier searches -- confirm.
tree4 = DecisionTreeClassifier(
    random_state=626,
    max_depth=4,
    min_samples_split=59,
    min_samples_leaf=61,
)

tree4_grid = GridSearchCV(estimator=tree4, param_grid=tree4_params, scoring='accuracy', cv=kf)
tree4_grid.fit(X_train, y_train)


In [None]:
# Estimator with the best-scoring max_features from the search.
tree4_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
tree4_grid.best_score_

In [None]:
# Plot mean cross-validated accuracy against each tree hyperparameter searched
# above, one panel per grid search.
# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra empty figure in the cell output).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))

# (axes, parameter name, parameter grid, fitted search) for every panel.
panels = [
    (ax[0][0], 'min_samples_split', tree_params, tree_grid),
    (ax[0][1], 'min_samples_leaf', tree2_params, tree2_grid),
    (ax[1][0], 'max_depth', tree3_params, tree3_grid),
    (ax[1][1], 'max_features', tree4_params, tree4_grid),
]
for axis, param_name, grid_values, search in panels:
    axis.plot(grid_values[param_name], search.cv_results_['mean_test_score'])
    axis.set_xlabel(param_name)
    axis.set_ylabel('mean_accuracy')

In [None]:
# Only the last expression of a cell is displayed, so intermediate results are
# printed explicitly; otherwise the table and the accuracy would be discarded.
print(pd.DataFrame(tree4_grid.cv_results_).head().T)

# Evaluate the best estimator of the max_features search on the validation split.
best_tree = tree4_grid.best_estimator_
y_pred = best_tree.predict(X_valid)
print("Validation accuracy of best_tree:", accuracy_score(y_valid, y_pred))

# Write the tree to 'best_tree.dot' and echo it; the context manager closes
# the file handle deterministically.
export_graphviz(best_tree, out_file='best_tree.dot', feature_names=list(X.columns))
with open('best_tree.dot') as dot_file:
    print(dot_file.read())

In [None]:
# Joint search over four tree hyperparameters at once.
tree0_params = dict(
    max_depth=np.arange(2, 16),
    min_samples_split=np.arange(2, 10),
    min_samples_leaf=np.arange(1, 10),
    max_features=np.arange(1, 12),
)

# The grid also contains max_depth, so the search tries its own depth values
# for every candidate rather than only the depth given here.
tree0 = DecisionTreeClassifier(random_state=626, max_depth=3)

tree0_grid = GridSearchCV(estimator=tree0, param_grid=tree0_params, scoring='accuracy', cv=kf)
tree0_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring parameter combination from the joint search.
tree0_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
tree0_grid.best_score_

In [None]:
import matplotlib.pyplot as plt

# Map 1-based labels "f1", "f2", ... to feature names. The names come from
# X.columns (the columns the model is trained on) rather than df.columns, which
# also holds the target and would misalign the lookup unless 'quality' is last.
features = {'f' + str(pos + 1): name for pos, name in enumerate(X.columns)}

# NOTE: despite its name, `forest` is a single decision tree in this cell.
# NOTE(review): the hyperparameters are presumably taken from tree0_grid -- confirm.
forest = DecisionTreeClassifier(min_samples_split=3, max_depth=11, max_features=10, random_state=626)
forest.fit(X_train, y_train)

importances = forest.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Never ask for more bars than there are features.
num_to_plot = min(10, len(indices))
top_indices = indices[:num_to_plot]
feature_indices = [ind + 1 for ind in top_indices]

print("Feature ranking:")
for rank, (label_no, ind) in enumerate(zip(feature_indices, top_indices), start=1):
    print(rank, features["f" + str(label_no)], importances[ind])

plt.figure(figsize=(15, 5))
plt.title("Feature importances")
# Grayscale shades given as strings: earlier (more important) bars are darker.
gray_levels = [str(i / float(num_to_plot + 1)) for i in range(num_to_plot)]
bars = plt.bar(range(num_to_plot),
               importances[top_indices],
               color=gray_levels,
               align="center")
# Tick labels are the 1-based feature numbers; the legend maps them to names.
ticks = plt.xticks(range(num_to_plot), feature_indices)
plt.xlim([-1, num_to_plot])
plt.legend(bars, [features["f" + str(i)] for i in feature_indices]);


Наиболее важным признаком является содержание спирта в вине.

Задание 3

In [None]:
# Baseline random forest with 100 trees, trained on the training split.
rf = RandomForestClassifier(random_state=626, n_estimators=100)
rf.fit(X_train, y_train)

# Accuracy of the forest's predictions on the validation split.
y_pred = rf.predict(X_valid)
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# Candidate forest sizes.
rf_params = dict(n_estimators=[50, 100, 200, 300, 400])

# Accuracy-scored grid search using the shared K-fold splitter.
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=kf, scoring='accuracy')
rf_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring n_estimators from the search.
rf_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
rf_grid.best_score_

In [None]:
# Candidate depths 5..15.
rf2_params = dict(max_depth=np.arange(5, 16))

# Forest with 300 trees.
# NOTE(review): 300 is presumably the best n_estimators from rf_grid -- confirm.
rf2 = RandomForestClassifier(random_state=626, n_estimators=300)

rf2_grid = GridSearchCV(estimator=rf2, param_grid=rf2_params, cv=kf, scoring='accuracy')
rf2_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring max_depth from the search.
rf2_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
rf2_grid.best_score_

In [None]:
# Candidate values 2..9 for min_samples_split.
rf3_params = dict(min_samples_split=np.arange(2, 10))

# Forest with 300 trees and depth capped at 13.
# NOTE(review): 13 is presumably the best max_depth from rf2_grid -- confirm.
rf3 = RandomForestClassifier(random_state=626, n_estimators=300, max_depth=13)

rf3_grid = GridSearchCV(estimator=rf3, param_grid=rf3_params, cv=kf, scoring='accuracy')
rf3_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring min_samples_split from the search.
rf3_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
rf3_grid.best_score_

In [None]:
# Candidate values 1..9 for min_samples_leaf.
rf4_params = dict(min_samples_leaf=np.arange(1, 10))

# Same base forest as the previous search: 300 trees, depth capped at 13.
rf4 = RandomForestClassifier(random_state=626, n_estimators=300, max_depth=13)

rf4_grid = GridSearchCV(estimator=rf4, param_grid=rf4_params, cv=kf, scoring='accuracy')
rf4_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring min_samples_leaf from the search.
rf4_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
rf4_grid.best_score_

In [None]:
# Candidate values 1..11 for max_features.
rf5_params = dict(max_features=np.arange(1, 12))

# Forest with 300 trees and depth capped at 13.
# NOTE(review): this forest is seeded with 2019, whereas rf..rf4 use 626, so its
# scores are not computed under the same seed as the earlier searches -- confirm
# that the change of seed is intentional.
rf5 = RandomForestClassifier(random_state=2019, n_estimators=300, max_depth=13)

rf5_grid = GridSearchCV(estimator=rf5, param_grid=rf5_params, cv=kf, scoring='accuracy')
rf5_grid.fit(X_train, y_train)

In [None]:
# Estimator with the best-scoring max_features from the search.
rf5_grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
rf5_grid.best_score_

In [None]:
# Plot mean cross-validated accuracy against each random-forest hyperparameter
# searched above, one panel per grid search.
# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra empty figure in the cell output).
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# (axes, parameter name, parameter grid, fitted search) for every panel.
panels = [
    (ax[0][0], 'n_estimators', rf_params, rf_grid),
    (ax[0][1], 'max_depth', rf2_params, rf2_grid),
    (ax[1][0], 'min_samples_split', rf3_params, rf3_grid),
    (ax[1][1], 'min_samples_leaf', rf4_params, rf4_grid),
    (ax[2][0], 'max_features', rf5_params, rf5_grid),
]
for axis, param_name, grid_values, search in panels:
    axis.plot(grid_values[param_name], search.cv_results_['mean_test_score'])
    axis.set_xlabel(param_name)
    axis.set_ylabel('mean accuracy')

# Only five searches exist for the six-panel grid; hide the unused last panel.
ax[2][1].set_visible(False)

In [None]:
import matplotlib.pyplot as plt

# Map 1-based labels "f1", "f2", ... to feature names. The names come from
# X.columns (the columns the model is trained on) rather than df.columns, which
# also holds the target and would misalign the lookup unless 'quality' is last.
features = {'f' + str(pos + 1): name for pos, name in enumerate(X.columns)}


from sklearn.ensemble import RandomForestClassifier
# NOTE(review): the hyperparameters are presumably taken from the searches above -- confirm.
forest = RandomForestClassifier(max_depth=13, max_features=3, n_estimators=300, random_state=2019)
forest.fit(X_train, y_train)

importances = forest.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Never ask for more bars than there are features.
num_to_plot = min(10, len(indices))
top_indices = indices[:num_to_plot]
feature_indices = [ind + 1 for ind in top_indices]

print("Feature ranking:")
for rank, (label_no, ind) in enumerate(zip(feature_indices, top_indices), start=1):
    print(rank, features["f" + str(label_no)], importances[ind])

plt.figure(figsize=(15, 5))
plt.title("Feature importances")
# Grayscale shades given as strings: earlier (more important) bars are darker.
gray_levels = [str(i / float(num_to_plot + 1)) for i in range(num_to_plot)]
bars = plt.bar(range(num_to_plot),
               importances[top_indices],
               color=gray_levels,
               align="center")
# Tick labels are the 1-based feature numbers; the legend maps them to names.
ticks = plt.xticks(range(num_to_plot), feature_indices)
plt.xlim([-1, num_to_plot])
plt.legend(bars, [features["f" + str(i)] for i in feature_indices]);


Снова наиболее важным признаком является содержание спирта.

Наивысшая точность, полученная нами с помощью одного дерева решений, ≈ 0.608. Используя же случайный лес, мы получили точность ≈ 0.686.