In [1]:
import pandas as pandas
import numpy as np


filepath = "winequality-red.csv"
df = pandas.read_csv(filepath, sep=";", header=None, names=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']);
df = df.iloc[1: , :]
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
2,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
3,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
4,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
5,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [2]:
from sklearn.model_selection import train_test_split


target = np.array(df.pop('quality'))
train, test, train_target, test_target = train_test_split(df, target, stratify = target, test_size = 0.2, random_state = 69)

In [3]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(random_state=69)
tree.fit(train, train_target)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

Decision tree has 657 nodes with maximum depth 20.


In [4]:
from sklearn.metrics import mean_squared_error


tree_predictions = tree.predict(test)
tree_mse = mean_squared_error(test_target, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.7685213074469699

In [5]:
from sklearn.ensemble import RandomForestRegressor



model = RandomForestRegressor(n_estimators=500, bootstrap=True, oob_score=True, n_jobs=-1, random_state=69)
model.fit(train, train_target)



RandomForestRegressor(n_estimators=500, n_jobs=-1, oob_score=True,
                      random_state=69)

In [6]:
n_nodes = []
max_depths = []

for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Mean number of nodes {int(np.mean(n_nodes))}')
print(f'Mean maximum depth {int(np.mean(max_depths))}')

Mean number of nodes 468
Mean maximum depth 19


In [7]:
features = list(train.columns)


fi_model = pandas.DataFrame({'feature': features,
                   'importance': model.feature_importances_}).\
                    sort_values('importance', ascending = False)
fi_model.head(10)

Unnamed: 0,feature,importance
10,alcohol,0.301747
9,sulphates,0.128892
1,volatile acidity,0.117953
6,total sulfur dioxide,0.078374
4,chlorides,0.058675
3,residual sugar,0.05569
7,density,0.053452
0,fixed acidity,0.052953
8,pH,0.052159
5,free sulfur dioxide,0.052064


In [8]:


model_predictions = model.predict(test)
model_mse = mean_squared_error(test_target, model_predictions)
model_rmse = np.sqrt(model_mse)
model_rmse

0.576392748045983

In [9]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

param_grid = {
    'n_estimators': np.linspace(10, 500).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.linspace(0.1, 1)),
    'max_leaf_nodes': [None] + list(np.linspace(5, 10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10, 15, 20],
    'bootstrap': [True, False]
}

#param_grid = {
#    'n_estimators': [100, 500],
#    'max_depth': [None, 3, 20],
#    'max_features': ['auto', 'sqrt', None, 0.5, 1],
#    'max_leaf_nodes': [None, 10, 100, 500],
#    'min_samples_split': [5, 10, 20],
#    'bootstrap': [True, False]
#}

estimator = RandomForestRegressor(random_state = 69)

rs = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, cv = 5, n_iter = 100, verbose = 1, random_state=69)

#rs = GridSearchCV(RandomForestRegressor(random_state=69), param_grid, n_jobs=-1, cv=5)

rs.fit(train, train_target)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=69),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [None, 3, 3, 3, 4, 4, 4, 5,
                                                      5, 5, 6, 6, 6, 7, 7, 7, 8,
                                                      8, 8, 9, 9, 9, 10, 10, 10,
                                                      11, 11, 12, 12, 12, ...],
                                        'max_features': ['auto', 'sqrt', None,
                                                         0.1,
                                                         0.11836734693877551,
                                                         0.13673469387755102,
                                                         0.15510204081632656,
                                                         0.17346938775510207,
                                       

In [10]:
best_model = rs.best_estimator_
rs.best_params_

{'n_estimators': 370,
 'min_samples_split': 20,
 'max_leaf_nodes': None,
 'max_features': 0.11836734693877551,
 'max_depth': 18,
 'bootstrap': True}

In [11]:
n_nodes = []
max_depths = []

for ind_tree in best_model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Mean number of nodes {int(np.mean(n_nodes))}')
print(f'Mean maximum depth {int(np.mean(max_depths))}')


Mean number of nodes 179
Mean maximum depth 16


In [12]:
fi_rs = pandas.DataFrame({'feature': features,
                   'importance': best_model.feature_importances_}).\
                    sort_values('importance', ascending = False)
fi_rs.head(10)

Unnamed: 0,feature,importance
10,alcohol,0.176443
9,sulphates,0.127581
1,volatile acidity,0.120125
7,density,0.098908
2,citric acid,0.086103
6,total sulfur dioxide,0.081388
4,chlorides,0.074808
0,fixed acidity,0.064837
8,pH,0.060593
3,residual sugar,0.055595


In [13]:


rs_predictions = rs.predict(test)
rs_mse = mean_squared_error(test_target, rs_predictions)
rs_rmse = np.sqrt(rs_mse)
rs_rmse

0.6176028962783006

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=10, n_estimators=500, random_state=69)
gbrt.fit(train, train_target)

errors = [mean_squared_error(test_target, gbrt_prediction) for gbrt_prediction in gbrt.staged_predict(test)]

bst_n_estimators = np.argmin(errors) + 1

bst_n_estimators

62

In [15]:
gbrt_best = GradientBoostingRegressor(max_depth=10, n_estimators=bst_n_estimators, random_state=69)
gbrt_best.fit(train, train_target)

GradientBoostingRegressor(max_depth=10, n_estimators=62, random_state=69)

In [16]:
gbrt_predictions = gbrt.predict(test)
gbrt_mse = mean_squared_error(test_target, gbrt_predictions)
gbrt_rmse = np.sqrt(gbrt_mse)
gbrt_rmse

0.5925462919219496