In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(42)

In [None]:
# Load the red-wine quality dataset from the Kaggle input directory.
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
# Preview only the first rows instead of dumping the whole frame into the output.
df.head()

## **Basic Data Exploration**

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Make the column names attribute-friendly: "fixed acidity" -> "fixed_acidity".
# regex=False keeps the replacement a plain literal substitution.
df.columns = df.columns.str.replace(' ', '_', regex=False)
df.columns

In [None]:
# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({"figure.figsize": [12, 9]})

In [None]:
# Plot every column against the row index, one subplot per column.
df.plot(subplots=True)
plt.show()

## **Correlation Matrix**

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation between all columns (features and target).
corr = df.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
)
# Rotate the x tick labels so the long column names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)
# Render the figure; an argument-less plt.plot() draws nothing and was not the intent.
plt.show()

In [None]:
# Feature matrix: every column except the last one (the `quality` target).
X = df.iloc[:, :-1]
# Preview only the first rows instead of dumping the whole frame.
X.head()

In [None]:
# Target kept as a single-column DataFrame because later cells access `y.quality`.
y = df[['quality']]
# Preview only the first rows instead of dumping the whole frame.
y.head()

## **Testing Various Regression Models**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature (zero mean, unit variance); X becomes a NumPy array.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the training features. Consider
# fitting it on the training split only.
X = StandardScaler().fit_transform(X)  # standardisation of the features

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Ordinary least-squares baseline fitted on the training split.
reg = LinearRegression().fit(X_train, y_train)

In [None]:
# Predict quality for the held-out test rows.
y_hat = reg.predict(X_test)

In [None]:
# R^2 of the linear model on the test split.
r2_score(y_test, y_hat)

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
# Pick the regularisation strength by 10-fold CV on the training split only,
# then report R^2 on the held-out test split. Fitting and scoring on the full
# data (X, y) gives an in-sample score that cannot be compared with the
# linear-regression test score computed above.
reg1 = RidgeCV(alphas=[0.001, 0.01, 0.1], cv=10).fit(X_train, y_train)
reg1.score(X_test, y_test)

In [None]:
# Constructor parameters of the fitted RidgeCV estimator.
reg1.get_params()

In [None]:
# Regularisation strength selected by cross-validation.
reg1.alpha_

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Tune the elastic-net penalty by 10-fold CV on the training split only and
# score on the held-out test split, so the R^2 is an out-of-sample figure.
# The target is flattened because y_train is a single-column DataFrame and the
# estimator expects a 1-D target.
reg2 = ElasticNetCV(cv=10, random_state=42).fit(X_train, y_train.values.ravel())
reg2.score(X_test, y_test)

In [None]:
# Penalty strength selected by cross-validation.
reg2.alpha_

The best $R^2$ score was achieved by the plain linear regression model, without any modifications.

## **Classification with Logistic Regression**

We have 6 classes, so we could predict the class of each red wine in this dataset. It is possible to apply a multiclass classifier, but it is more reasonable to split the marks into good and bad, as the author of this dataset advises. If $mark < 6.5$ the wine quality is bad; otherwise it is good.

In [None]:
# Distinct quality marks present in the data.
y.quality.unique()

In [None]:
# Binary target: 1 ("good") when the mark is above 6.5, otherwise 0 ("bad").
# A vectorised comparison replaces the row-by-row lambda.
y_lgs = (y.quality > 6.5).astype(int).to_frame()
# Preview only the first rows instead of dumping the whole frame.
y_lgs.head()

In [None]:
# Same 80/20 split as before, this time with the binary good/bad labels as target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_lgs, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Use the same seed (42) as every other estimator in this notebook.
clf = LogisticRegression(solver='lbfgs', random_state=42)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# classifier expects, which avoids a DataConversionWarning.
clf.fit(X_train, y_train.values.ravel())

In [None]:
# Predicted good/bad labels for the held-out test rows.
y_hat = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

**Logistic regression classifier score:**

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_hat)

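This result looks much better than what we got with the regression models, but note that accuracy and $R^2$ are different metrics, so the two numbers are not directly comparable.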

## **Let's try Random Forest**

In [None]:
# Re-create the split with the original quality marks (`y`) as target, replacing
# the binary labels used for the logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest explicitly so the score and feature importances are
# repeatable, consistent with the other estimators in this notebook.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# classifier expects, which avoids a DataConversionWarning.
clf.fit(X_train, y_train.values.ravel())

In [None]:
# Predicted quality marks for the held-out test rows.
y_hat = clf.predict(X_test)

**Random Forest Classifier score**

In [None]:
# Fraction of test rows whose predicted quality mark matches the true mark.
accuracy_score(y_test, y_hat)

In [None]:
# Column names, used as labels for the feature importances in the next cell.
df.columns

In [None]:
# Label each importance with its feature name. The features are every column
# except the last (the target), so slice with [:-1] rather than a hard-coded 11.
feature_importance = pd.Series(
    clf.feature_importances_, index=df.columns[:-1]
).sort_values(ascending=False)
feature_importance

In [None]:
# Horizontal bar chart of the importances, most important feature first.
sns.barplot(x=feature_importance, y=feature_importance.index)
plt.xlabel('Feature importance Score')
plt.ylabel('Features')
plt.title('Visualizing Important Features')
# No artist carries a label, so a legend call would only emit a warning;
# render the figure instead.
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [None]:
# Default-parameter forest, created only to inspect its parameters below.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Show the full parameter set of the default forest as a reference for tuning.
print('Parameters currently in use: \n')
pprint(rf.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate forest sizes: ten evenly spaced values from 200 to 2000 trees.
n_estimators = np.linspace(200, 2000, 10).astype(int).tolist()

In [None]:
# Number of features considered at each split. The legacy 'auto' option has been
# removed from recent scikit-learn releases; for regressors it meant "use all
# features", which is spelled 1.0.
max_features = [1.0, 'sqrt']

In [None]:
# Candidate tree depths 2, 4, ..., 14, plus None as an extra option.
max_depth = [*np.linspace(2, 14, 7).astype(int).tolist(), None]

In [None]:
# Candidate values for the minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]

In [None]:
# Candidate values for the minimum number of samples required at a leaf.
min_samples_leaf = [1, 2, 4]

In [None]:
# Try both with and without bootstrap sampling of the training rows.
bootstrap = [True, False]

In [None]:
# Search space for the randomized search: one entry per tunable hyper-parameter.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

In [None]:
# Base estimator for the randomized search, seeded like the forest created
# earlier so that every candidate is trained repeatably.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Sample 100 parameter combinations from `random_grid`, each evaluated with
# 3-fold cross-validation on all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the forest
# expects, otherwise every fit in the search emits a DataConversionWarning.
rf_random.fit(X_train, y_train.values.ravel())

In [None]:
# Best parameter combination found by the randomized search.
rf_random.best_params_

In [None]:
# Small 10-tree forest used as the baseline to compare the tuned model against.
base_model = RandomForestRegressor(n_estimators=10, random_state=42)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the forest
# expects, which avoids a DataConversionWarning.
base_model.fit(X_train, y_train.values.ravel())

In [None]:
# Evaluate the baseline forest on the test split.
y_hat = base_model.predict(X_test)
# Note: despite the variable name and printed label, this value is an R^2 score.
base_accuracy = r2_score(y_test, y_hat)
print('Accuracy of base model:\n', base_accuracy)

In [None]:
# Evaluate the best estimator from the randomized search on the test split.
best_random = rf_random.best_estimator_
y_hat = best_random.predict(X_test)
# Note: despite the variable name and printed label, this value is an R^2 score.
random_accuracy = r2_score(y_test, y_hat)
print('Accuracy of random model:\n', random_accuracy)

In [None]:
# Relative change of the tuned model's score over the baseline, in percent.
relative_gain = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(relative_gain))

## **Practice with GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid for the random-forest grid search.
param_grid = dict(
    bootstrap=[True],
    max_depth=[8, 10, 12, 14],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

In [None]:
# Base estimator for the grid search, seeded so that every candidate is trained
# repeatably and the selected parameters do not change between runs.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Evaluate every combination in `param_grid` with 3-fold cross-validation on
# all available cores.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the forest
# expects, otherwise every fit in the search emits a DataConversionWarning.
grid_search.fit(X_train, y_train.values.ravel())

In [None]:
# Best parameter combination found by the grid search.
grid_search.best_params_

In [None]:
# Estimator refitted with the best parameters found by the grid search.
best_grid = grid_search.best_estimator_

In [None]:
# Evaluate the grid-search winner on the test split.
y_hat = best_grid.predict(X_test)
# Note: despite the variable name and printed label, this value is an R^2 score.
grid_accuracy = r2_score(y_test, y_hat)
print('Accuracy of grid model:\n', grid_accuracy)

## **Bring Neural Networks to work**

Random Forest performs pretty well, but let's try another powerful model - the perceptron. Maybe we can build a better model with a basic neural network, or with an ensemble of perceptrons.

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Multi-layer perceptron regressor with otherwise default architecture.
# NOTE(review): per the scikit-learn docs `learning_rate` appears to be used only
# with solver='sgd', so 'adaptive' is probably inert here - confirm.
nnrg = MLPRegressor(solver='adam', learning_rate='adaptive', random_state=42,
                    max_iter=500)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects, which avoids a DataConversionWarning.
nnrg.fit(X_train, y_train.values.ravel())

In [None]:
# R^2 of the perceptron on the test split.
nnrg.score(X_test, y_test)

The basic perceptron performed pretty well, at least better than the **_Linear Regression_** model, but worse than _Random Forest_. Let's try to find better parameters for this model. Note also that the algorithm did not converge.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the perceptron: architecture, activation, optimiser,
# L2 penalty, learning-rate schedule and iteration budget.
param_grid = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
    max_iter=[200, 300, 500, 1000, 1500],
)

In [None]:
# Base estimator for the hyper-parameter search, seeded so the weight
# initialisation (and therefore the search result) is repeatable.
nn = MLPRegressor(random_state=42)

In [None]:
# Randomized search over the perceptron search space, matching the variable
# name, the RandomizedSearchCV import just above and the accompanying text.
# 100 sampled combinations with 3-fold CV mirrors the random-forest search.
random_search = RandomizedSearchCV(estimator=nn,
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   n_jobs=-1,
                                   verbose=2,
                                   random_state=42)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects, otherwise every fit in the search emits a DataConversionWarning.
random_search.fit(X_train, y_train.values.ravel())

In [None]:
# Best perceptron parameter combination found by the search.
random_search.best_params_

In [None]:
# Score of the best perceptron found by the search, on the test split.
random_search.score(X_test, y_test)

In [None]:
# Absolute gap between the tuned and the default perceptron, in percentage points.
tuned_r2 = random_search.score(X_test, y_test)
default_r2 = nnrg.score(X_test, y_test)
abs(tuned_r2 - default_r2) * 100

With the randomized search the model performed worse than with the basic parameters. But maybe it can make better predictions. So, let's try to ensemble perceptrons using this regressor.

For this purpose I decided to try the technique that is used in Random Forest - **Bagging** - and let's use 300 models, as we did in Random Forest above.

In [None]:
from sklearn.ensemble import BaggingRegressor

I set **_max_iter_** to 1500, because I saw that the minimum was not found with 700 or 1000 iterations.

In [None]:
# Perceptron used as the base model of the bagging ensemble below.
# NOTE(review): per the scikit-learn docs `learning_rate` appears to be used only
# with solver='sgd', so 'adaptive' is probably inert here - confirm.
nnb = MLPRegressor(activation='tanh',
                   alpha=0.0001,
                   hidden_layer_sizes=(100,),
                   learning_rate='adaptive',
                   solver='adam',
                   max_iter=1500,
                   random_state=42)

In [None]:
# Bagging ensemble of 300 regressors built from `nnb`.
# The base model is passed positionally because its keyword name differs between
# scikit-learn releases (`base_estimator` in older ones, `estimator` in newer
# ones, where the old keyword has been removed).
clf = BaggingRegressor(nnb,
                       n_estimators=300,
                       random_state=42,
                       verbose=2)

In [None]:
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressors expect, otherwise each ensemble member emits a DataConversionWarning.
clf.fit(X_train, y_train.values.ravel())

In [None]:
# R^2 of the bagged perceptron ensemble on the test split.
clf.score(X_test, y_test)

As expected, an ensemble of perceptrons works worse than an ensemble of decision trees, and I think that random forest is the best algorithm for this task, but a boosted tree model (XGBoost) could probably perform better.