# Predict wine quality using a regression approach

## Table of Contents
* [Import and Cleaning](#1)
* [Target](#2)
* [Explore Features](#3)
* [Target vs Features](#4)
* [3D Visualization using PCA](#5)
* [Visualization using Parallel Plot](#6)
* [Fit Model](#7)
* [Evaluate Model](#8)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# ML tools
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

<a id='1'></a>
# Import and Cleaning

In [None]:
# load the semicolon-delimited wine data and show the first rows
df = pd.read_csv(
    '../input/cusersmarildownloadswinecsv/wine.csv',
    sep=';',
)
df.head()

In [None]:
# structure of data frame: print the summary that DataFrame.info() produces
df.info()

### Data cleaning

In [None]:
# alcohol has some messy decimal numbers: list every distinct value
# (value_counts orders them by how often they occur)
alcohol_levels = df['alcohol'].value_counts().index
print(alcohol_levels.to_list())

In [None]:
# Repair the alcohol column: a handful of entries contain several dots
# (e.g. '1.096.666.667') and have to be rewritten before numeric conversion.

# malformed entry -> intended decimal string
clean_dict = {
    '1.096.666.667': '10.96666667',
    '9.533.333.333': '9.533333333',
    '1.093.333.333': '10.93333333',
    '1.106.666.667': '11.06666667',
    '1.053.333.333': '10.53333333',
    '1.046.666.667': '10.46666667',
    '1.136.666.667': '11.36666667',
    '1.133.333.333': '11.33333333',
    '1.143.333.333': '11.43333333',
    '9.633.333.333': '9.633333333',
    '9.733.333.333': '9.733333333',
    '1.163.333.333': '11.63333333',
    '1.206.666.667': '12.06666667',
    '1.289.333.333': '12.89333333',
}

# step 1: substitute the malformed entries
df['alcohol'] = df['alcohol'].replace(clean_dict)
# step 2: convert the whole column to a numeric dtype
df['alcohol'] = pd.to_numeric(df['alcohol'])

<a id='2'></a>
# Target

In [None]:
# target: frequency of each quality level (most frequent first)
quality_counts = df.quality.value_counts()
print(quality_counts)

# plot with the bars ordered by quality level instead of by frequency,
# so that the shape of the target distribution is actually visible
quality_counts.sort_index().plot(kind='bar')
plt.title('Target "quality"')
plt.grid()
plt.show()

#### In principle, this is a multiclass classification problem. However, since the distribution of the target has a roughly Gaussian shape, we try a regression approach in the following.

<a id='3'></a>
# Explore Features

In [None]:
# names of the numerical feature columns used throughout the notebook
features_num = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

In [None]:
# basic stats: descriptive statistics for the numerical feature columns
df[features_num].describe(include='all')

In [None]:
# pairwise scatter plot and histograms [this takes a few minutes!!!]
# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments
t1 = time.perf_counter()
sns.pairplot(df[features_num],kind='reg', 
             plot_kws={'line_kws':{'color':'magenta'},
                       'scatter_kws': {'alpha': 0.1}})
plt.show()
t2 = time.perf_counter()
print('Elapsed time:', np.round(t2-t1,2))

In [None]:
# correlation matrices of the numerical features
corr_pearson = df[features_num].corr(method='pearson')
corr_spearman = df[features_num].corr(method='spearman')

# one annotated heatmap per method, both on the fixed colour range [-1, +1]
for corr_title, corr_matrix in (('Pearson Correlation', corr_pearson),
                                ('Spearman Correlation', corr_spearman)):
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(corr_title)
    plt.show()

<a id='4'></a>
# Target vs Features

In [None]:
# plot target vs features for all features
for f in features_num:
    plt.figure(figsize=(10,5))
    plt.scatter(df[f], df.quality, alpha=0.15)
    # correlation of this feature with the target, shown in the title
    corr_target = np.round(df[f].corr(df.quality),4)
    my_title = f'Target vs {f}, corr={corr_target}'
    plt.title(my_title)
    plt.grid()
    # render each figure as soon as it is complete (same pattern as the
    # violin-plot loop) instead of leaving all figures open until the end
    plt.show()

### Alternative visualization - Plot feature distribution by target level

In [None]:
# one horizontal violin plot per feature: feature values split by quality level
for feature_name in features_num:
    plt.figure(figsize=(10, 5))
    sns.violinplot(x=feature_name, y='quality', data=df, orient='h')
    plt.title(feature_name)
    plt.grid()
    plt.show()

<a id='5'></a>
# 3D Visualization using PCA

In [None]:
# reduce the numerical features to three principal components
df4pca = df[features_num]

# standardize the features before running PCA
scaler = StandardScaler()
df4pca_std = scaler.fit_transform(df4pca)

# fit a 3-component PCA and project the standardized data
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)

# attach the components to the original data frame as pc_1 .. pc_3
for comp_idx, comp_name in enumerate(['pc_1', 'pc_2', 'pc_3']):
    df[comp_name] = pc[:, comp_idx]

# show extended data frame
df.head()

In [None]:
# interactive 3D scatter plot of the principal components, coloured by quality
df['quality_cat'] = df['quality'].astype('category')  # categorical version of quality for the plot
df['size'] = 1  # same marker size for every point
fig = px.scatter_3d(
    df,
    x='pc_1', y='pc_2', z='pc_3',
    color='quality_cat',
    size='size', size_max=10,
    opacity=0.5,
)
fig.update_layout(title='PCA 3D')
fig.show()

### Well, it does not seem easy to separate the classes here...

<a id='6'></a>
# Visualization using Parallel Plot

In [None]:
# parallel coordinates over every numerical feature plus the target
parallel_cols = features_num + ['quality']
fig = px.parallel_coordinates(
    df[parallel_cols],
    color='quality',
    title='Parallel Plot (all features)',
)
fig.show()

In [None]:
# parallel coordinates restricted to a hand-picked subset of features
subset_cols = ['alcohol', 'volatile_acidity', 'free_sulfur_dioxide',
               'chlorides', 'quality']
fig = px.parallel_coordinates(
    df[subset_cols],
    color='quality',
    title='Parallel Plot (feature subset)',
)
fig.show()

In [None]:
# parallel coordinates over the three principal components and the target
pca_cols = ['pc_1', 'pc_2', 'pc_3', 'quality']
fig = px.parallel_coordinates(
    df[pca_cols],
    color='quality',
    title='Parallel Plot (PCA features only)',
)
fig.show()

<a id='7'></a>
# Fit Model

In [None]:
# column the model has to predict
target = 'quality'

# model inputs: all numerical features
predictors = features_num
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# start H2O with explicit resource limits for memory and threads
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frame in H2O environment
df_hex = h2o.H2OFrame(df)

# train / test split: roughly train_perc of the rows go to the training
# frame, the remainder to the test frame (fixed seed for reproducibility)
train_perc = 0.7
train_hex, test_hex = df_hex.split_frame(ratios=[train_perc], seed=999)

In [None]:
# Gradient Boosting model, cross-validated with n_cv folds
n_cv = 5
gbm_params = dict(
    ntrees=50,
    max_depth=6,
    min_rows=5,
    sample_rate=1,
    col_sample_rate=0.5,
    nfolds=n_cv,
    seed=999,
)
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

In [None]:
# train model on the training frame and report how long it took
# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments
t1 = time.perf_counter()
fit_1.train(x=predictors,
            y=target,
            training_frame=train_hex)
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics summary of the trained model fit_1
fit_1.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# fetch the cross-validation models once instead of once per fold
cv_models = fit_1.cross_validation_models()
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() is the current name; score_history() is its
    # deprecated alias
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [RMSE]'
    # training RMSE per number of trees
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_rmse, 
                c='blue', label='training')
    # validation RMSE per number of trees (same axes, for comparison)
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_rmse, 
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

<a id='8'></a>
# Evaluate Model

### Variable Importance

In [None]:
# basic version: variable importance plot of the trained model
fit_1.varimp_plot()

In [None]:
# variable importance using SHAP => see direction as well as severity of feature impact
# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments
t1 = time.perf_counter()
fit_1.shap_summary_plot(train_hex)
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

### Evaluate on training data

In [None]:
# score the training frame with the fitted model
pred_train = fit_1.predict(train_hex)

# pull both H2O frames into pandas, then extract the relevant columns
train_pdf = train_hex.as_data_frame()
pred_train_pdf = pred_train.as_data_frame()
y_train_act = train_pdf[target].values  # actuals
y_train_pred = pred_train_pdf['predict'].values  # predictions

# actuals and numeric predictions side by side
df_train_eval = pd.DataFrame({
    'Actual': y_train_act,
    'PredNum': y_train_pred,
})

In [None]:
# joint plot of numeric predictions against actuals (training data)
p = sns.jointplot(
    x='Actual',
    y='PredNum',
    data=df_train_eval,
    joint_kws=dict(alpha=0.15),
)
p.fig.suptitle('Prediction vs Actual - Training Data')
# axis labels are set through pyplot, i.e. on the currently active axes
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

#### Regression => Classification:

In [None]:
# turn the continuous regression output into discrete classes by rounding
# to the nearest integer
y_train_pred_class = y_train_pred.round().astype(int)
# keep the class prediction next to the numeric one
df_train_eval['PredClass'] = y_train_pred_class
df_train_eval.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
conf_train = pd.crosstab(index=df_train_eval['Actual'],
                         columns=df_train_eval['PredClass'])
# heatmap with the integer counts written into the cells
sns.heatmap(
    conf_train,
    annot=True, fmt='d',
    cmap='Blues', cbar=False,
    linewidths=0.1, linecolor='black',
)
plt.show()

### Evaluate on test set

In [None]:
# score the held-out test frame with the fitted model
pred_test = fit_1.predict(test_hex)

# pull both H2O frames into pandas, then extract the relevant columns
test_pdf = test_hex.as_data_frame()
pred_test_pdf = pred_test.as_data_frame()
y_test_act = test_pdf[target].values  # actual values
y_test_pred = pred_test_pdf['predict'].values  # predictions

# actuals and numeric predictions side by side
df_test_eval = pd.DataFrame({
    'Actual': y_test_act,
    'PredNum': y_test_pred,
})

In [None]:
# joint plot of numeric predictions against actuals (test data)
p = sns.jointplot(
    x='Actual',
    y='PredNum',
    data=df_test_eval,
    joint_kws=dict(alpha=0.15),
)
p.fig.suptitle('Prediction vs Actual - Test Data')
# axis labels are set through pyplot, i.e. on the currently active axes
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

#### Regression => Classification:

In [None]:
# turn the continuous test predictions into discrete classes by rounding
# to the nearest integer
y_test_pred_class = y_test_pred.round().astype(int)
# keep the class prediction next to the numeric one
df_test_eval['PredClass'] = y_test_pred_class
df_test_eval.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
conf_test = pd.crosstab(index=df_test_eval['Actual'],
                        columns=df_test_eval['PredClass'])
# heatmap with the integer counts written into the cells
sns.heatmap(
    conf_test,
    annot=True, fmt='d',
    cmap='Blues', cbar=False,
    linewidths=0.1, linecolor='black',
)
plt.show()