This notebook includes:
- EDA
- Outliers removal
- Correlation
- Feature Engineering
- PCA decomposition
- Comparing baseline models
- Model training and predictions

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used in the next cell can be checked by eye.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

# EDA

In [None]:
# Number of missing entries in each column (all zeros means nothing to impute)
data.isna().sum()

No missing values

In [None]:
# Preview the first few rows to see the column names and value scales.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

This dataset only has numerical data; it doesn't contain any categorical data.

## The target value: quality

In [None]:
from scipy.stats import norm
from scipy import stats
import seaborn as sns

# Style
sns.set_style('white')
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(10, 6))

# Mean and standard deviation of the best-fitting normal distribution.
# Computed before plotting so they can be used to label the fitted curve.
mu, sigma = norm.fit(data['quality'])

# Raw string: "\m" and "\s" are invalid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
normal_label = r"Normal dist. ($\mu=$ {:.4f} and $\sigma=$ {:.4f})".format(mu, sigma)

# Distribution plot: histogram + KDE with the fitted normal curve on top.
# The label is attached to the fitted curve itself through `fit_kws`, so the
# legend entry cannot end up describing a different artist.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider migrating to `sns.histplot` plus an explicit `norm.pdf` line.
sns.distplot(data['quality'], color='b', fit=norm, fit_kws={'label': normal_label}, ax=ax)

# Plot details
ax.legend(loc='best')
ax.grid(False)
# The histogram is density-normalised when a distribution is fitted,
# so the vertical axis shows a density rather than raw counts.
ax.set_ylabel('Density', fontsize=12)
ax.set_xlabel('Quality', fontsize=12)
ax.set_title('Wine Quality Distribution')
sns.despine(trim=True, left=True)


plt.show()

## Skewness and Kurtosis

In [None]:
# Shape statistics of the target. pandas reports *excess* kurtosis
# (Fisher's definition), i.e. a normal distribution scores 0.
quality = data['quality']
print(f"Skewness: {quality.skew():.4f}")
print(f"Kurtosis: {quality.kurt():.4f}")

The skewness of quality is close to zero, meaning the tails on both sides of the mean balance out overall.
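The kurtosis is 0.2967. Note that pandas' `kurt()` returns the *excess* kurtosis (Fisher's definition, where a normal distribution scores 0), so this value must be compared with 0 rather than 3. A small positive excess kurtosis means the distribution is very slightly leptokurtic, i.e. its tails are marginally heavier than those of a normal distribution. Distributions with negative excess kurtosis (kurtosis less than 3 under Pearson's definition) are said to be platykurtic, although this does not imply the distribution is "flat-topped" as is sometimes stated. Rather, it means the distribution produces fewer and less extreme outliers than does the normal distribution.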

## QQ-plot
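- This plot compares the quantiles of the observed `quality` values with the theoretical quantiles of a normal distribution. Points that lie close to the reference line indicate that the data are approximately normally distributed.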

In [None]:
# Probability (Q-Q) plot of quality against a theoretical normal distribution.
# `res` keeps the values returned by probplot in case they are inspected later.
fig = plt.figure()
res = stats.probplot(data['quality'], dist='norm', plot=plt)

plt.show()

## Numerical Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

In [None]:
# One histogram per column, to compare the shape and range of every variable.
data.hist(figsize=(20, 15))

plt.show()

# Check outliers

In [None]:
# Scatter plots of features vs Target value
# The target itself is skipped: quality plotted against quality carries no information.
feature_cols = [col for col in data.columns if col != 'quality']

# Lay the panels out on an explicit grid. Requesting `nrows=0` from
# plt.subplots raises a ValueError on current matplotlib versions.
n_cols = 3
n_rows = max(1, (len(feature_cols) + n_cols - 1) // n_cols)  # ceiling division
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows), squeeze=False)
panels = axs.ravel()

for ax, feature in zip(panels, feature_cols):
    sns.scatterplot(x=feature, y='quality', hue='quality', data=data, ax=ax)
    ax.set_xlabel('{}'.format(feature))
    ax.set_ylabel('Quality')
    ax.legend(loc='best', prop={'size': 10})
    ax.tick_params(axis='both', labelsize=12)

# Hide the grid cells that have no feature assigned to them.
for ax in panels[len(feature_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

General observations: the lower the volatile acidity, the higher the quality; the higher the alcohol, the higher the quality.
Details have to be confirmed with correlation studies.

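Outliers: There are many outliers in each feature. We will remove them with the standard deviation method (values more than 3 standard deviations from the mean) in the Feature Engineering section below.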

# Correlation Matrix

In [None]:
# Pairwise correlations between all columns
corr_matrix = data.corr()

# Correlation of every column with the target, strongest positive first
quality_corr = corr_matrix['quality']
quality_corr.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the correlation matrix; the colour scale is capped at 0.8
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, vmax=0.8, annot=True, ax=ax)

- Features with positive correlations: alcohol (0.4762) >> sulphates (0.2514) > citric acid (0.2264) >> other features
- Features with negative correlations: volatile acidity (-0.3906) << total sulfur dioxide (-0.1851) < density (-0.1749) << other features 


# Baseline Models 1
- With NO Feature Engineering
- Simple Linear Regression

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# Keep an untouched copy of the raw data for the baseline experiments
data_base = data.copy()

# All eleven physico-chemical columns as features; log1p-transformed quality as target
X = data_base.drop(columns='quality').to_numpy()
y = np.log1p(data_base['quality'].to_numpy())

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 5-fold cross-validation splitter passed as `cv` to the model comparisons;
# shuffling with a fixed seed keeps the folds reproducible.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

def display_scores(scores):
    """Print a set of cross-validation scores with their mean and spread.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        Per-fold scores, e.g. a NumPy array.

    Prints the raw scores, the mean and the standard deviation (both to four
    decimals), followed by a blank separator. Returns nothing.
    """
    print("Scores: ", scores)
    print(f"Mean of scores: {scores.mean():.4f}")
    print(f"Standard Deviation of scores: {scores.std():.4f}")
    print("\n")

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Linear family
lin_reg = LinearRegression()
ridge_reg = Ridge(random_state=42)
lasso_reg = Lasso(random_state=42)
elasticnet_reg = ElasticNet(random_state=42)

# Tree-based and boosted models
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)
lgmb_reg = LGBMRegressor(random_state=42)

# Candidates to compare, all with default hyper-parameters and a fixed seed
regressors = [
    lin_reg,
    ridge_reg,
    lasso_reg,
    elasticnet_reg,
    tree_reg,
    forest_reg,
    gb_reg,
    xgb_reg,
    lgmb_reg,
]

In [None]:
# Cross-validate every candidate on the training split and collect, per model,
# the mean and standard deviation of the per-fold RMSE plus the time taken.
basemodels_performance = []
for model in regressors:
    started_at = datetime.now()
    neg_mse = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    rmse_per_fold = np.sqrt(-neg_mse)  # scorer returns negated MSE
    elapsed = datetime.now() - started_at
    basemodels_performance.append({
        'model': type(model).__name__,
        'mean_score': round(rmse_per_fold.mean(), 4),
        'standard_deviation': round(rmse_per_fold.std(), 4),
        'time_used': elapsed.total_seconds(),
    })

df_basemodels_performance = pd.DataFrame(basemodels_performance)
df_basemodels_performance

# Baseline Models 2
- With NO Feature Engineering
- Simple Linear Regression
- Only select highly correlated features: alcohol, sulphates, citric acid, volatile acidity


In [None]:
# Restrict the features to the four columns most correlated with quality
selected_features = ['alcohol', 'sulphates', 'citric acid', 'volatile acidity']
X = data_base[selected_features].to_numpy()
y = np.log1p(data_base['quality'].to_numpy())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fresh, default-configured candidates for the reduced feature set
lin_reg = LinearRegression()
ridge_reg = Ridge(random_state=42)
lasso_reg = Lasso(random_state=42)
elasticnet_reg = ElasticNet(random_state=42)
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)
lgmb_reg = LGBMRegressor(random_state=42)

regressors = [
    lin_reg, ridge_reg, lasso_reg, elasticnet_reg,
    tree_reg, forest_reg, gb_reg, xgb_reg, lgmb_reg,
]

# Per model: mean / std of the per-fold RMSE and the time the cross-validation took
basemodels_performance = []
for model in regressors:
    started_at = datetime.now()
    neg_mse = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    rmse_per_fold = np.sqrt(-neg_mse)  # scorer returns negated MSE
    elapsed = datetime.now() - started_at
    basemodels_performance.append({
        'model': type(model).__name__,
        'mean_score': round(rmse_per_fold.mean(), 4),
        'standard_deviation': round(rmse_per_fold.std(), 4),
        'time_used': elapsed.total_seconds(),
    })

df_basemodels_performance = pd.DataFrame(basemodels_performance)
df_basemodels_performance

Baseline Models 1 have better performance

# Feature Engineering

## Remove outliers by Standard Deviation Method
- First, calculate the mean and standard deviation of data field
- Second, remove outliers that are lower than Mean - 3 * S/D or greater than Mean + 3 * S/D

In [None]:
# Names of all predictor columns (everything except the target)
features = data.columns[data.columns != 'quality'].tolist()

In [None]:
# Start from the untouched copy of the raw data every time this cell runs.
# Filtering `data` in place made the cell non-idempotent: a second run would
# recompute the bounds on the already-trimmed frame and drop even more rows.
data = data_base.copy()

print("Number of instances before removing outliers: ", len(data), "\n")

# Features are processed one after another, so the bounds of each feature are
# computed on the rows that survived the previous features' filtering.
for feature in features:
    feature_values = data[feature].to_numpy()
    mean = feature_values.mean()
    std = feature_values.std()
    lowerbound, upperbound = mean - 3*std, mean + 3*std

    # Boolean mask of the rows lying outside mean +/- 3 standard deviations
    is_outlier = (feature_values < lowerbound) | (feature_values > upperbound)
    outliers = feature_values[is_outlier].tolist()

    print("Feature: %s" % feature)
    print("Mean: {:.4f}".format(mean), "\tS.D: {:.4f}".format(std))
    print("Lower bound: {:.4f}".format(lowerbound), "\tUpper bound: {:.4f}".format(upperbound))
    print("Outliers: ", outliers)
    print("Remove {} outliers".format(len(outliers)))

    data = data[~is_outlier]
    print("Instances left: {}".format(len(data)))
    print("\n")

## Plot scatters again to validate the results after outliers removal

In [None]:
# Scatter plots of features vs Target value
# The target itself is skipped: quality plotted against quality carries no information.
feature_cols = [col for col in data.columns if col != 'quality']

# Lay the panels out on an explicit grid. Requesting `nrows=0` from
# plt.subplots raises a ValueError on current matplotlib versions.
n_cols = 3
n_rows = max(1, (len(feature_cols) + n_cols - 1) // n_cols)  # ceiling division
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows), squeeze=False)
panels = axs.ravel()

for ax, feature in zip(panels, feature_cols):
    sns.scatterplot(x=feature, y='quality', hue='quality', palette='Blues', data=data, ax=ax)
    ax.set_xlabel('{}'.format(feature))
    ax.set_ylabel('Quality')
    ax.legend(loc='best', prop={'size': 10})
    ax.tick_params(axis='both', labelsize=12)

# Hide the grid cells that have no feature assigned to them.
for ax in panels[len(feature_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

The data looks much better after removing outliers

# Normalization
- Using sklearn StandardScaler

In [None]:
# Features and log1p-transformed target taken from the outlier-filtered data
X = data.drop(columns='quality').to_numpy()
y = np.log1p(data['quality'].to_numpy())

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split, so the scaling statistics also see rows that are later
# held out for testing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X=X)


In [None]:
# Split the unscaled features first, then learn the scaling statistics from the
# training rows only. Fitting the scaler on the full matrix before splitting
# leaks information about the held-out rows into the training data.
# The same seed and test size give the same row partition as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fresh, default-configured candidates for the standardised features
lin_reg = LinearRegression()
ridge_reg = Ridge(random_state=42)
lasso_reg = Lasso(random_state=42)
elasticnet_reg = ElasticNet(random_state=42)
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)
lgmb_reg = LGBMRegressor(random_state=42)

regressors = [
    lin_reg, ridge_reg, lasso_reg, elasticnet_reg,
    tree_reg, forest_reg, gb_reg, xgb_reg, lgmb_reg,
]

# Per model: mean / std of the per-fold RMSE and the time the cross-validation took
models_performance = []
for model in regressors:
    started_at = datetime.now()
    neg_mse = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    rmse_per_fold = np.sqrt(-neg_mse)  # scorer returns negated MSE
    elapsed = datetime.now() - started_at
    models_performance.append({
        'model': type(model).__name__,
        'mean_score': round(rmse_per_fold.mean(), 4),
        'standard_deviation': round(rmse_per_fold.std(), 4),
        'time_used': elapsed.total_seconds(),
    })

df_models_performance = pd.DataFrame(models_performance)
df_models_performance

# PCA for dimensionality reduction
- Find the smallest number of dimensions (d) to reduce down to such that the reduced data still explains at least 95% of the original data's variance
- Then set PCA(n_components=d)

In [None]:
# Features and log1p-transformed target taken from the outlier-filtered data
X = data.drop(columns='quality').to_numpy()
y = np.log1p(data['quality'].to_numpy())

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA and accumulate the explained-variance ratios component by component.
# NOTE(review): X is the unscaled feature matrix here, so columns with a large
# numeric range dominate the components - confirm whether scaled data was intended.
pca = PCA().fit(X)
cumsum = pca.explained_variance_ratio_.cumsum()

# First position where the cumulative ratio reaches 95%, converted to a component count
d = np.argmax(cumsum >= 0.95) + 1

print("Original dataset dimensions: {}".format(X.shape[1]))
print("The optimal dimensions to reduce down to: {}".format(d))

In [None]:
# Reduce: refit PCA keeping only the first d components and project X onto them
pca = PCA(n_components=d)
X_reduced = pca.fit_transform(X)

In [None]:
# 80/20 hold-out split of the PCA-reduced features, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size = 0.2, random_state = 42)

# Models with PCA

In [None]:
# Fresh, default-configured candidates for the PCA-reduced features
lin_reg = LinearRegression()
ridge_reg = Ridge(random_state=42)
lasso_reg = Lasso(random_state=42)
elasticnet_reg = ElasticNet(random_state=42)
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)
lgmb_reg = LGBMRegressor(random_state=42)

regressors = [
    lin_reg, ridge_reg, lasso_reg, elasticnet_reg,
    tree_reg, forest_reg, gb_reg, xgb_reg, lgmb_reg,
]

# Per model: mean / std of the per-fold RMSE and the time the cross-validation took
models_performance = []
for model in regressors:
    started_at = datetime.now()
    neg_mse = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    rmse_per_fold = np.sqrt(-neg_mse)  # scorer returns negated MSE
    elapsed = datetime.now() - started_at
    models_performance.append({
        'model': type(model).__name__,
        'mean_score': round(rmse_per_fold.mean(), 4),
        'standard_deviation': round(rmse_per_fold.std(), 4),
        'time_used': elapsed.total_seconds(),
    })

df_models_performance = pd.DataFrame(models_performance)
df_models_performance

# Models without PCA (Better than PCA version)

In [None]:
# Outlier-filtered data, all original features, no scaling and no PCA
X = data.drop(columns='quality').to_numpy()
y = np.log1p(data['quality'].to_numpy())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fresh, default-configured candidates for the outlier-filtered, non-reduced features
lin_reg = LinearRegression()
ridge_reg = Ridge(random_state=42)
lasso_reg = Lasso(random_state=42)
elasticnet_reg = ElasticNet(random_state=42)
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)
lgmb_reg = LGBMRegressor(random_state=42)

regressors = [
    lin_reg, ridge_reg, lasso_reg, elasticnet_reg,
    tree_reg, forest_reg, gb_reg, xgb_reg, lgmb_reg,
]

# Per model: mean / std of the per-fold RMSE and the time the cross-validation took
models_performance = []
for model in regressors:
    started_at = datetime.now()
    neg_mse = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    rmse_per_fold = np.sqrt(-neg_mse)  # scorer returns negated MSE
    elapsed = datetime.now() - started_at
    models_performance.append({
        'model': type(model).__name__,
        'mean_score': round(rmse_per_fold.mean(), 4),
        'standard_deviation': round(rmse_per_fold.std(), 4),
        'time_used': elapsed.total_seconds(),
    })

df_models_performance = pd.DataFrame(models_performance)
df_models_performance

# Evaluation
- Baseline Models 1: Random Forest Regressor (without any feature engineering) has the best performance, with the lowest cross-validated RMSE at 0.0936 (measured on the log1p-transformed quality)

# Model Training

In [None]:
# Train the selected model on the raw (baseline) data: all features, log1p target
X = data_base.drop(columns='quality').to_numpy()
y = np.log1p(data_base['quality'].to_numpy())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Default random forest with a fixed seed, fitted on the training split only
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)

# Predictions on test set

In [None]:
# Predict the held-out rows; the model was trained on log1p(quality),
# so these predictions are still on the log1p scale.
y_pred = forest_reg.predict(X_test)

In [None]:
# Convert back to integers
# expm1 undoes the log1p transform. Both columns are rounded to the nearest
# grade and cast to int, so the true values do not show floating-point
# round-trip noise and the two columns are directly comparable.
predictions = pd.DataFrame({
    'y_test': np.rint(np.expm1(y_test)).astype(int),
    'y_pred': np.rint(np.expm1(y_pred)).astype(int),
})
predictions[:20]

# Conclusions & Future work
- The RMSE results are not really satisfying. The problem may be due to a wrong linearity assumption, such that regression is not the best way to solve this problem
- Future work can be: 
    - Hyperparameter Tuning with Grid Search CV / Randomized Search CV
    - Ensemble modeling
    - Multi-class classification