## EDA with Wines dataset 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import math
from random import seed
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestRegressor, BaggingClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
import warnings
import graphviz
import shap
%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# Seed both RNGs: `random.seed` alone does not affect NumPy's global generator,
# which is what scikit-learn falls back to when no `random_state` is passed.
seed(2021)
np.random.seed(2021)
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

It's a DataFrame object with 1599 rows and 12 columns.

In [None]:
# (rows, columns) of the loaded frame.
data.shape

The columns are:

In [None]:
# Column labels of the loaded frame.
data.columns

More information about the column types:

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

The first few rows of the dataset:

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Every column except the raw `quality` score is treated as a predictor.
independent_variables = list(filter(lambda name: name != 'quality', data.columns))

# Pairwise scatter plots for the first five predictors.
first_predictors = independent_variables[:5]
pd.plotting.scatter_matrix(data[first_predictors], figsize=(20, 10), marker='D');

In [None]:
# Pairwise scatter plots for the remaining predictors. The slice starts at 5 so
# that it continues exactly where the previous `[0:5]` plot stopped; starting at
# 6 silently left the predictor at index 5 out of both plots.
pd.plotting.scatter_matrix(data[independent_variables[5:11]], figsize = (20, 10),  marker = 'D');

### Target as a dependent variable

In [None]:
# Share of each quality score, ordered by score.
# The index and value column are named explicitly instead of renaming the
# labels produced by `reset_index()`, because those default labels differ
# between pandas versions ('index'/'quality' vs 'quality'/'proportion').
(
    data['quality']
    .value_counts(normalize=True)
    .rename_axis('y_variable')
    .reset_index(name='percentage_count')
    .sort_values(by='y_variable')
)

#### Which of the variables show the strongest relationship with the target variable?

In [None]:
# Parallel-coordinates view of all predictors, with lines coloured by quality
# and the colour scale centred on a score of 5.
parallel_kwargs = {
    'color': 'quality',
    'dimensions': independent_variables,
    'color_continuous_midpoint': 5,
}
fig = px.parallel_coordinates(data, **parallel_kwargs)
fig.show()

#### Tips from [here!](<https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009>)
<em>"What might be an interesting thing to do, is aside from using regression modelling, is to set an arbitrary cutoff for your dependent variable (wine quality) at e.g. 7 or higher getting classified as 'good/1' and the remainder as 'not good/0'."</em>

In [None]:
# Mean, median and standard deviation of every other column, per quality score.
data.groupby('quality').agg(['mean', 'median', 'std'])

In [None]:
# Binarise the score: 7 or higher counts as a good wine.
is_good = data['quality'] >= 7
data['target'] = np.where(is_good, 'good', 'not good')
# Numeric version of the label: 1 for 'good', 0 for 'not good'.
data['y'] = np.where(data['target'] == 'not good', 0, 1)

# Predictors are all columns that are neither the score nor a label derived from it.
non_predictors = ('quality', 'y', 'target')
independent_variables = [name for name in data.columns if name not in non_predictors]

data['target'].value_counts(normalize=True)

In [None]:
def plot_feature_vs_quality(feature, frame=None):
    """Draw a boxplot and a stripplot of one predictor, side by side.

    The left panel is a boxplot of `feature` grouped by the binary `target`
    label; the right panel is a stripplot of `feature` grouped by the raw
    `quality` score, with its y axis moved to the right-hand side.

    Parameters
    ----------
    feature : str
        Name of the column to plot on the y axis.
    frame : pandas.DataFrame, optional
        Frame holding `feature`, `target` and `quality`. Defaults to the
        notebook-level `data` frame.
    """
    frame = data if frame is None else frame
    fig, (box_ax, strip_ax) = plt.subplots(1, 2, figsize=(14, 4))

    sns.boxplot(x=frame['target'], y=frame[feature], width=0.5, ax=box_ax)
    box_ax.set_title('Boxplot of ' + feature)
    box_ax.yaxis.tick_left()

    sns.stripplot(x=frame['quality'], y=frame[feature], dodge=True, ax=strip_ax)
    strip_ax.set_title('Stripplot of ' + feature + ' depend on quality')
    # Put the right panel's axis on the outside so the two panels do not collide.
    strip_ax.yaxis.set_label_position("right")
    strip_ax.yaxis.tick_right()
    plt.show()


# One figure per predictor, replacing eleven copy-pasted cells that differed
# only in the column name.
for feature_name in independent_variables:
    plot_feature_vs_quality(feature_name)

### Correlations between independent variables

In [None]:
# Pearson correlation matrix of the predictors, drawn as an annotated heatmap.
correlation_mat = data[independent_variables].corr()
fig, ax = plt.subplots(figsize=(15, 8))
heatmap_ax = sns.heatmap(correlation_mat, cmap='coolwarm', annot=True, fmt=".2f", ax=ax)
heatmap_ax.set_title('Pearson correlation between independent variables')
plt.show()

In [None]:
# Flatten the matrix into one value per (variable, variable) pair, sorted ascending.
corr_pairs = correlation_mat.unstack().sort_values(kind="quicksort")
corr_pairs

### The strong correlations (independent variables)

In [None]:
# Keep pairs with |r| > 0.6, excluding each variable paired with itself.
# Self-pairs are identified by their index labels rather than by `r != 1`:
# an exact float comparison can miss a diagonal entry that is not exactly 1.0
# and would also discard two distinct variables that are perfectly correlated.
first_variable = corr_pairs.index.get_level_values(0)
second_variable = corr_pairs.index.get_level_values(1)
corr_pairs[(corr_pairs.abs() > 0.6) & (first_variable != second_variable)]

### Rank correlation (Spearman)

In [None]:
# Spearman rank correlation of each predictor with the raw quality score.
# Only the predictors and `quality` are passed to `corr`: the frame also holds
# the string label `target` (not numeric) and `y`, which is derived from
# `quality` and therefore is not an independent variable. The `quality` row is
# dropped by label instead of by a hard-coded row position.
spearman_correelation = (
    data[independent_variables + ['quality']]
    .corr(method='spearman')['quality']
    .drop('quality')
    .rename_axis('independent variables')
    .reset_index(name='spearman correlation with quality')
)
# Strongest relationships first, regardless of sign.
spearman_correelation.sort_values(by = 'spearman correlation with quality', key = abs, ascending = False)

### Decision Trees

In [None]:
# Feature matrix, string label and numeric label.
X = data.loc[:, independent_variables]
target = data.loc[:, 'target']
y = data.loc[:, 'y']

# Hold out 20% of the rows for testing, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=21)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# `y` is 0/1, so its mean is the share of good wines. The percent format
# avoids the float artefacts of `round(x, n) * 100` (e.g. 13.600000000000001)
# and gives both samples the same precision.
print('Share of good wines in train sample: {:.1%}'.format(y_train.mean()))
print('Share of good wines in test sample: {:.1%}'.format(y_test.mean()))

In [None]:
# Shallow tree used as a readable baseline. `random_state` is fixed so the fit
# is reproducible across runs; the same estimator is later handed to
# GridSearchCV.
tree_example = DecisionTreeClassifier(max_depth = 3, random_state = 2021)
tree_example.fit(X_train, y_train)

In [None]:
# Mean accuracy of the shallow tree on the data it was fitted on.
train_accuracy = tree_example.score(X_train, y_train)
print(f'Acuraccy on train sample for simple decision tree is: {train_accuracy}')

In [None]:
# Predicted probability of the positive class (y == 1) on the test split.
tree_exp_predictions = tree_example.predict_proba(X_test)[:,1]
# The reported number is the area under the ROC curve, not accuracy, so the
# message is labelled accordingly.
print('AUC on test sample for simple decision tree is: {}'.format(roc_auc_score(y_test, tree_exp_predictions)))

In [None]:
# Render the shallow tree with Graphviz.
# `class_names` must follow the ascending order of the class values: `y` is 0
# for 'not good' and 1 for 'good', so 'not good' comes first. The reverse
# order labels every node with the wrong class.
dot_data = export_graphviz(tree_example, out_file=None,
                           feature_names=independent_variables,
                           class_names= np.array(['not good','good']),
                           filled=True)

graph = graphviz.Source(dot_data, format="png") 
graph

### Decision Tree with the best combination of the parameter using grid search method

In [None]:
# Search space for the decision tree: split criterion, depth and node-size limits.
param_dict = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1, 8),
    min_samples_split=range(5, 8),
    min_samples_leaf=range(1, 5),
)

In [None]:
# Exhaustive search over `param_dict` with 5-fold cross-validation,
# starting from the shallow example tree as the estimator.
grid = GridSearchCV(tree_example, param_grid=param_dict, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
best_tree_params = grid.best_params_
print(f'Best parameters from GridSearchCV: {best_tree_params}')

In [None]:
# `best_score_` is the mean cross-validated score of the best parameter set
# (accuracy here, since no `scoring` was passed for the classifier). It is
# not the accuracy on the training sample, so the label says what it is.
print('Mean cross-validated accuracy for the best parameters: {}'.format(grid.best_score_))

In [None]:
# Predicted probability of the positive class (second column) from the tuned tree on the test split.
tree_predictions = grid.best_estimator_.predict_proba(X_test)[:,1]

In [None]:
# Area under the ROC curve for the tuned tree on the test split.
auc = roc_auc_score(y_test, tree_predictions)
print(f'AUC on test sample: {auc:.2f}')

In [None]:
# ROC curve points (false positive rate, true positive rate, thresholds) for the tuned tree's test-set scores.
fpr, tpr, thresholds = roc_curve(y_test, tree_predictions)

In [None]:
def plot_roc_curve(fpr, tpr, kind_of_sample = 'test'):
    """Plot a ROC curve together with the diagonal reference line.

    Parameters
    ----------
    fpr, tpr : array-like
        False and true positive rates, plotted on the x and y axis.
    kind_of_sample : str, default 'test'
        Sample name appended to the plot title.
    """
    axis = plt.gca()
    axis.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axis.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.set_title('Receiver Operating Characteristic (ROC) Curve' + ', sample: ' + kind_of_sample)
    axis.legend()
    plt.show()

In [None]:
# ROC curve of the tuned decision tree (title uses the default sample name 'test').
plot_roc_curve(fpr, tpr)

### How does GridSearchCV work?

Here are configurations in GridSearchCV evaluated using 5-fold cross validation:

In [None]:
# Mean and standard deviation of the validation score for every parameter
# combination that the grid search evaluated.
grid_results = grid.cv_results_
means_score = grid_results['mean_test_score']
stds_score = grid_results['std_test_score']
params = grid_results['params']

for position in range(min(len(means_score), len(stds_score), len(params))):
    mean, stdev, param = means_score[position], stds_score[position], params[position]
    print(f"score: {mean:f} std: ({stdev:f}) with: {param!r}")

### Random Forest and RandomizedSearchCV method

In [None]:
# Base forest handed to the randomized search.
# NOTE(review): this is a regressor fitted on the 0/1 label; its continuous
# predictions are later used as scores for ROC/AUC - confirm that a classifier
# was not intended.
rf_model = RandomForestRegressor(
    random_state=2021,
    min_samples_leaf=5,
    oob_score=True,
)

In [None]:
# Candidate values for the random-forest search.
# Evenly spaced grids, truncated to whole numbers.
number_of_trees = np.linspace(100, 1000, 6).astype(int).tolist()
number_of_levels_tree = np.linspace(5, 15, 8).astype(int).tolist()
max_features = ['sqrt', 'log2']
min_samples_split = [10, 15]

In [None]:
# Parameter distributions sampled by the randomized search.
random_grid = dict(
    n_estimators=number_of_trees,
    max_depth=number_of_levels_tree,
    max_features=max_features,
    min_samples_split=min_samples_split,
)

In [None]:
# Randomized search over `random_grid` with 5-fold cross-validation,
# using all available cores and keeping the training scores.
rf_randomsCV = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=random_grid,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)
rf_randomsCV.fit(X_train, y_train);

In [None]:
# Best parameter combination found by the randomized search.
best_forest_params = rf_randomsCV.best_params_
print(f'Best parameters from RandomizedSearchC: {best_forest_params}')

In [None]:
# Feature importances of the best forest, labelled with the predictor names.
best_forest = rf_randomsCV.best_estimator_
forest_importances = pd.Series(best_forest.feature_importances_, index=independent_variables)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title('Features importance')
fig.tight_layout()
plt.show()

In [None]:
# Predictions of the best forest on the test split, used directly as scores for AUC.
rf_predictions = rf_randomsCV.best_estimator_.predict(X_test)
auc = roc_auc_score(y_test, rf_predictions)
print(f'AUC on test sample: {auc:.2f}')

In [None]:
# ROC curve of the best forest on the test split.
rf_roc_points = roc_curve(y_test, rf_predictions)
fpr, tpr, thresholds = rf_roc_points
plot_roc_curve(fpr, tpr)

### Bagging

I am using the best parameters from the decision tree from a previous model (using GridSearchCV):   
Best parameters from GridSearchCV: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5}

In [None]:
# Tree refitted by GridSearchCV with its best parameters; used as the base learner for bagging.
dt_model = grid.best_estimator_

In [None]:
# Bagging ensemble built on the tuned decision tree.
# The base learner is passed positionally: the keyword for it was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases, while
# the first positional argument works with both old and new versions.
bag_model = BaggingClassifier(dt_model, random_state = 2021, oob_score = True, max_features = 1.0)

# Only the number of trees is tuned.
random_grid_bag = {'n_estimators': number_of_trees}

bag_randomsCV = GridSearchCV(estimator=bag_model, param_grid=random_grid_bag,
                              cv = 5, verbose=2, n_jobs=-1,
                              return_train_score=True)
bag_randomsCV.fit(X_train, y_train);

In [None]:
# Best number of trees found for the bagging ensemble.
best_bag_params = bag_randomsCV.best_params_
print(f'Best parameters from GridSearchCV: {best_bag_params}')

Here we see that the mean accuracy on the training sample is lower than the mean accuracy on the testing sample. That can be a sign of overfitting. We have to remember that the main limitation of the bagging algorithm is that it uses the entire feature space when creating splits in each tree. This matters especially when one dominant feature drives most of the predictions: there is then a risk of ending up with a forest of correlated trees.

In [None]:
# Mean cross-validated score of the best bagging configuration.
bag_randomsCV.best_score_

In [None]:
# Predicted probability of the positive class from the best bagging model on the test split.
bag_probabilities = bag_randomsCV.best_estimator_.predict_proba(X_test)
bag_predictions = bag_probabilities[:, 1]
auc = roc_auc_score(y_test, bag_predictions)
print(f'AUC on test sample: {auc:.2f}')

In [None]:
# ROC curve of the best bagging model on the test split.
bag_roc_points = roc_curve(y_test, bag_predictions)
fpr, tpr, thresholds = bag_roc_points
plot_roc_curve(fpr, tpr)

A second question is why the best value of the `n_estimators` parameter is so low.

In [None]:
# Mean and standard deviation of the validation score for every number of
# trees evaluated by the bagging grid search.
bag_results = bag_randomsCV.cv_results_
bag_means_score = bag_results['mean_test_score']
bag_stds_score = bag_results['std_test_score']
bag_params = bag_results['params']

for position in range(min(len(bag_means_score), len(bag_stds_score), len(bag_params))):
    mean, stdev, param = bag_means_score[position], bag_stds_score[position], bag_params[position]
    print(f"score: {mean:f} std: ({stdev:f}) with: {param!r}")


### ExtraTrees

### Getting balanced data -  attempts of some improvements

### Explaining Model Predictions

#### Explaining Model Predictions - Random Forest Model

In [None]:
# SHAP values of the best random forest on the training data,
# summarised as a bar chart.
rf_explainer = shap.TreeExplainer(rf_randomsCV.best_estimator_)
shap_values = rf_explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
# SHAP summary plot (default plot type) for the random forest on the training data.
shap.summary_plot(shap_values, X_train,title='Random Forest Model')

In [None]:
# SHAP dependence plot for the `alcohol` feature.
shap.dependence_plot('alcohol', shap_values, X_train)

In [None]:
# SHAP dependence plot for the `sulphates` feature.
shap.dependence_plot('sulphates', shap_values, X_train)

In [None]:
# SHAP dependence plot for the `pH` feature.
shap.dependence_plot('pH', shap_values, X_train)