# Overview 

Input Variables:
- **fixed acidity**: most acids involved with wine are fixed or nonvolatile (they do not evaporate readily)
- **volatile acidity**: the amount of acetic acid in wine
- **citric acid**: found in small quantities, citric acid can add 'freshness' and flavor to wines
- **residual sugar**: the amount of sugar remaining after fermentation stops
- **chlorides**: the amount of salt in the wine
- **free sulfur dioxide**: the free form of $SO_2$ exists in equilibrium between molecular $SO_2$ (as a dissolved gas) and bisulfite ion
- **total sulfur dioxide**: amount of free and bound forms of $SO_2$
- **density**: the density of wine is close to that of water, depending on the percent alcohol and sugar content
- **pH**: describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic)
- **sulphates**: a wine additive which can contribute to sulfur dioxide gas ($SO_2$) levels
- **alcohol**: the percent alcohol content of the wine

Output Variable:
- **quality**: output variable (based on sensory data, score between 0 and 10)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's white-grid look for every figure drawn in this notebook.
sns.set(
    style='whitegrid',
)

In [None]:
# Read the red-wine quality CSV (decoded as cp1252) into a DataFrame and
# preview its first rows.
wine_data = pd.read_csv(
    '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv',
    parse_dates=True,
    encoding="cp1252",
)
wine_data.head()

In [None]:
# Number of missing values in each column.
wine_data.isnull().sum(axis=0)

# Data Visualization

In [None]:
# Bar chart of how many wines received each quality score.
fig = plt.figure(figsize=(10, 6))
sns.countplot(x='quality', data=wine_data)

- **Low Quality**: 3, 4
- **Normal**: 5, 6, 7
- **High Quality**: 8, 9

In [None]:
# Derive a coarse "Quality" label from the numeric score:
#   <= 4 -> "Low Quality", 5..7 -> "Normal", >= 8 -> "High Quality".
score = wine_data['quality']
bucket_masks = {
    "Low Quality": score <= 4,
    "Normal": (score >= 5) & (score <= 7),
    "High Quality": score >= 8,
}
for bucket_label, bucket_mask in bucket_masks.items():
    wine_data.loc[bucket_mask, "Quality"] = bucket_label

# Count of wines per bucket, shown from worst to best.
fig = plt.figure(figsize=(10, 6))
sns.countplot(
    x='Quality',
    data=wine_data,
    order=['Low Quality', 'Normal', 'High Quality'],
)

In [None]:
# Pairwise relationships between the columns, coloured by quality score.
sns.pairplot(
    wine_data,
    hue="quality",
)

In [None]:
# Pairwise correlation between the numeric columns only. The derived "Quality"
# column holds strings, and pandas >= 2.0 raises on non-numeric columns instead
# of silently dropping them, so they are filtered out explicitly.
numeric_data = wine_data.select_dtypes(include="number")
corr = numeric_data.corr(method="pearson")
# corr = numeric_data.corr(method="spearman")
# corr = numeric_data.corr(method="kendall")

f, ax = plt.subplots(figsize=(10, 10))

# An all-False mask hides nothing, so the whole matrix is drawn. The builtin
# `bool` is used because the `np.bool` alias is missing from NumPy 1.24-1.26.
sns.heatmap(
    corr,
    mask=np.zeros_like(corr, dtype=bool),
    cmap="coolwarm",
    square=True,
    ax=ax,
    annot=True,
    fmt=".2f",
)

In [None]:
def ploting(text_col):
    """Draw a 3x2 grid of plots relating one feature to the quality score.

    Panels, row by row:
      1. bar plot and box plot of the feature per quality score;
      2. violin plot with the individual points overlaid, and a histogram
         (with KDE) of the feature itself;
      3. 2-D histogram of quality against the feature, and KDE curves of the
         feature split by quality score.

    Reads the module-level ``wine_data`` DataFrame.

    Args:
        text_col: Name of the ``wine_data`` column to visualise.
    """
    fig, ax = plt.subplots(nrows=3, ncols=2, sharex=False, figsize=(16, 15))
    fig.suptitle(text_col.capitalize())

    sns.barplot(ax=ax[0][0], data=wine_data, x='quality', y=text_col, ci=None)

    sns.boxplot(ax=ax[0][1], data=wine_data, x='quality', y=text_col)

    # Grey violins give the distribution shape; the strip plot on the same
    # axes shows the individual observations.
    sns.violinplot(ax=ax[1][0], data=wine_data, x='quality', y=text_col, inner=None, color=".8")
    sns.stripplot(ax=ax[1][0], data=wine_data, x='quality', y=text_col)

    # Histogram of the requested column. No fixed binwidth is passed: a single
    # width cannot suit features on different scales, and it would override
    # `bins`.
    sns.histplot(ax=ax[1][1], data=wine_data, x=text_col, bins=15, kde=True)

    # Quality is binned as a discrete variable, the feature as a continuous one.
    sns.histplot(ax=ax[2][0], data=wine_data, x="quality", y=text_col, discrete=(True, False), cbar=True)

    sns.kdeplot(ax=ax[2][1], data=wine_data, x=text_col, hue="quality")

    fig.tight_layout()

In [None]:
# Plot grid for the 'fixed acidity' column.
ploting("fixed acidity")

In [None]:
# Plot grid for the 'volatile acidity' column.
ploting("volatile acidity")

In [None]:
# Plot grid for the 'citric acid' column.
ploting("citric acid")

In [None]:
# Plot grid for the 'residual sugar' column.
ploting("residual sugar")

In [None]:
# Plot grid for the 'chlorides' column.
ploting("chlorides")

In [None]:
# Plot grid for the 'free sulfur dioxide' column.
ploting("free sulfur dioxide")

In [None]:
# Plot grid for the 'total sulfur dioxide' column.
ploting("total sulfur dioxide")

In [None]:
# Plot grid for the 'density' column.
ploting("density")

In [None]:
# Plot grid for the 'pH' column.
ploting("pH")

In [None]:
# Plot grid for the 'sulphates' column.
ploting("sulphates")

In [None]:
# Plot grid for the 'alcohol' column.
ploting("alcohol")

# Classification

In [None]:
import warnings
# Suppress every warning from here on to keep the cell output compact.
# NOTE(review): this also hides any warnings raised by the models fitted
# below -- confirm that is intended.
warnings.filterwarnings(action="ignore")

In [None]:
# Predictor columns: the eleven physicochemical measurements.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
X = wine_data[feature_columns]

# Target: the numeric quality score.
y = wine_data['quality']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import KFold

In [None]:
from sklearn.pipeline import make_pipeline

# Candidate classifiers, each paired with the label printed in the report.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Extra Trees", ExtraTreesClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("KNeighbors", KNeighborsClassifier()),
    ("SVM", SVC()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Ada Boost", AdaBoostClassifier()),
]

# 10-fold cross-validated accuracy on the training split.
for name, model in models:
    # Standardise inside the pipeline so the scaler is refit on each training
    # fold: scale-sensitive models (logistic regression, k-NN, SVM) are then
    # compared fairly, and no statistics leak from the validation fold.
    scaled_model = make_pipeline(StandardScaler(), model)
    results = cross_val_score(scaled_model, X_train, y_train.values.ravel(), cv=10, scoring='accuracy')
    print(f"\x1b[94m{name}\x1b[0m: \x1b[95m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")

In [None]:
from sklearn.pipeline import make_pipeline

# Same comparison with plain (unshuffled) 10-fold splits. The splitter does
# not depend on the model, so it is built once and shared by every run.
kfold = KFold(n_splits=10)

for name, model in models:
    # Standardise inside the pipeline so the scaler is refit on each training
    # fold and scale-sensitive models are evaluated on comparable features.
    scaled_model = make_pipeline(StandardScaler(), model)
    results = cross_val_score(scaled_model, X_train, y_train, cv=kfold, scoring='accuracy')
    print(f"\x1b[94m{name}\x1b[0m: \x1b[95m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")

In [None]:
# Fit the Extra Trees model that the rest of the notebook inspects. The seed
# (the same value used for the train/test split) keeps the scores and the
# importance plots below repeatable between runs.
et = ExtraTreesClassifier(random_state=0)
et.fit(X_train, y_train)
et_predict = et.predict(X_test)

# Accuracy on the data the model was fitted on versus the held-out split.
print(f"Train: {et.score(X_train, y_train)*100} - Test: {et.score(X_test, y_test)*100}")

# Mean 10-fold cross-validation score on the training split.
rfc_eval = cross_val_score(estimator=et, X=X_train, y=y_train, cv=10)
print("cross_val_score: ", rfc_eval.mean()*100)

# Accuracy of the held-out predictions computed above.
et_acc_score = accuracy_score(y_test, et_predict)
print("accuracy_score: ", et_acc_score*100)

In [None]:
# Text report of the per-class metrics for the held-out predictions.
cr = classification_report(
    y_test,
    et_predict,
)
print(cr)

In [None]:
# Every label that occurs in the true or predicted test targets, ascending.
# Passing them explicitly lets the heatmap axes show the real quality scores
# rather than positional indices 0..n-1.
class_labels = sorted(set(y_test) | set(et_predict))
cm = confusion_matrix(y_test, et_predict, labels=class_labels)

f, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    square=True,
    linewidths=0.01,
    linecolor='grey',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

In [None]:
# Importances reported by the fitted model, ordered from least to most
# important so the largest bar ends up at the top of the chart.
feature_importance = et.feature_importances_
sorted_idx = feature_importance.argsort()
pos = np.arange(len(sorted_idx)) + 0.5  # bar centres at 0.5, 1.5, ...

fig = plt.figure(figsize=(17, 6))

ranked_importance = feature_importance[sorted_idx]
ranked_names = np.array(X_train.columns)[sorted_idx]
plt.barh(pos, ranked_importance, align='center')
plt.yticks(pos, ranked_names)
plt.title('Feature Importance')

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance on the held-out split: each feature is shuffled 30
# times and the importance is measured with the accuracy scorer.
perm_importance = permutation_importance(
    et,
    X_test,
    y_test,
    n_repeats=30,
    random_state=0,
    scoring='accuracy',
)
sorted_idx = np.argsort(perm_importance.importances_mean)
pos = np.arange(len(sorted_idx)) + 0.5  # bar centres at 0.5, 1.5, ...

# Text listing, most important feature first.
for i in sorted_idx[::-1]:
    print(f"{np.array(X_train.columns)[i]:<8} {perm_importance.importances_mean[i]:.3f} +/- {perm_importance.importances_std[i]:.3f}")


fig = plt.figure(figsize=(17, 6))

# Bar chart in ascending order, so the largest bar is drawn at the top.
ranked_means = perm_importance.importances_mean[sorted_idx]
ranked_names = np.array(X_train.columns)[sorted_idx]
plt.barh(pos, ranked_means, align='center')
plt.yticks(pos, ranked_names)
plt.title('Permutation Importance')

In [None]:
import shap

# Build a tree explainer for the fitted model and compute SHAP values for the
# held-out rows.
explainer = shap.TreeExplainer(et)
shap_values = explainer.shap_values(X_test)

# Bar-style summary plot of the SHAP values.
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
)

In [None]:
# Summary plot for the first entry of shap_values.
# NOTE(review): index 0 presumably selects the first class when SHAP returns
# one array per class -- confirm against the installed SHAP version.
shap.summary_plot(
    shap_values[0],
    X_test,
)

In [None]:
# Dependence plot for 'alcohol', coloured by the 'sulphates' feature.
# NOTE(review): shap_values[0] presumably holds the first class's values --
# confirm against the installed SHAP version.
shap.dependence_plot(
    'alcohol',
    shap_values[0],
    X_test,
    interaction_index='sulphates',
)

In [None]:
# Dependence plot for 'alcohol' with half-transparent, larger dots and the
# "cool" colour map; the interaction feature is left to SHAP's default.
shap.dependence_plot(
    'alcohol',
    shap_values[0],
    X_test,
    alpha=0.5,
    dot_size=50,
    cmap=plt.get_cmap("cool"),
)

In [None]:
# Load SHAP's JavaScript so that force plots can render in the notebook.
shap.initjs()

# Force plot for the first held-out row, using entry 1 of the expected values
# and of the SHAP values.
# NOTE(review): entry 1 presumably corresponds to the second class -- confirm
# against the installed SHAP version.
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][0, :],
    X_test.iloc[0, :],
)

In [None]:
# Model-agnostic explanation of a single held-out row (position 7).
# KernelExplainer evaluates the model against every background row, so a
# 100-row random sample of the training data is used as the background set
# rather than the whole split, which keeps the run time practical.
background = shap.sample(X_train, 100, random_state=0)
k_explainer = shap.KernelExplainer(et.predict_proba, background)
k_shap_values = k_explainer.shap_values(X_test.iloc[7])

# NOTE(review): entry 1 presumably corresponds to the second class -- confirm
# against the installed SHAP version.
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], X_test.iloc[7])

In [None]:
# Force plot over every held-out row, using entry 0 of the expected values
# and of the SHAP values.
# NOTE(review): entry 0 presumably corresponds to the first class -- confirm
# against the installed SHAP version.
shap.force_plot(
    explainer.expected_value[0],
    shap_values[0],
    X_test,
)

In [None]:
# Force plot for the first ten held-out rows, using entry 1 of the expected
# values and of the SHAP values.
# NOTE(review): entry 1 presumably corresponds to the second class -- confirm
# against the installed SHAP version.
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][:10, :],
    X_test.iloc[:10, :],
)