<h1><center>Red Wine Quality</center></h1>

<center><img src="https://nomansgrace.com/wp-content/uploads/2020/06/176361.jpg"></center>

# **Introduction**
The two datasets are related to red and white variants of the Portuguese "Vinho Verde" wine; this notebook works with the red-wine dataset only. For more details, consult the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones).

## Data

For more information, read [Cortez et al., 2009].

Input variables (based on physicochemical tests):

1. fixed acidity

2. volatile acidity

3. citric acid

4. residual sugar

5. chlorides

6. free sulfur dioxide

7. total sulfur dioxide

8. density

9. pH

10. sulphates

11. alcohol

Output variable (based on sensory data):

12. quality (score between 0 and 10)

## Content

1. [Import data and python packages](#t1.)
    * Import packages
    * Import data
    * Data shape and info
2. [Data visualization](#t2.)
    * KDE plots
    * Box plots
    * Heatmap (correlation)
3. [Classification](#t3.)

    3.1 [Split data for train and test](#t3.1)
    
    3.2 [Functions for models](#t3.2)
    
    3.3 [Models](#t3.3)
    
4. [Cross Validation](#t4.)

<a id="t1."></a>
# 1. Import data and python packages

In [None]:
pip install mglearn

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import mglearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Load the red-wine CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Preview the last rows of the table.
df.tail()

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Binarise the sensory score: quality <= 5 -> "bad", quality > 5 -> "good".
# The bin edges span the whole documented 0-10 scale (include_lowest keeps a
# score of 0), so no valid score falls outside the bins and turns into NaN.
df["quality cut"] = pd.cut(
    df["quality"], [0, 5, 10], labels=["bad", "good"], include_lowest=True
)

<a id="t2."></a>
# 2. Data visualization

In [None]:
def KdeAndBox(at1, at2):
    """Plot KDE and box plots of two features, split by quality class.

    Draws a 2x2 grid on a new 14x9 figure. The top row holds one density
    plot per feature with the "bad" and "good" wines overlaid; the bottom
    row holds the matching box plots grouped by ``quality cut``.

    Reads the module-level ``df``; its ``quality cut`` column must hold the
    labels "bad" and "good" when this is called.

    Args:
        at1: Column name plotted in the left-hand column of the grid.
        at2: Column name plotted in the right-hand column of the grid.
    """
    plt.figure(figsize=(14, 9))
    for position, attribute in enumerate((at1, at2), start=1):
        # Top row: overlaid densities, drawn in the order the legend lists them.
        plt.subplot(2, 2, position)
        for quality_label in ("bad", "good"):
            # Single .loc[rows, column] lookup instead of chained indexing;
            # `fill` is the current name of the deprecated `shade` option.
            sns.kdeplot(
                df.loc[df["quality cut"] == quality_label, attribute],
                fill=True,
            )
        plt.legend(["bad", "good"])
        plt.title(attribute.upper(), fontsize=15)

        # Bottom row: box plot of the same feature, directly below its KDE.
        plt.subplot(2, 2, position + 2)
        sns.boxplot(data=df, y=attribute, x="quality cut")
    plt.show()

In [None]:
# Fixed vs. volatile acidity, compared between bad and good wines.
KdeAndBox("fixed acidity","volatile acidity")

In [None]:
# Citric acid and alcohol, compared between bad and good wines.
KdeAndBox("citric acid","alcohol")

In [None]:
# Chlorides and density, compared between bad and good wines.
KdeAndBox("chlorides","density")

In [None]:
# Total and free sulfur dioxide, compared between bad and good wines.
KdeAndBox("total sulfur dioxide","free sulfur dioxide")

In [None]:
# pH and sulphates, compared between bad and good wines.
KdeAndBox("pH","sulphates")

In [None]:
# Residual sugar is the one feature left over after the paired plots, so it
# gets its own single-row figure: KDE on the left, box plot on the right.
plt.figure(figsize=(14, 4.5))
plt.subplot(1, 2, 1)
# Single .loc[rows, column] lookups instead of chained indexing; `fill` is
# the current name of seaborn's deprecated `shade` option.
sns.kdeplot(df.loc[df["quality cut"] == "bad", "residual sugar"], fill=True)
sns.kdeplot(df.loc[df["quality cut"] == "good", "residual sugar"], fill=True)

# Legend entries follow the drawing order above: bad first, then good.
plt.legend(["bad", "good"])
plt.title("residual sugar".upper(), fontsize=15)
plt.subplot(1, 2, 2)
sns.boxplot(data=df, y="residual sugar", x="quality cut")
plt.show()

In [None]:
# Independent copy of the four features (plus the raw quality score) shown in
# the parallel-coordinates view; column order fixes the axis order.
df_pc = df[
    [
        "fixed acidity",
        "residual sugar",
        "alcohol",
        "free sulfur dioxide",
        "quality",
    ]
].copy()

# Lines are coloured by quality on a diverging scale centred at 5.5.
fig = px.parallel_coordinates(
    df_pc,
    color="quality",
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=5.5,
)
fig.show()

In [None]:
# One panel per feature column, showing its mean for each "quality cut"
# class. The score and label columns are skipped.
label_columns = ("quality", "quality cut")
panel = 0
for column in df.columns:
    if column in label_columns:
        continue
    panel += 1
    plt.subplot(6, 2, panel)
    class_means = df.groupby(["quality cut"])[column].mean()
    class_means.plot(figsize=(15, 15), lw=2.5, ls="--", marker="s", color="teal")
    plt.title("Avg of {}".format(column), fontsize="13")
    plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation heatmap, annotated as whole percentages.
# Only numeric columns are correlated: "quality cut" holds string categories,
# and pandas >= 2.0 raises on a non-numeric column instead of dropping it.
plt.figure(figsize=(14, 8))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, fmt=".0%")
plt.show()

<a id="t3."></a>
# 3. Classification

In [None]:
# Draw mglearn's built-in grid-search overview figure. It takes no data from
# this notebook (presumably a purely illustrative diagram).
mglearn.plots.plot_grid_search_overview()
plt.show()

<a id="t3.1"></a>
## 3.1 Split data for train and test

In [None]:
# Encode the target as integers: bad -> 0, good -> 1.
# The result is assigned back to the column. Calling an inplace method on the
# selected column is chained assignment, which does not update df when pandas
# Copy-on-Write is active. Missing or unmapped labels make astype(int) raise
# here instead of reaching the models as NaN.
df["quality cut"] = df["quality cut"].map({"bad": 0, "good": 1}).astype(int)

In [None]:
# Target: the binary quality label. Features: every column except the raw
# score and the label derived from it.
y = df["quality cut"]
X = df.drop(columns=["quality", "quality cut"])

In [None]:
# Hold out 35% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first training labels.
y_train.head()

<a id="t3.2"></a>
## 3.2 Functions for models

In [None]:
from sklearn.model_selection import cross_val_score


def CrossValidationScore(model_list, features=None, target=None, cv=5):
    """Tabulate the mean cross-validation score of several estimators.

    Each estimator is scored with ``cross_val_score`` and the mean of its
    fold scores is reported next to the estimator's class name.

    Args:
        model_list: Iterable of scikit-learn compatible estimators.
        features: Feature matrix to score on. Defaults to the module-level
            ``X`` so existing ``CrossValidationScore(model_list)`` calls
            keep working.
        target: Labels matching ``features``. Defaults to the module-level
            ``y``.
        cv: Value forwarded to ``cross_val_score`` as ``cv`` (default 5).

    Returns:
        A pandas ``Styler`` over a frame with the columns "Model Name" and
        "CVS", shaded with the "Greens" colour map.
    """
    # Fall back to the notebook-wide data set when none is passed in.
    if features is None:
        features = X
    if target is None:
        target = y

    # One pass per estimator: name and mean score are collected together so
    # the two columns cannot drift out of step.
    rows = [
        {
            "Model Name": type(model).__name__,
            "CVS": cross_val_score(model, features, target, cv=cv).mean(),
        }
        for model in model_list
    ]

    # Explicit columns keep the table shape even for an empty model list.
    cvs = pd.DataFrame(rows, columns=["Model Name", "CVS"])
    return cvs.style.background_gradient("Greens")

<a id="t3.3"></a>
## 3.3 Models

In [None]:
from yellowbrick.classifier import ROCAUC,ConfusionMatrix

In [None]:
# Display names passed to the visualizers, listed in the same order as the
# integer encoding of the target (bad -> 0, good -> 1).
classes = ["bad","good"]

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with a raised iteration cap, fitted on the training split.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train,y_train)

# ROC/AUC visualizer: fit on the training split, score on the held-out split,
# then render.
visualizer = ROCAUC(lr, classes=classes)
visualizer.fit(X_train, y_train)      
visualizer.score(X_test, y_test)        
visualizer.show();

# Open a small 3x3-inch figure before building the confusion matrix
# (presumably picked up by the visualizer as the current figure).
plt.figure(figsize=(3,3))
cm = ConfusionMatrix(lr, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
cm.show();

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the training split.
gnb = GaussianNB()
gnb.fit(X_train,y_train)

# ROC/AUC visualizer: fit on the training split, score on the held-out split,
# then render.
visualizer = ROCAUC(gnb, classes=classes)
visualizer.fit(X_train, y_train)      
visualizer.score(X_test, y_test)        
visualizer.show();

# Open a small 3x3-inch figure before building the confusion matrix
# (presumably picked up by the visualizer as the current figure).
plt.figure(figsize=(3,3))
cm = ConfusionMatrix(gnb, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
cm.show();

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers (64, 128, 64 units).
# random_state is fixed, matching the seeded train/test split, so the weight
# initialisation and batch sampling give the same scores on every run.
mlp = MLPClassifier(
    hidden_layer_sizes=(64,128,64),
    max_iter=10000,
    solver="adam",
    activation="relu",
    random_state=42,
)
mlp.fit(X_train,y_train)

# ROC/AUC visualizer: fit on the training split, score on the held-out split,
# then render.
visualizer = ROCAUC(mlp, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

# Open a small 3x3-inch figure before building the confusion matrix
# (presumably picked up by the visualizer as the current figure).
plt.figure(figsize=(3,3))
cm = ConfusionMatrix(mlp, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
cm.show();

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 300 trees. random_state is fixed, matching the seeded
# train/test split, so bootstrapping and feature sampling give the same
# scores on every run.
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train,y_train)

# ROC/AUC visualizer: fit on the training split, score on the held-out split,
# then render.
visualizer = ROCAUC(rfc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

# Open a small 3x3-inch figure before building the confusion matrix
# (presumably picked up by the visualizer as the current figure).
plt.figure(figsize=(3,3))
cm = ConfusionMatrix(rfc, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
cm.show();

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with XGBoost defaults, fitted on the training split.
xgb = XGBClassifier()
xgb.fit(X_train,y_train)

# ROC/AUC visualizer: fit on the training split, score on the held-out split,
# then render.
visualizer = ROCAUC(xgb, classes=classes)
visualizer.fit(X_train, y_train)      
visualizer.score(X_test, y_test)        
visualizer.show();

# Open a small 3x3-inch figure before building the confusion matrix
# (presumably picked up by the visualizer as the current figure).
plt.figure(figsize=(3,3))
cm = ConfusionMatrix(xgb, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
cm.show();

<a id="t4."></a>
# 4. Cross Validation

In [None]:
# Draw mglearn's built-in cross-validation figure. It takes no data from this
# notebook (presumably a purely illustrative diagram).
mglearn.plots.plot_cross_validation();
plt.show()

In [None]:
# The five classifiers built in section 3.3, in the order they are tabulated.
model_list=[lr,gnb,mlp,rfc,xgb]

In [None]:
# Show the mean cross-validation score of every model as a shaded table.
CrossValidationScore(model_list)