# Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# SKLEARN Modules
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier as knn 
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Setting Graph styles to seaborn
plt.style.use("seaborn")
%matplotlib inline

# OS Module
import os

# If you like the notebook, please do leave an upvote

# Introduction

![](https://assets.entrepreneur.com/content/3x2/2000/20190212131344-wine-541922.jpeg)

*Red wine is a type of wine made from dark-colored grape varieties. The actual color of the wine can range from intense violet, typical of young wines, through to brick red for mature wines and brown for older red wines. The juice from most purple grapes is greenish-white, the red color coming from anthocyan pigments (also called anthocyanins) present in the skin of the grape; exceptions are the relatively uncommon teinturier varieties, which produce a red-colored juice. Much of the red-wine production process therefore involves extraction of color and flavor components from the grape skin. It is a delicacy around the world.*

In [None]:
# Random Color Generator (length) => (list of colors randomly generated)
def random_colors(length=1):
    """Return ``length`` random colours as "#rrggbb" hex strings.

    Parameters
    ----------
    length : int, default 1
        Number of colour codes to generate; 0 yields an empty list.

    Returns
    -------
    list of str
        Each entry is "#" followed by six lowercase hexadecimal digits.

    Notes
    -----
    Draws from NumPy's global random state, so call ``np.random.seed``
    beforehand if reproducible colours are needed.
    """
    hex_digits = "0123456789abcdef"
    codes = []
    for _ in range(length):
        # randint's upper bound is exclusive: 16 is required for index 15
        # ("f") to be reachable.
        picks = np.random.randint(0, 16, size=6)
        codes.append("#" + "".join(hex_digits[k] for k in picks))
    return codes

In [None]:
df = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv") # Load the Dataset
# Raw string for the pattern: "\s" inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
df.columns = [re.sub(r"\s+","_",x) for x in df.columns] # Replace whitespace runs in column names with "_"
features = list(df.columns)[:-1] # 'features' is every column except the last one (the target)
print("Total Number of NULL VALUES:",df.isnull().sum().sum()) # Check for Null Values
df.info() # General Info about the Dataset

*There are no null values in the dataset, all the features are of type 'float' while the target "quality" is of type 'int'*

* If Quality >= 7       **High Quality**
* If Quality in [5,6]   **Average Quality**
* If Quality <= 4       **Low Quality**

In [None]:
# Adding a 'labels' feature that denotes quality of wine
quality_to_label = {3:"low",4:"low",5:"avg",6:"avg",7:"high",8:"high"}
# Assign the mapped Series back to the frame. Calling replace(..., inplace=True)
# on df["labels"] mutates an intermediate object and is not guaranteed to
# update df under pandas copy-on-write.
df["labels"] = df["quality"].replace(quality_to_label)
# Boolean masks select the same rows as where(...).dropna(), but do not turn
# integer columns into floats along the way.
high = df[df["labels"] == "high"].reset_index(drop=True)
avg = df[df["labels"] == "avg"].reset_index(drop=True)
low = df[df["labels"] == "low"].reset_index(drop=True)
df.head()

# DATA

* **Fixed Acidity** : Measurement of the total concentration of titratable acids and free hydrogen ions present in your wine.

* **Volatile Acidity** : Volatile acidity could be an indicator of spoilage, or errors in the manufacturing processes — caused by things like damaged grapes, wine exposed to air, and so on.

* **Citric Acid** : Citric acid is generally found in very small quantities in wine grapes. It acts as a preservative and is added to wines to increase acidity, complement a specific flavor or prevent ferric hazes. Excess addition, however, can ruin the taste.

* **Residual Sugar** : Any natural grape sugars that are leftover after fermentation ceases (whether on purpose or not). 

* **Chlorides** : The amount of chlorides present in a wine is usually an indicator of its “saltiness.” Too much saltiness is considered undesirable. The right proportion can make the wine more savory.

* **Sulfur-Di-Oxide**: Sulfur dioxide exists in wine in free and bound forms, and the combinations are referred to as total SO2. It’s the most common preservative used, usually added by wine makers to protect the wine from negative effects of exposure to air and oxygen. It acts as a sanitizing agent — adding it usually kills unwanted bacteria or yeast that might enter the wine and spoil its taste and aroma.

* **Density** : Used to measure the alcohol concentration in wines. Sweeter wines generally have higher densities.

* **pH** : pH stands for power of hydrogen, which is a measurement of the hydrogen ion concentration in the solution. Sulfates are salts of sulfuric acid.

* **Sulphates** : They aren’t involved in wine production, but some beer makers use calcium sulfate — also known as brewers’ gypsum — to correct mineral deficiencies in water during the brewing process. It also adds a bit of a “sharp” taste.

* **Alcohol** : Drinking it in small amounts gives you warm fuzzy feelings inside, and makes you more sociable. Of course, higher doses can also make you pass out.

# ANALYSIS

## Data Analysis

In [None]:
# Short code for every column; used to build compact row labels in `desc`.
codes = {
    "fixed_acidity":"fa",
    "volatile_acidity": "va",
    "citric_acid":"ca",
    "residual_sugar":"rs",
    "chlorides":"chl",
    "free_sulfur_dioxide":"fsd",
    "total_sulfur_dioxide":"tsd",
    "density":"sg",
    "pH":"pH",
    "sulphates":"sul",
    "alcohol":"alch",
    "quality":"q"
}
# Reverse lookup: short code -> full column name (used for plot titles).
codes_inv = {short: full for full, short in codes.items()}

# One block of summary statistics per group, indexed "<code>_<group>"
# (e.g. "fa_high"), with the statistics (mean, std, min, ...) as columns.
summaries = []
for dat,name in zip([df,high,low,avg],["all","high","low","avg"]):
    tmp = dat.describe()
    tmp.columns = [codes[x] +"_" + name for x in tmp.columns]
    summaries.append(tmp.transpose().drop(["count"],axis=1).round(2))
# DataFrame.append was removed in pandas 2.0; concatenating once also avoids
# re-copying the growing frame on every iteration.
desc = pd.concat(summaries)

In [None]:
def plot_ct(i):
    """Plot mean, std, min and max of one feature for each quality group.

    Parameters
    ----------
    i : str
        Short feature code (a value of ``codes``, e.g. "fa" or "va"). Rows
        "<i>_all", "<i>_high", "<i>_low" and "<i>_avg" are read from the
        module-level ``desc`` table.

    Draws a 2x2 grid of bar charts (one per statistic) and shows the figure.
    """
    groups = [i + "_" + suffix for suffix in ("all", "high", "low", "avg")]
    subset = desc.loc[groups]
    plt.figure(figsize=(30,10))
    for n, j in enumerate(["mean","std","min","max"], start=1):
        plt.subplot(2,2,n)
        # Label the bars with the rows actually plotted; the labels used to be
        # hard-coded to the "fa_*" names for every feature.
        ax = sns.barplot(data=subset,x=groups,y=j)
        plt.title(codes_inv[i] +" "+ j,fontdict={'fontsize': 24},)
        ax.tick_params(labelsize=14)
    # Spacing is the same for every panel, so set it once.
    plt.subplots_adjust(hspace=0.4)
    plt.show()

In [None]:
# Mean/std/min/max of fixed acidity (code "fa") for the all/high/low/avg groups.
plot_ct("fa")

**Fixed acidity offers few decisive factors**
* It looks like Low Quality wines have slightly lower levels of fixed acidity.

In [None]:
# Mean/std/min/max of volatile acidity (code "va") for the all/high/low/avg groups.
plot_ct("va")

> It is clear here that Low Quality Wines have high levels of volatile acidity, which is only natural.

    Volatile acidity could be an indicator of spoilage, or errors in the manufacturing processes — caused by things like damaged grapes, wine exposed to air, and so on. 
    This causes acetic acid bacteria to enter and thrive, and give rise to unpleasant tastes and smells.

> High Quality Wines have low levels of Volatile Acidity.

> Volatile Acidity Levels are inversely proportional to the quality

In [None]:
# Mean/std/min/max of citric acid (code "ca") for the all/high/low/avg groups.
plot_ct("ca")

> Wine Makers can choose to either add or avoid Citric Acid, so minimum values are zero.

    Citric acid is generally found in very small quantities in wine grapes.
    It acts as a preservative and is added to wines to increase acidity, complement a specific flavor or prevent ferric hazes.
    It can be added to finished wines to increase acidity and give a “fresh” flavor. Excess addition, however, can ruin the taste.

> Explains why most Low Quality wines don't have added citric acid, and high quality wines have a lower maximum citric acid value compared to the others.
  This is because too much ruins the taste, High quality wines have the perfect amount of citric acid in them.

> Mathematically, note that the coefficient of variation (std/mean) will be higher for Low Quality Wines,
  indicating that their values vary widely, between 0 and 1.
  Being either too low or too high is bad for the wine.

> High Quality wines have a very low coefficient of variation, indicating higher consistency, with values of around 0.35 to 0.65

In [None]:
# Mean/std/min/max of residual sugar (code "rs") for the all/high/low/avg groups.
plot_ct("rs")

    Residual Sugar, or RS for short, refers to any natural grape sugars that are leftover after fermentation ceases (whether on purpose or not).
    The juice of wine grapes starts out intensely sweet, and fermentation uses up that sugar as the yeasts feast upon it.
    During winemaking, yeast typically converts all the sugar into alcohol making a dry wine. However, sometimes not all the sugar is fermented by the yeast,
    leaving some sweetness leftover.
    To make a wine that tastes good, the key is to have a perfect balance between the sweetness and the sourness in the drink.

> On close analysis, the data shows that the Low Quality Wines might be oversweet, due to excess Residual Sugars.
    All central tendencies are higher for Low Quality wines than for the others.

> A Residual Sugar quantity in the range of 1.2 to 9 will help attain the balance of sweetness and sourness in the wine, making it High Quality.

In [None]:
# Mean/std/min/max of chlorides (code "chl") for the all/high/low/avg groups.
plot_ct("chl")

> Chlorides appear to worsen the taste or add unpleasant elements to the wine.

> Low Quality Wines have higher levels of Chlorides.

    The amount of chlorides present in a wine is usually an indicator of its “saltiness.” This is usually influenced by the territory where the wine grapes grew, cultivation methods, and also the grape type. Too much saltiness is considered undesirable. The right proportion can make the wine more savory.

> High Quality Wines have chloride levels in the range of 0.01 to 0.35, unlike the low quality ones that have excessive amounts. 

In [None]:
# Mean/std/min/max of free sulfur dioxide (code "fsd") for the all/high/low/avg groups.
plot_ct("fsd")

> Average Wines have higher levels of Free Sulphur Dioxide, while Low Quality ones have lower levels. High Quality wines, however, have it in the right proportions, in the range of 3 to 55.

In [None]:
# Mean/std/min/max of sulphates (code "sul") for the all/high/low/avg groups.
plot_ct("sul")

    Sulfates are salts of sulfuric acid. They aren’t involved in wine production, but some beer makers use calcium sulfate — also known as brewers’ gypsum — to correct mineral deficiencies in water during the brewing process. It also adds a bit of a “sharp” taste.
 
> Higher levels of CaSO4, in the range of 0.4 to 1, give the wine a sharp taste, enhancing the feel.

In [None]:
# Mean/std/min/max of alcohol (code "alch") for the all/high/low/avg groups.
plot_ct("alch")

> High Quality wines seem to have higher levels of alcohol, though not extremely high.

In [None]:
# Checking Data Distribution
# Count the wines per quality score first. Passing values='quality' made the
# pie sum the scores within each class (count x score), which inflates the
# share of the higher-quality classes instead of showing how many wines each
# class contains.
quality_counts = (
    df["quality"]
    .value_counts()
    .sort_index()
    .rename_axis("quality")
    .reset_index(name="count")
)
fig = px.pie(quality_counts, values='count', names='quality')
fig.show()
print("\033[1m THE CLASSES [3,4,7,8] ARE UNDER-SAMPLED \033[0m \n")
plt.figure(figsize=(25,6))
sns.countplot(data=df,x="labels")
plt.show()

***The dataset contains a total of 1599 wine [feature,quality] pairs. Most of them are of average quality, some are of high quality, and very few are of low quality, so the dataset is imbalanced.***

In [None]:
# Grouped box plots: one trace per feature, boxes grouped by quality label.
fig = go.Figure()
# One colour per feature, instead of a hard-coded count that must be kept in
# sync with the number of feature columns.
colors = random_colors(len(features))
for i,color in zip(features,colors):
    fig.add_trace(go.Box(
        y = df[i],
        x = df["labels"],
        name=i,
        marker_color= color
    ))
fig.update_layout(
    xaxis=dict(title='Box-Plot', zeroline=False),
    boxmode='group'
)
fig.show()

In [None]:
# Pairwise feature relationships for the non-average wines only.
not_average = df["labels"] != "avg"
extremes = df[not_average].dropna().drop(columns=["quality"])

g = sns.pairplot(
    extremes,
    hue="labels",
    diag_kind="kde",
    palette='Dark2',
    markers=["o", "s"],
)
# Overlay KDE contour lines on the lower-triangle panels.
g.map_lower(sns.kdeplot, levels=2, color=".2")
plt.show()

In [None]:
plt.figure(figsize=(16,10))
# Correlate numeric columns only: the string "labels" column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = df.select_dtypes(include="number").corr()
# The np.bool alias was removed in NumPy 1.24; the builtin bool is the same dtype.
# The mask hides the upper triangle (including the diagonal) of the matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,cmap='BrBG',annot=True,vmin=-1, vmax=1,mask=mask)
plt.show()

In [None]:
# (x column, y column) pairs of features to inspect together; set1 holds the
# x columns and set2 the matching y columns, position by position.
feature_pairs = [
    ("citric_acid", "fixed_acidity"),
    ("density", "fixed_acidity"),
    ("pH", "fixed_acidity"),
    ("citric_acid", "volatile_acidity"),
    ("citric_acid", "pH"),
    ("free_sulfur_dioxide", "total_sulfur_dioxide"),
    ("density", "alcohol"),
]
set1 = [x_col for x_col, _ in feature_pairs]
set2 = [y_col for _, y_col in feature_pairs]

In [None]:
# Joint histogram of the first feature pair, coloured by quality label.
x_feature, y_feature = set1[0], set2[0]
sns.jointplot(data=df, x=x_feature, y=y_feature, hue="labels", kind="hist")
plt.show()

> If Both Citric Acid and Fixed Acidity levels are low, then the wine is rated low quality

> If Both of these are high, the quality is mostly either Average or High.

    High if these levels are neither too high nor too low

    In order to produce good quality wine, citric acid levels must be 0.1+

    In order to produce high quality wine, it must follow the above criterion and additionally have fixed acid levels in range of 5 to 11, 
    as long as the citric acid level is in range of 0.3 to 0.7

In [None]:
# Joint histogram of the second feature pair (density vs fixed acidity).
ax = sns.jointplot(data=df, x=set1[1], y=set2[1], kind="hist",hue="labels")
plt.show()
print("""
If the density of the wine is higher, the fixed acidity must be increased propotionally.
A plot of only high quality wines to find propotionality constant.
""")
lr = LinearRegression()
# Regress fixed acidity ON density so that the fitted slope and intercept match
# the equation printed below. The arguments used to be the other way round,
# which fitted density as a function of fixed acidity and printed those
# coefficients under the wrong labels.
lr.fit(high["density"].values.reshape(-1,1),high["fixed_acidity"].values.reshape(-1,1))
print(f"Fixed Acidity = {lr.coef_[0][0]:.6f}*Density + {lr.intercept_[0]:.3f} this equation is true for High Quality Wines")

# Model

In [None]:
# A Function to evaluate the results
def evaluate(model,X_test,y_test):
    """Print accuracy, a classification report and the one-vs-rest ROC AUC.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``
    X_test : features to score
    y_test : true targets for ``X_test``

    Returns nothing; all results are printed.
    """
    predicted_labels = model.predict(X_test)
    class_probabilities = model.predict_proba(X_test)

    accuracy = metrics.accuracy_score(y_test, predicted_labels)
    print(f"Accuracy Score: {accuracy}")

    # zero_division=1: classes with no predictions report 1 instead of warning.
    report = metrics.classification_report(y_test, predicted_labels, zero_division=1)
    print(report)

    roc_auc = metrics.roc_auc_score(y_test, class_probabilities, multi_class='ovr')
    print(f"ROC AUC SCORE: {roc_auc}")

In [None]:
# Train on the measured features only. Slicing df.iloc[:, :-1] also kept the
# "quality" column, from which the "labels" target is derived, so the
# classifier was being handed the answer (target leakage).
X = df[features]
y = df["labels"]
X_train,X_test,y_train,y_test = tts(X,y,test_size = 0.33,random_state = 0)

# Create Pipeline 
pipe = Pipeline([
        ('scale', StandardScaler()),
        ('reduce_dims',PCA()),
        ('model', knn())])

# n_neighbors must be at least 1; range(10) started at 0, and every fit with
# that candidate failed.
param_grid = dict(model__n_neighbors=range(1,10),reduce_dims__n_components=[2,4,5,6])

# Grid Search
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=3,scoring='accuracy')
grid.fit(X_train,y_train)
print(f"Best Score : {grid.best_score_}")
print(f"Best Parameters : {grid.best_params_}")

In [None]:
# Score the fitted grid search on the held-out split; prints accuracy,
# the classification report and the ROC AUC.
evaluate(grid,X_test,y_test) # This is pure test data, UNTOUCHED

In [None]:
# Per-fold and mean cross-validation scores for every parameter combination.
plt.figure(figsize=(25,10))
for i in ["split0_test_score","split1_test_score","split2_test_score"]:
    # Label each series (and let the colour cycle distinguish the folds) so
    # that the legend has entries to show.
    plt.plot(grid.cv_results_[i],"o",label=i)
plt.plot(grid.cv_results_["mean_test_score"],label="mean_test_score")
plt.xlabel("Parameter combination index")
plt.ylabel("Cross-validation score")
# A single legend call after all artists exist; calling it inside the loop
# with unlabelled artists only produced warnings.
plt.legend()
plt.show()

In [None]:
# Plotly draws its own figure; the earlier plt.figure(...) call only created an
# empty, unused Matplotlib canvas.
fig = px.scatter(df,y="alcohol",x="volatile_acidity",color="quality")
fig.show()

# Linear Regression

In [None]:
# Regression setup: drop the string label, predict the last remaining column
# from all the columns before it.
regression_df = df.drop(columns=["labels"])
X = regression_df.iloc[:, :-1]
y = regression_df.iloc[:, -1]
split = tts(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split

# Linear Regression
clf = LinearRegression()
clf.fit(X_train, y_train)
test_predictions = clf.predict(X_test)
mse = metrics.mean_squared_error(test_predictions, y_test)
print(f"Mean Squared Error of Linear Regression model = {mse}")

Linear Regression will give us better results, when compared to classification using KNN

# [Streamlit App](https://github.com/saai-sudarsanan-d/Red-Wine-Quality-App)