In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Let's start by importing our data as well as the necessary tools

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df.head()

### Let's take a look at the data a bit and try to understand more about it

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
# Check for missing values
df.isna().sum()

### Exploratory Data Analysis

Let's take a closer look at our data in order to better understand its features and how they relate to each other and the wine quality

In [None]:
# Quality appears to increase with an increase in fixed acidity
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'fixed acidity', data = df)

In [None]:
# Quality appears to decrease with a decrease in volatile acidity
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'volatile acidity', data = df)

In [None]:
# Quality appears to increase with an increase in citric acid
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'citric acid', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'residual sugar', data = df)

In [None]:
# Quality appears to decrease with a decrease in chlorides
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'chlorides', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'free sulfur dioxide', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'total sulfur dioxide', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'density', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'pH', data = df)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'sulphates', data = df)

In [None]:
# Quality appears to increase with an increase in alcohol
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'alcohol', data = df)

### Correlation Matrix

Let's take another look at how each individual feature is related to the quality of the wine.
A correlation matrix holds the pairwise correlation between columns: each value lies between -1 and 1, where the sign gives the direction of the relationship and the magnitude gives its strength.

In [None]:
# plt.subplots returns (figure, axes), in that order
fig, ax = plt.subplots(figsize = (10, 10))
# Draw the pairwise column correlations on the axes created above;
# the trailing semicolon keeps the Axes repr out of the cell output
sns.heatmap(df.corr(), cmap="YlGnBu", annot = True, ax = ax);

As can be seen from the above heatmap, certain features like alcohol, sulphates, citric acid, and fixed acidity are closely correlated with quality.
We shall look closely at those specific features using scatter plots.

### Scatter plot
In order to compare these features, we should first group the 'quality' column into categories, as this leads to a clearer visualization ;)

In [None]:
## Let's take a closer look at the 'quality' feature
df.quality.unique()

In [None]:
 df.quality.value_counts()

  As can be seen from `df.quality.unique()`, which returns an array of the unique values in the quality column, we have 6 unique values: [5, 6, 7, 4, 8, 3].
  In order to better visualize and understand our data, it is necessary to have a benchmark for what makes a `good` and a `bad` wine.
  We can do this by using `pd.cut()`, which allows us to bin values into discrete intervals.
  For more information, visit the documentation : https://pandas.pydata.org/docs/reference/api/pandas.cut.html

In [None]:
# Work on a copy so that the original `df` is left untouched
df_copy = df.copy()

# Bin the numeric quality score into two labelled categories:
#   (2, 6.5] -> 'bad'    (6.5, 8] -> 'good'
quality_labels = pd.cut(df_copy['quality'], bins = [2,6.5,8], labels = ['bad','good'])
df_copy['quality'] = quality_labels

In [None]:
df_copy.head()

In [None]:
# Scatter plot: alcohol against volatile acidity,
# with bad wines drawn in red and good wines drawn in blue

fig, ax  = plt.subplots(figsize = (10, 10))
for label, colour in (("bad", "red"), ("good", "blue")):
    # Rows of df_copy that carry this quality label
    subset = df_copy[df_copy.quality == label]
    ax.scatter(subset["alcohol"], subset["volatile acidity"], c = colour)
ax.set_xlabel("alcohol")
ax.set_ylabel("volatile acidity");

Certain values fall outside their expected region.
These are known as outliers: values that differ markedly from the rest of the data, usually because they lie outside the range in which most values fall.
For more information : https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

In [None]:
# Let's compare values of citric acid and sulphates and their values when the wine is good and bad
# (bad wines are drawn in red, good wines in blue)

fig, ax  = plt.subplots(figsize = (10, 10))
ax.scatter(df_copy[df_copy.quality == "bad"]["citric acid"], df_copy[df_copy.quality == "bad"]["sulphates"], c = "red")
ax.scatter(df_copy[df_copy.quality == "good"]["citric acid"], df_copy[df_copy.quality == "good"]["sulphates"], c = "blue")
# The axis labels must name the columns that are actually plotted
plt.xlabel("citric acid")
plt.ylabel("sulphates");

### Outliers

In [None]:
# Per-column first and third quartiles; their difference is the
# interquartile range (IQR)
Q1, Q3 = (df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(IQR)

In [None]:
# A value counts as an outlier when it lies more than 1.5 * IQR
# below Q1 or above Q3 of its column
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
has_outlier = ((df < lower_fence) | (df > upper_fence)).any(axis=1)

# Keep only the rows in which no column holds an outlier.
# Note that this rebinds `df`.
df = df[~has_outlier]
df.shape

Now that the outliers are gone, we can start working on our model.
For performance reasons I will use `df_copy` to create our model, despite the fact that it still contains outliers (it was copied from `df` before they were removed).
You may choose to do otherwise.

### Model

In [None]:
# Separate the target column (y) from the independent variables (X)

y = df_copy['quality']
X = df_copy.drop(columns = 'quality')

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=19)

# Report the shape of each of the four pieces
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + " shape: ", split.shape)



Let's try out three models first then pick the one with the highest accuracy

In [None]:
# Candidate classifiers, keyed by their class name and built with default settings
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (KNeighborsClassifier, RandomForestClassifier, SVC)
}


def model_eval(models,X_train,X_test,y_train,y_test):
    """
    Train every given model on the training split and score it on the test split.
    models : dictionary mapping a name to a model object with fit() and score()
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels

    Returns a dictionary mapping each model name to the value returned by
    that model's score(X_test, y_test). The models are fitted in place.
    """
    # Seed NumPy's global random generator so repeated runs give the same results
    np.random.seed(19)

    def _fit_then_score(estimator):
        # Fit on the training split, then evaluate on the held-out split
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

In [None]:
model_eval(
    models = models,
    X_train = X_train,
    X_test = X_test,
    y_train = y_train,
    y_test = y_test
)

Let's take a closer look at the Random Forest Classifier and how well it performs with our data.