In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

First, we import the libraries we need and load the data into a DataFrame called `wine`.

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
wine = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows (the default for head()).
wine.head(n=5)

To make the target easier to interpret, we bin the quality score into a binary label.

Bin1: (0,6] = Low Quality Wine
Bin2: (6,10] = High Quality Wine

We do this with pandas.cut

In [None]:
# Bin edges and the label given to each interval. pd.cut uses right-closed
# intervals by default, so these edges give (0, 6] -> 0 and (6, 10] -> 1.
bins = [0, 6, 10]
labels = [0, 1]

# Overwrite the original score with its binned label (a categorical column).
# NOTE: this cell is not safe to re-run on its own. Binning the 0/1 labels a
# second time maps 1 -> 0 and turns 0 into NaN (0 lies outside (0, 6]), so
# reload the CSV before executing it again.
binned_quality = pd.cut(x=wine['quality'], bins=bins, labels=labels)
wine['quality'] = binned_quality

wine

With the target binned, we can now move on to exploratory data analysis (EDA).

<h1> Exploratory Data Analysis </h1>


1. What are our initial thoughts of the data?

In [None]:
# Preview the first rows of `wine`; `quality` now holds the binned 0/1 label.
wine.head()

Observations:

<ul><li>Density, pH and sulphates do not seem to carry much information at first glance, because their values lie very close to each other.</li></ul>

To check this, we call the `.describe()` method.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of `wine`.
wine.describe()

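Looking at the density column, we see that the standard deviation of its values is close to 0. Because the column barely varies, we can remove it and the result won't change much. (Caveat: a small standard deviation partly reflects the column's units, so low spread alone does not prove that a feature is uninformative.)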

In [None]:
# Drop the `density` column. Rebinding `wine` (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed column.
wine = wine.drop(columns='density', errors='ignore')

To analyze the rest of the data further, we first standardize the features so that they share a common scale, and then use box-and-whisker plots to see how high- and low-quality wines differ.

In [None]:
from sklearn import preprocessing

# Every column except the target is standardized.
to_scale = wine.columns.drop('quality')

# NOTE: preprocessing.scale is applied to the whole table here, i.e. before the
# train/test split performed later in the notebook.
scaled_values = preprocessing.scale(wine[to_scale])
wine[to_scale] = scaled_values

wine

In [None]:
# Box plots of each standardized feature, split by the binary quality label.
# A single loop replaces nine copy-pasted stanzas, and the y-axis label is taken
# from the column name itself so it cannot drift from the data being plotted
# (the hand-typed labels previously read 'total sulfer dioxide' and
# 'free sulpher dioxide').
# NOTE(review): pH is not plotted, matching the original cell - confirm whether
# it was left out on purpose.
boxplot_features = [
    'alcohol',
    'sulphates',
    'total sulfur dioxide',
    'free sulfur dioxide',
    'chlorides',
    'citric acid',
    'volatile acidity',
    'fixed acidity',
    'residual sugar',
]

for feature in boxplot_features:
    sns.boxplot(x=wine['quality'], y=wine[feature])

    plt.xlabel('quality')
    plt.ylabel(feature)

    plt.show()

From these box-and-whisker plots, we can see interesting patterns that can help with classification.

1. Wine quality differs noticeably with alcohol content: the higher the alcohol content, the higher the quality of the wine.
2. Wine quality differs noticeably with citric acid content: the higher the citric acid content, the higher the quality of the wine.
3. The rest of the features don't appear to affect wine quality much: their distributions for the two classes overlap heavily, so a clear separation is hard to see.

These observations make sense. Wines with a stronger punch tend to be liked more, and wines that are sweeter and fruitier tend to be liked more as well.

Following this, we keep only the alcohol and citric acid features to predict the quality of the wine.

In [None]:
# Keep the two chosen predictors plus the target, in this column order.
selected_columns = ['alcohol', 'citric acid', 'quality']
wine = wine.loc[:, selected_columns]

wine

We can now save this to a CSV file so that a model can use it later.

In [None]:
# index=False: the index is only the row number from loading the CSV, and
# writing it would add an extra unnamed column that comes back as 'Unnamed: 0'
# when the file is read again.
wine.to_csv('red_wine_processed.csv', index=False)

<h1>Prediction with KNeighborsClassifier</h1>

Import the needed modules

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Do the training and prediction

In [None]:
# Features are every remaining column except the target; the target is the
# binned 0/1 quality label.
X = wine.drop('quality', axis=1)
y = wine['quality']

# Hold out 10% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Define the model.
# NOTE(review): with an even number of neighbours (2) the two votes can tie;
# consider an odd k, or choosing k by cross-validation.
model = KNeighborsClassifier(n_neighbors=2)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Evaluate
score = accuracy_score(y_test, y_pred)

print("Accuracy Score: ", score)

# Accuracy needs a reference point: a model that always predicts the most
# common class in the test set would reach this accuracy without learning
# anything.
majority_baseline = y_test.value_counts(normalize=True).max()

print("Majority-class baseline accuracy: ", majority_baseline)

# Confusion table (rows = actual label, columns = predicted label) to show how
# the errors are distributed between the two classes.
actual = pd.Series(np.asarray(y_test), index=y_test.index, name='actual')
predicted = pd.Series(y_pred, index=y_test.index, name='predicted')

pd.crosstab(actual, predicted)

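From this, we see that our KNN classifier reaches roughly 90% accuracy on the held-out test split. Keep in mind that if high-quality wines are only a small minority of the dataset, a model that always predicts "low quality" would also score highly, so this number should be compared against the majority-class baseline before drawing conclusions.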