In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" theme for every figure drawn below.
sns.set_style('darkgrid')

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the columns, their dtypes and their non-null counts.
df.info()

In [None]:
# Count missing values per column and keep only the columns that have any;
# the displayed length is the number of columns containing nulls.
null_counts = df.isnull().sum()
null_values = null_counts[null_counts > 0]
len(null_values)

In [None]:
# Descriptive statistics for every numeric column.
df.describe()

In [None]:
# Average value of every feature within each quality level.
df.groupby('quality').mean()

In [None]:
# Number of wines per quality score.
# unique() returns the scores in order of first appearance whereas
# value_counts() returns them ordered by frequency, so using one for the
# labels and the other for the heights can pair a bar with the wrong score.
# Take both from a single Series sorted by score instead.
quality_counts = df['quality'].value_counts().sort_index()
sns.barplot(x=quality_counts.index, y=quality_counts.values)
plt.ylabel('Count')
plt.xlabel('Wine Quality')
plt.title('Number of wines for each quality category');

In [None]:
# Box plot of the quality scores with the 0.75 quantile drawn as a red line.
quality_scores = df['quality'].values
upper_quartile = np.quantile(quality_scores, 0.75)

ax = sns.boxplot(y=quality_scores)
ax.set_title('Distribution of Wine Quality')
ax.set_ylabel('Quality Category')
# xmin/xmax are axes fractions; 1 -> 0 still spans the full width.
ax.axhline(upper_quartile, xmin=1, xmax=0, color='red', linewidth=5, label='0.75 quantile')
ax.legend();

In [None]:
# Turn the target into a binary label: scores strictly above the 0.75 quantile
# become 1 (good quality), all others become 0 (rather bad quality).
# Note that this overwrites the original 'quality' column in place.
border = np.quantile(df['quality'].values, 0.75)
df['quality'] = (df['quality'] > border).astype('int64')
df[['quality']].head()

# Train & Test Data

In [None]:
# Separate the target vector from the feature matrix (both as NumPy arrays).
y_data = df['quality'].to_numpy()
X_data = df.drop(columns=['quality']).to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_data)
X_data = scaler.transform(X_data.astype(float))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

# K-Nearest-Neighbor

In [None]:
# Create K-Nearest Neighbour classifiers and pick the best 'K'.
# K is a hyperparameter, so it is selected with 5-fold cross-validation on the
# training set only. Choosing it by test-set accuracy would tune the model to
# the test data and overstate the reported accuracy.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

Ks = 50  # candidate K values are 1 .. Ks-1
mean_acc = np.zeros(Ks - 1)

for n in range(1, Ks):
    # Mean cross-validated accuracy for this K (index n-1 because K starts at 1)
    knn = KNeighborsClassifier(n_neighbors=n)
    mean_acc[n-1] = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy').mean()

best_K = int(mean_acc.argmax()) + 1
print('Maximum cross-validated accuracy achieved:', mean_acc.max(), 'at', best_K)

# Evaluate the selected model once on the held-out test set
clm = KNeighborsClassifier(n_neighbors=best_K).fit(X_train, y_train)
yhat = clm.predict(X_test)
print('Test accuracy at best K:', metrics.accuracy_score(y_test, yhat))

In [None]:
# Accuracy as a function of the number of neighbours K.
ax = plt.gca()
ax.plot(range(1, Ks), mean_acc)
ax.set_title('Accuracy for each K-Nearest-Neighbour')
ax.set_xlabel('No. of nearest Neighbours')
ax.set_ylabel('Accuracy');

In [None]:
# Compare predicted and actual targets visually with density curves
def evaluate_distplot(y_test, yhat):
    """
    Overlay kernel-density curves of the actual and the predicted targets.

    Vertical guide lines mark the mean, the median and one standard
    deviation on either side of the mean of the actual targets. The figure
    is shown immediately; nothing is returned.

    y_test: Test-Data (Target)
    yhat: Predicted Data (Target)
    """
    centre = y_test.mean()
    spread = y_test.std()
    middle = np.median(y_test)

    sns.set_style("darkgrid")
    sns.kdeplot(y_test, label='Actual Value')
    sns.kdeplot(yhat, label='Predicted Value')

    # Draw the guides on the axes the density curves were just plotted on.
    # ymin/ymax are axes fractions; 1 -> 0 still spans the full height.
    ax = plt.gca()
    ax.axvline(centre, ymin=1, ymax=0, color='black', label='Mean of Test-Data')
    ax.axvline(centre + spread, ymin=1, ymax=0, color='grey')
    ax.axvline(centre - spread, ymin=1, ymax=0, color='grey', label='+/- 1 standard deviation')
    ax.axvline(middle, ymin=1, ymax=0, color='darkblue', label='Median of Test-Data')

    ax.legend()
    ax.set_title('Test Data Prediction vs. Actual Value')
    plt.show()

In [None]:
# Refit KNN with the selected K and compare its predictions with the test labels.
clm = KNeighborsClassifier(n_neighbors=best_K)
clm.fit(X_train, y_train)
yhat = clm.predict(X_test)
evaluate_distplot(y_test, yhat);

# Decision Tree Classification

In [None]:
from sklearn import tree

In [None]:
# Entropy-based decision tree. The seed is fixed because scikit-learn permutes
# the features at every split, so an unseeded tree can change between runs.
clm_tree = tree.DecisionTreeClassifier(criterion="entropy", random_state=4)
clm_tree.fit(X_train, y_train)
pred_tree = clm_tree.predict(X_test)
print("Decision Trees's Accuracy:", metrics.accuracy_score(y_test, pred_tree))

In [None]:
# Extract feature and class names for the tree plot.
# Every column except the target is a feature.
feature_names = df.drop(columns=['quality']).columns.tolist()

# Sort the class labels numerically *before* converting them to strings;
# sorting the strings would order them lexicographically (e.g. '10' < '2').
class_names = [str(label) for label in sorted(df['quality'].unique())]

In [None]:
# Draw the fitted tree on a small, high-resolution canvas and save it as PNG.
fig, axes = plt.subplots(1, 1, figsize=(4, 4), dpi=600)
a = tree.plot_tree(
    clm_tree,
    ax=axes,
    filled=True,
    class_names=class_names,
    feature_names=feature_names,
);
fig.savefig('decision_tree.png');
print('Export Done')

In [None]:
# Compare the decision tree's predictions with the test labels.
evaluate_distplot(y_test=y_test, yhat=pred_tree)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train several forests and average their test accuracy.
# Each run gets its own fixed seed: the runs still differ from one another,
# but the whole cell gives the same numbers on every execution.
# Dedicated names (n_runs, rf_acc) keep this cell from overwriting the
# `mean_acc` array that the KNN accuracy plot reads.
n_runs = 10
rf_acc = np.zeros(n_runs)

for run in range(n_runs):
    clm_rf = RandomForestClassifier(random_state=run)
    clm_rf.fit(X_train, y_train)
    yhat = clm_rf.predict(X_test)
    acc = metrics.accuracy_score(y_test, yhat)
    rf_acc[run] = acc
    print(f'{run+1}/{n_runs} done: {round(acc, 4)}')

print(f"Mean Accuracy after {n_runs} runs: {rf_acc.mean()}")

In [None]:
# yhat holds the predictions of the last forest trained in the loop above.
evaluate_distplot(y_test=y_test, yhat=yhat)

<h1>SUMMARY</h1>
<p>Different classification methodologies had different accuracies:</p>
<ul>
<li>Nearest Neighbor: ~0.89 with K = 2</li>
<li>Decision Tree: ~0.88</li>
<li>Random Forest: ~0.9 (average after 10 runs)</li>
</ul>
<br>
<p>Based on these results, we would choose the Random Forest classifier.</p>