In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# Directories that contain no files contribute nothing to the output.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Mapping the "quality" column from 3–8 to 0–5

In [None]:
import pandas as pd

# Location of the red-wine quality CSV, relative to the notebook's working directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(WINE_CSV_PATH)
df.head()

In [None]:
# Summary statistics of the target column, used to see which scores occur.
df.quality.describe()

In [None]:
# Re-index the raw quality scores 3 through 8 as consecutive labels starting at 0
# (3 -> 0, 4 -> 1, ..., 8 -> 5).
quality_mapping = {score: label for label, score in enumerate(range(3, 9))}

In [None]:
# Translate the raw quality scores into the labels defined by `quality_mapping`.
#
# Series.map turns every value that is not a key of the mapping into NaN. That
# is what happens when this cell is executed a second time on an already-mapped
# column (labels 0, 1 and 2 are not keys), so validate the result and fail
# loudly instead of silently overwriting the target with NaN.
mapped_quality = df["quality"].map(quality_mapping)
if mapped_quality.isna().any():
    raise ValueError(
        'Found "quality" values that are not keys of quality_mapping; '
        "reload the data before re-running this cell."
    )
df["quality"] = mapped_quality

# 2. Splitting the dataset into a "train" and a "test" set

In [None]:
# Shuffle the rows. The fixed seed makes the shuffle, and therefore the split
# below, identical on every "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 1,000 shuffled rows form the "train" set.
df_train = df.iloc[:1000]

# Every remaining row forms the "test" set. Slicing from the same boundary
# keeps the two sets disjoint and exhaustive whatever the length of the frame,
# instead of relying on a hard-coded remainder of 599 rows.
df_test = df.iloc[1000:]

# 3. Training a Decision Tree model

In [None]:
from sklearn import tree
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Feature columns passed to the model; "quality" is the target.
cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']

# Decision tree limited to a depth of 7. random_state is fixed so that
# repeated fits on the same data produce the same tree.
clf = tree.DecisionTreeClassifier(max_depth=7, random_state=42)

# Fit on the training rows only.
clf.fit(df_train[cols], df_train.quality)

# 4. Testing the accuracy of the model

In [None]:
# Predict labels for the rows the model was fitted on and for the held-out rows.
train_predictions = clf.predict(df_train[cols])
test_predictions = clf.predict(df_test[cols])

# Fraction of correctly predicted labels on each set.
train_accuracy = metrics.accuracy_score(df_train["quality"], train_predictions)
test_accuracy = metrics.accuracy_score(df_test["quality"], test_predictions)

# Show the results; without this the cell computes both scores but displays nothing.
print(f"train accuracy: {train_accuracy:.3f}")
print(f"test accuracy:  {test_accuracy:.3f}")

### 4.1 Creating a plot with different values for "max_depth"

In [None]:
# Use larger tick labels for the figures drawn from here on.
matplotlib.rc("xtick", labelsize=20)
matplotlib.rc("ytick", labelsize=20)

# Feature columns; they do not change between iterations, so define them once.
cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']

# Depths to evaluate. The accuracy lists hold one measured value per entry of
# `depths`; no placeholder value is inserted for a non-existent depth of 0.
depths = list(range(1, 25))
train_accuracies = []
test_accuracies = []

for depth in depths:
    # Fixed random_state so the curve is the same on every run.
    clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=42)

    # Train model
    clf.fit(df_train[cols], df_train.quality)
    # Create train and test predictions
    train_predictions = clf.predict(df_train[cols])
    test_predictions = clf.predict(df_test[cols])
    # Calculate training and test accuracies
    train_accuracy = metrics.accuracy_score(df_train["quality"], train_predictions)
    test_accuracy = metrics.accuracy_score(df_test["quality"], test_predictions)
    # Append accuracies
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Plot both curves against the actual depth values so the x axis reads as
# max_depth directly rather than as a list index.
plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
plt.plot(depths, train_accuracies, label="train accuracy")
plt.plot(depths, test_accuracies, label="test accuracy")
plt.legend(loc="upper left", prop={"size" : 15})
plt.xticks(range(0, 26, 5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()

### => Use a "max depth" of 11 for the DecisionTreeClassifier to achieve highest accuracy

In [None]:
# Bar chart of how many rows carry each quality label.
b = sns.countplot(x="quality", data=df)
b.set_xlabel("quality", fontsize=20)
b.set_ylabel("count", fontsize=20)
# Render the figure explicitly so the cell does not also echo the Text object
# returned by the last set_ylabel call; matches the previous plotting cell.
plt.show()

### => Use stratified k-fold cross-validation because the quality classes are unevenly distributed!