In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier # to build a classification tree 
from sklearn.tree import plot_tree #to draw a classification tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score #for cross validation
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import ConfusionMatrixDisplay # to draw a confusion matrix
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:  # plot_confusion_matrix was removed in scikit-learn 1.2
    pass

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory.
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first rows to confirm the CSV was parsed into the expected columns.
df.head()

In [None]:
# Show the dtype of every column.
df.dtypes

All columns have a float or integer dtype, so no missing values are hidden as text placeholders (those would have produced an object dtype). Next, we check for explicit missing values.

In [None]:
# Count missing (NaN) entries per column.
df.isna().sum()

All the columns are filled so we can confirm that our dataset does not have missing values.

In [None]:
# Pairwise correlation matrix of all columns, rendered as a colour-graded table
# using the 'coolwarm' colormap.
corr=df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Distribution of free sulfur dioxide for each quality score.
sns.boxplot(data=df, x='quality', y='free sulfur dioxide')

In [None]:
# Distribution of total sulfur dioxide for each quality score.
sns.boxplot(data=df, x='quality', y='total sulfur dioxide')

In [None]:
# dropping fixed acidity, free sulfur dioxide
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['fixed acidity', 'free sulfur dioxide'], errors='ignore')

In [None]:
# Volatile acidity plotted against the quality score.
sns.lineplot(data=df, x='quality', y='volatile acidity')

We can see a trend: wine quality tends to decrease as volatile acidity increases.

In [None]:
# Select predictors and target by column name instead of by position.
# The earlier positional slice `iloc[:, 1:9]` started at column 1 and therefore
# silently left out the first remaining column (volatile acidity, assuming the
# standard column order of this dataset -- TODO confirm), even though the plot
# above shows it is related to quality.
X = df.drop(columns='quality')
# Take a copy so that later relabelling of Y cannot alter `df`.
Y = df['quality'].copy()

In [None]:
# List the distinct values present in the target.
Y.unique()

To get a binary target, we label wines with a quality score above 6.5 as good (1) and all others as bad (0).

In [None]:
# Label every wine scoring below the 6.5 cut-off as 0 ("bad").
# Series.mask returns a new Series; assigning through `Y.values` would write into
# the DataFrame that Y was sliced from and fails when pandas copy-on-write makes
# `.values` read-only.
Y = Y.mask(Y < 6.5, 0)

In [None]:
# Label every wine scoring above the 6.5 cut-off as 1 ("good").
# Series.mask returns a new Series; assigning through `Y.values` would write into
# the DataFrame that Y was sliced from and fails when pandas copy-on-write makes
# `.values` read-only.
Y = Y.mask(Y > 6.5, 1)

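Alternatively, the same labels can be produced from the raw quality scores in a single step with `apply` (each call receives one score, so no inner loop is needed):
Y = Y.apply(lambda q: 0 if q <= 6.5 else 1)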

In [None]:
# Check which distinct labels the target holds after relabelling.
Y.unique()

In [None]:
# Number of rows per label, to see how balanced the classes are.
Y.value_counts()

Splitting the data into training and testing sets

In [None]:
# Histogram of the target labels.
sns.histplot(data=Y)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Creating a decision tree and fitting it into a training set

In [None]:
# Baseline: a decision tree with no pruning parameter set, fitted on the training split.
# `model` is what the later plotting and pruning-path cells use.
DC=DecisionTreeClassifier(random_state=42)
model=DC.fit(X_train,y_train)

In [None]:
# Draw the full (unpruned) tree; the large canvas keeps the many nodes readable.
plt.figure(figsize=(45, 45))
plot_tree(
    model,
    filled=True,
    rounded=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index.
    feature_names=list(X_train.columns),
    class_names=["Bad", "Good"],
)
plt.show()

In [None]:
# Confusion matrix of the unpruned tree on the held-out test split.
# Built from `confusion_matrix` + `ConfusionMatrixDisplay` because the
# `plot_confusion_matrix` helper was removed in scikit-learn 1.2.
ConfusionMatrixDisplay(
    confusion_matrix(y_test, model.predict(X_test)),
    display_labels=["Bad", "Good"],
).plot()

In the confusion matrix, we can see that 251 + 22 = 273 wines are actually bad, of which 251 are correctly classified, and 22 + 25 = 47 wines are actually good, of which only 25 are correctly classified.
It seems that the classification tree is overfit and requires pruning.

Cost complexity pruning selectively removes parts of a tree so that it generalizes better. Reducing the number of leaf nodes may slightly lower training accuracy, but it can greatly improve testing performance. Alpha is the pruning parameter: the higher the alpha, the more nodes are pruned.

In [None]:
# Compute the cost-complexity pruning path of the fitted tree on the training data,
# then keep every candidate alpha except the last (maximum) one.
path = model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]

In [None]:
# Start with an empty list; the next cell stores one fitted tree per alpha in it.
clf_dts = []   ## Create an array to put decision trees in

In [None]:
## Create one decision tree per alpha and store in array
# Reset the list here so the cell is idempotent: appending to a list created in
# another cell would duplicate the trees on every re-run and leave the scores
# out of step with `ccp_alphas`.
clf_dts = []
for ccp_alpha in ccp_alphas:
    # Same seed as the tree that produced the pruning path and as the final model.
    clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)

In [None]:
# Accuracy of every pruned tree on the training split and on the test split,
# in the same order as `clf_dts`.
train_scores, test_scores = [], []
for fitted_tree in clf_dts:
    train_scores.append(fitted_tree.score(X_train, y_train))
    test_scores.append(fitted_tree.score(X_test, y_test))

In [None]:

# Step plot of accuracy against alpha, one line per data split.
fig, ax = plt.subplots()
for split_scores, split_name in ((train_scores, 'train'), (test_scores, 'test')):
    ax.plot(ccp_alphas, split_scores, marker='o', label=split_name, drawstyle="steps-post")
ax.set_xlabel('alpha')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy vs alphas for training and testing sets')
ax.legend()
plt.show()

From the graph, a good value for alpha might be 0.01, as the testing accuracy is highest at this value.

In [None]:
# Mean 5-fold cross-validated accuracy on the training split for a tree pruned with alpha = 0.01.
DT = DecisionTreeClassifier(random_state=42, ccp_alpha=0.01)
fold_accuracies = cross_val_score(DT, X_train, y_train, scoring='accuracy', cv=5)
A = fold_accuracies.mean()
A

Finding best alpha

In [None]:
# For every candidate alpha, run 5-fold cross-validation on the training split
# and record the mean and standard deviation of the fold accuracies.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    # Use the same seed as the final pruned model so the alpha chosen here is
    # evaluated on the same kind of tree that is fitted afterwards.
    clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, scoring='accuracy', cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                            columns=['alpha', 'mean_accuracy', 'std'])
# Mean accuracy per alpha, with the fold-to-fold standard deviation as error bars.
alpha_results.plot(x='alpha', y='mean_accuracy', yerr='std', marker='o', linestyle='--')

In [None]:
# Alpha with the highest mean cross-validated accuracy.
# `idxmax` returns an index *label*, so look the row up with label-based `.loc`
# (positional `.iloc` only matched because the frame has a default 0..n-1 index).
best_alpha = alpha_results.loc[alpha_results['mean_accuracy'].idxmax(), 'alpha']

In [None]:
# Display the selected alpha.
best_alpha

Evaluating Final Model

In [None]:
# Final model: a tree pruned with the selected alpha, fitted on the training split.
clf_dt_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
clf_dt_pruned.fit(X_train, y_train)

In [None]:
# Confusion matrix of the pruned tree on the held-out test split.
# Built from `confusion_matrix` + `ConfusionMatrixDisplay` because the
# `plot_confusion_matrix` helper was removed in scikit-learn 1.2.
ConfusionMatrixDisplay(
    confusion_matrix(y_test, clf_dt_pruned.predict(X_test)),
    display_labels=["Bad", "Good"],
).plot()

In [None]:
# Draw the pruned tree.
plt.figure(figsize=(15, 7.5))
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["Bad", "Good"],
          # Pass a plain list: some scikit-learn releases validate `feature_names`
          # strictly and reject a pandas Index.
          feature_names=list(X_train.columns));

This is the final tree

In [None]:
# y_pred=model.predict(X_test)

In [None]:
# importance=DC.feature_importances_
# plt.bar([x for x in range(len(importance))], importance)