In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [2]:
data = pd.read_csv("winequality-red.csv")
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
data.isna().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [4]:
data["quality"].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [5]:
quality_ids = {5:0,6:1,7:2,4:3,8:4,3:5}

In [6]:
data['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [7]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
x = data.drop(columns = 'quality')
y = data['quality']

In [10]:
x_train,x_test,y_train,y_test = train_test_split(data[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']],data['quality'],test_size=0.10, random_state = 0, shuffle = False)

In [11]:
x_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1434,10.2,0.540,0.37,15.4,0.214,55.0,95.0,1.00369,3.18,0.77,9.0
1435,10.2,0.540,0.37,15.4,0.214,55.0,95.0,1.00369,3.18,0.77,9.0
1436,10.0,0.380,0.38,1.6,0.169,27.0,90.0,0.99914,3.15,0.65,8.5
1437,6.8,0.915,0.29,4.8,0.070,15.0,39.0,0.99577,3.53,0.54,11.1


In [12]:
x_test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1439,7.3,0.670,0.02,2.20,0.072,31.0,92.0,0.99566,3.32,0.68,11.066667
1440,7.2,0.370,0.32,2.00,0.062,15.0,28.0,0.99470,3.23,0.73,11.300000
1441,7.4,0.785,0.19,5.20,0.094,19.0,98.0,0.99713,3.16,0.52,9.566667
1442,6.9,0.630,0.02,1.90,0.078,18.0,30.0,0.99712,3.40,0.75,9.800000
1443,6.9,0.580,0.20,1.75,0.058,8.0,22.0,0.99322,3.38,0.49,11.700000
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.500000
1595,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,0.76,11.200000
1596,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.000000
1597,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.200000


In [13]:
y_train

0       5
1       5
2       5
3       6
4       5
       ..
1434    6
1435    6
1436    5
1437    5
1438    5
Name: quality, Length: 1439, dtype: int64

In [14]:
y_test

1439    6
1440    7
1441    6
1442    5
1443    5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 160, dtype: int64

In [15]:
clf = DecisionTreeClassifier(random_state=0)

In [16]:
clf.fit(x_train,y_train)

DecisionTreeClassifier(random_state=0)

In [17]:
y_hat=clf.predict(x_test)
y_hat

array([5, 6, 5, 6, 6, 5, 5, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 4, 5, 6, 7, 7,
       5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 6, 6, 7, 6, 7, 4, 7, 7, 7, 5,
       6, 4, 6, 5, 6, 7, 6, 7, 7, 6, 5, 6, 5, 5, 6, 4, 6, 5, 5, 5, 6, 5,
       6, 6, 5, 6, 7, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 4, 6, 6, 6, 5, 5,
       6, 5, 6, 6, 5, 5, 5, 6, 5, 6, 5, 6, 5, 7, 6, 6, 5, 7, 6, 6, 6, 7,
       6, 6, 6, 7, 6, 4, 5, 6, 4, 5, 5, 5, 5, 6, 6, 6, 5, 5, 6, 5, 5, 6,
       7, 5, 6, 5, 6, 7, 6, 6, 7, 6, 7, 6, 5, 7, 7, 6, 7, 7, 5, 6, 6, 6,
       5, 6, 6, 6, 6, 6], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
accuracy_score(y_test,y_hat)

0.41875

In [20]:
clf.score(x_train,y_train)

1.0

In [21]:
clf.score(x_test,y_test)

0.41875