In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing

In [2]:
df = pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
y = df.quality
X = df.drop('quality', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [6]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
551,9.4,0.43,0.24,2.8,0.092,14.0,45.0,0.998,3.19,0.73,10.0
1250,7.1,0.6,0.01,2.3,0.079,24.0,37.0,0.99514,3.4,0.61,10.9
1379,7.5,0.57,0.02,2.6,0.077,11.0,35.0,0.99557,3.36,0.62,10.8
958,6.4,0.57,0.12,2.3,0.12,25.0,36.0,0.99519,3.47,0.71,11.3
41,8.8,0.61,0.3,2.8,0.088,17.0,46.0,0.9976,3.26,0.51,9.3


In [7]:
X_train_scaled = preprocessing.scale(X_train)
print(X_train_scaled)

[[ 6.16584192e-01 -5.46674616e-01 -1.54881134e-01 ... -7.79120367e-01
   4.27877886e-01 -3.96844970e-01]
 [-6.98625523e-01  4.03548917e-01 -1.34168455e+00 ...  5.73937563e-01
  -2.74257236e-01  4.52109488e-01]
 [-4.69893398e-01  2.35862411e-01 -1.29008440e+00 ...  3.16212243e-01
  -2.15745976e-01  3.57781215e-01]
 ...
 [-1.55637099e+00 -1.55406102e-01 -9.28883361e-01 ...  2.05585815e+00
  -1.57234715e-01  2.43300322e+00]
 [ 1.59119943e-01 -3.23092608e-01 -8.06882697e-05 ... -8.43551697e-01
   8.37456707e-01  7.35094308e-01]
 [-4.69893398e-01  6.83026427e-01 -4.64482025e-01 ...  4.45074903e-01
   6.61922926e-01 -9.62814608e-01]]


In [8]:
clf=tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
confidence = clf.score(X_test, y_test)
print("\nThe confidence score:\n")
print(confidence)


The confidence score:

0.565625


In [10]:
y_pred = clf.predict(X_test)

In [11]:
#converting the numpy array to list
x=np.array(y_pred).tolist()

#printing first 5 predictions
print("\nThe prediction:\n")
for i in range(0,5):
    print(x[i])
    
#printing first five expectations
print("\nThe expectation:\n")
print(y_test.head())


The prediction:

5
6
5
8
6

The expectation:

6       5
1421    5
1193    5
911     6
881     6
Name: quality, dtype: int64


In [13]:
import pickle

pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
