In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

# Print the column names, non-null counts and dtypes.
dataset.info()

In [None]:
# Display summary statistics for the columns of the dataset.
dataset.describe()

From the info, we can see that there are 1599 non-null values in each column, with data types float64 and int64.
Let us check whether any null values are present (this is not strictly necessary, but just to be sure).

In [None]:
# Count the missing values in each column (isna is the canonical name for isnull).
dataset.isna().sum()

Since there are no missing values, there is no need to fill in any placeholder values. Now it's time to visualize the data. For data visualization, we will be using seaborn and matplotlib.pyplot.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

We will first use a pairplot.
A pairplot plots pairwise relationships in a dataset. The pairplot function creates a grid of Axes such that each variable in the data is shared on the y-axis across a single row and on the x-axis across a single column.

In [None]:
# Grid of pairwise plots, one row and one column per variable in the dataset.
sns.pairplot(data=dataset)
plt.show()

Next, we draw a heatmap to find correlations between the features.


In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr = dataset.corr()

# Diverging colour map built from hues 220 and 10.
colormap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(
    corr,
    annot=True,                 # write the coefficient inside each cell
    cmap=colormap,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
plt.show()

We can see that the following pairs are highly correlated with each other:
> citric acid and fixed acidity
<br>
> density and fixed acidity
<br>
> free sulfur dioxide and total sulfur dioxide

We now convert the quality (i.e., the target) into two categories, as we will be using a Decision Tree Classifier here.
As mentioned in the tips, if the quality is greater than 6.5 it is "good" (1); otherwise it is "bad" (0).

In [None]:
# Binarize the target: 1 ("good") when quality > 6.5, otherwise 0 ("bad").
# NOTE: this overwrites 'quality' in place, so re-running this cell without
# reloading the data yields all zeros (0 and 1 are both <= 6.5).
dataset['quality'] = (dataset['quality'] > 6.5).astype('int64')

In [None]:
# Bar chart of how many samples fall into each quality class.
sns.countplot(x='quality', data=dataset)
plt.show()

After visualization, the first thing to do is to separate the features from the target variable and then split it into training and test sets.

In [None]:
# Features: every column except the target. The `columns=` keyword replaces the
# positional axis argument (`drop('quality', 1)`), which pandas 2.0 no longer accepts.
X = dataset.drop(columns='quality')

# Target: the 'quality' column.
y = dataset['quality']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=37,
)

After splitting, let's apply our first decision tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Baseline model: a depth-limited decision tree with a fixed seed, fitted on the
# training split (fit returns the estimator itself, so the call can be chained).
dt_base = DecisionTreeClassifier(random_state=4, max_depth=10).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
dt_base

Let's find the accuracy of our first model

In [None]:
from sklearn import metrics

In [None]:
# Predict class labels for the held-out test features with the baseline tree.
y_pred=dt_base.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

We can see that our model is 88.5 % accurate. Let's visualize the tree
<br>
Trees can be visualized with the help of function plot_tree. So, let's start.

In [None]:
# Draw the top levels of the baseline tree on a large canvas with an explicit font
# size; at matplotlib's default figure size the node text is too small to read.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    dt_base,
    max_depth=2,                    # only show the first levels below the root
    feature_names=list(X.columns),  # label splits with column names instead of indices
    class_names=['bad', 'good'],    # ascending class order: 0 = bad, 1 = good
    filled=True,                    # colour nodes by majority class
    fontsize=12,
)
plt.show()

We can see that, at the default figure size, the tree is too small to read. I am currently looking for a better way to display it and will update this notebook as soon as I find one.

It's time for HYPERPARAMETER TUNING

In [None]:
# Number of nodes in the fitted baseline tree.
dt_base.tree_.node_count

We can see that the tree has 179 nodes, so we will search `min_samples_leaf` and `min_samples_split` over values up to roughly 200 and see the result.

In [None]:
# Hyperparameter search space for the decision tree grid search.
param_grid = dict(
    max_depth=range(4, 20, 4),              # 4, 8, 12, 16
    min_samples_leaf=range(20, 200, 20),    # 20, 40, ..., 180
    min_samples_split=range(20, 200, 20),   # 20, 40, ..., 180
    criterion=['gini', 'entropy'],
)

# Number of cross-validation folds.
n_folds = 5

We use GridSearchCV to train models with different hyperparameter combinations and compare the results.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Untuned tree with a fixed seed; the grid search sets the remaining hyperparameters.
dt = DecisionTreeClassifier(random_state=34)

# Cross-validated search over param_grid, also recording the training scores.
grid = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=n_folds,
    return_train_score=True,
)

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train,y_train)

After training, let's look at the best-performing parameter combination.

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Keep a handle on the estimator chosen by the grid search and display it.
best_tree = grid.best_estimator_
best_tree

Now, finding the accuracy of this best_tree

In [None]:
# GridSearchCV was built without overriding `refit` (default True), so
# best_estimator_ has already been refitted on the full training split;
# fitting it again here would only repeat that work.
y_pred_best = best_tree.predict(X_test)

In [None]:
# Test-set accuracy of the tuned tree (overwrites the baseline value stored in `acc`).
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(acc)

We can see that tuning the hyperparameters does not have much effect. It is possible that the model has overfit to the majority class, since the count plot shows that most samples have the target value 0. To get a more accurate result, we could collect more data to work with.