# Introduction


## Importing the necessary Python libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting
import matplotlib.image as mpimg
import seaborn as sns # to generate stylised plots

### Loading the input file

In [None]:
# Read the red-wine quality CSV into the DataFrame used throughout the notebook.
df = pd.read_csv(
    filepath_or_buffer='../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv',
)

### Glimpse at each column of the data

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

### Descriptive statistics on each column

In [None]:
# Summary statistics, transposed so that each row describes one column of df.
df.describe().transpose()

### Check for any missing data

In [None]:
# Number of missing values per column.
df.isnull().sum()

# Exploratory Data Analysis

#### Correlations between the columns

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on the axes
# created just above.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data=df.corr(), annot=True, cmap='plasma', ax=ax)

#### Let's display the barplot of the correlations of various columns with 'quality'

In [None]:
# Correlation of every feature with 'quality', smallest to largest.
# The target columns are removed by label rather than by position, so the chart
# stays correct when this cell is re-run after 'quality_binary' has been added
# to df (a positional [:-1] would then keep the trivial quality/quality entry).
quality_corr = df.corr()['quality'].drop(labels=['quality', 'quality_binary'], errors='ignore')
quality_corr.sort_values().plot(kind='barh', figsize=(13, 10), colormap='plasma')

#### List of labels available for prediction:

In [None]:
# How many wines received each quality score.
df['quality'].value_counts()

### Let's put aside the regression modelling and try doing a classification model which predicts whether the wine is good or not

In [None]:
# Binary target: 1 for wines rated above 6, 0 for everything else.
df['quality_binary'] = (df['quality'] > 6).astype('int64')

In [None]:
# Class balance of the binary target.
df['quality_binary'].value_counts()

In [None]:
# Feature matrix: every column except the two quality targets.
X = df.drop(['quality', 'quality_binary'], axis=1).to_numpy()
# Binary target as a column vector of shape (n_samples, 1).
y = df['quality_binary'].to_numpy().reshape(-1, 1)

## Feature Engineering

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split is made.
X_scaler = MinMaxScaler()
X_scaler.fit(X)
X_new = X_scaler.transform(X)
# The binary target is left unscaled.

#### Split train set and test set

In [None]:
from sklearn.model_selection import train_test_split

# 75 % of the rows go to training; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    random_state=101,
    train_size=0.75,
)

#### Instantiate a Decision Tree classifier

In [None]:
from sklearn import tree

# Seed the tree so that repeated runs produce the same model (and therefore the
# same metrics and plots); the value mirrors the seed of the train/test split.
model1 = tree.DecisionTreeClassifier(random_state=101)
model1.fit(X_train, y_train)

#### Predicting the outputs for the test set 'X_test'

In [None]:
# Class labels predicted by the fitted classifier for the held-out rows.
y_pred = model1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc

# Per-class precision/recall/F1 followed by the confusion matrix.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

In [None]:
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (label 1, second column of predict_proba) instead of on the
# already-thresholded 0/1 labels.
roc_auc_score(y_test, model1.predict_proba(X_test)[:, 1])

In [None]:
# Names of the feature columns (everything except the two quality targets).
feature_set = df.columns.drop(['quality', 'quality_binary'])

#### A high-level look at the decision tree, down to a depth of 2

In [None]:
import graphviz
import pydot

# Export the classifier as GraphViz DOT text.
# - out_file=None makes export_graphviz return the DOT string, so dot_data
#   holds real content instead of None.
# - max_depth=2 limits the drawing to the top levels, matching the
#   "depth of 2" overview announced for this section.
# - class_names must be a sequence with one label per class in ascending
#   order (0, 1); a bare string is not a valid value.
dot_data = tree.export_graphviz(
    model1,
    out_file=None,
    max_depth=2,
    feature_names=list(feature_set),
    class_names=['not good', 'good'],
    filled=True,
    rounded=True,
    special_characters=True,
)
# Keep writing the .dot file that the rendering step below reads.
with open('class_tree.dot', 'w', encoding='utf-8') as dot_file:
    dot_file.write(dot_data)
(graph,) = pydot.graph_from_dot_file('class_tree.dot')
graph.write_png('DT_classifier.png')

#### Display the decision tree classifier which predicts if the wine is of good quality or not

In [None]:
# Show the rendered classifier tree on a large canvas without axis decorations.
plt.subplots(figsize=(15, 15))
plt.axis('off')
classifier_png = mpimg.imread('DT_classifier.png')
plt.imshow(classifier_png)

#### Let's predict the output quality using a regression model

In [None]:
# Same feature matrix as before; the target is now the raw quality score,
# kept as a column vector so it can be passed to the scaler.
X = df.drop(['quality', 'quality_binary'], axis=1).to_numpy()
y = df['quality'].to_numpy().reshape(-1, 1)
# Min-max scale the features and the target, each with its own scaler.
X_scaler = MinMaxScaler()
X_new = X_scaler.fit_transform(X)
y_scaler = MinMaxScaler()
y_new = y_scaler.fit_transform(y)

In [None]:
# 75 % of the rows go to training; same seed as the classification split.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y_new,
    random_state=101,
    train_size=0.75,
)

#### Train the Decision tree regressor using the scaled data set

In [None]:
# Seed the regressor so that repeated runs produce the same tree and metrics;
# the value mirrors the seed of the train/test split.
model2 = tree.DecisionTreeRegressor(random_state=101)
model2.fit(X_train, y_train)

In [None]:
# Regressor predictions for the held-out rows.
y_pred = model2.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Print R^2, MAE and MSE, one per line.
# NOTE(review): y_test comes from the min-max scaled target, so the two error
# values are in scaled units rather than quality points.
for metric in (r2_score, mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_pred))

In [None]:
# Export the regressor as GraphViz DOT text.
# - out_file=None makes export_graphviz return the DOT string, so dot_data2
#   holds real content instead of None.
# - class_names is omitted: it only applies to classifiers, and a bare string
#   is not a valid value for it.
dot_data2 = tree.export_graphviz(
    model2,
    out_file=None,
    feature_names=list(feature_set),
    filled=True,
    rounded=True,
    special_characters=True,
)
# Keep writing the .dot file that the rendering step below reads.
with open('regression_tree.dot', 'w', encoding='utf-8') as dot_file:
    dot_file.write(dot_data2)
(graph2,) = pydot.graph_from_dot_file('regression_tree.dot')
graph2.write_png('DT_regressor.png')

#### Display the Decision tree regressor which estimates the quality of wine 

In [None]:
# Show the rendered regressor tree on a wide canvas without axis decorations.
plt.subplots(figsize=(50, 15))
plt.axis('off')
regressor_png = mpimg.imread('DT_regressor.png')
plt.imshow(regressor_png)