# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



# data visualization libraries.

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
# Location of the input CSV inside the Kaggle input directory; read_csv() loads it
# into a DataFrame.
csv_path = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"

red_wine_data = pd.read_csv(csv_path)

In [None]:
# To take a closer look at the data, we use the ".head()" method of the pandas library, which returns the first five
# observations of the data set.

# Similarly, tail() returns the last five observations of the dataset.

red_wine_data.head()

In [None]:
# shape: (number of rows, number of columns) of the DataFrame

red_wine_data.shape

The describe() function in pandas is very handy in getting various summary statistics. This function returns the count, mean, standard deviation, minimum and maximum values and the quantiles of the data

In [None]:
# Display the summary statistics of the DataFrame's columns.
red_wine_data.describe()

In [None]:
# List the column labels of the DataFrame as loaded from the CSV.
red_wine_data.columns

In [None]:
# Keep the 'quality' column as the target Series and preview its first rows.
red_wine_target = red_wine_data.quality
red_wine_target.head()

In [None]:
# Distinct values present in the target Series.
red_wine_target.unique()

In [None]:
# Number of rows for each distinct target value.
red_wine_target.value_counts()

Rename columns

In [None]:
# Replace the spaces in the multi-word column labels with underscores (in place).
column_renames = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}
red_wine_data.rename(columns=column_renames, inplace=True)


In [None]:
# Show the column labels again to confirm the rename took effect.
red_wine_data.columns

In [None]:
# info(): prints a concise summary of the DataFrame (columns, non-null counts, dtypes).
red_wine_data.info()

In [None]:
# Missing values: isna() flags each missing entry and sum() totals them per column.

red_wine_data.isna().sum()

In [None]:
# isnull() is an alias of isna(), so this repeats the per-column missing-value count.

red_wine_data.isnull().sum()

In [None]:
# Check for duplicates: duplicated() marks every row that repeats an earlier row.
duplicate_mask = red_wine_data.duplicated()

# Keep only the flagged rows and report how many there are (rows, columns).
duplicate_entries = red_wine_data.loc[duplicate_mask]
duplicate_entries.shape

Histograms use bars to visualize data as well. Many people may not even realize there is a difference between a histogram and a bar chart. They practically look the same from a distance.

The key is that a histogram looks solely at quantitative variables while a bar chart looks at categorical variables. That’s why the bars in a histogram are typically grouped together without spacing in between the bars.

In [None]:
# Draw one 10-bin histogram per column on a 16x12 figure, then render it.
hist_options = {"bins": 10, "figsize": (16, 12)}
red_wine_data.hist(**hist_options)
plt.show()

Observations:
* The distribution of the attribute “alcohol” seems to be positively skewed, i.e. most values sit on the left of the histogram and a long tail extends to the right.
* The attributes 'density' and 'pH' are quite normally distributed.
* Now looking at the attribute quality, we can observe that the wines with average quality (i.e. quality rating 5 to 7) are more than wines with bad(1-4) or good(8-10) quality.

# Correlation

Correlation:
Correlation is a statistical measure. Data correlation is a way to understand the relationship between multiple values or features in your dataset.

Every single successful data science project revolves around finding accurate correlations between the input and target variables. However, more often than not, we overlook how crucial correlation analysis is.

It is recommended to perform correlation analysis before and after data gathering and transformation phases of a data science project.

 There are three different types of correlations:

* Positive Correlation: Two features (variables) can be positively correlated with each other. It means that when the value of one variable increases then the value of the other variable(s) also increases (also decreases when the other decreases).
Eg. The more time you spend running on a treadmill, the more calories you will burn.

* Negative Correlation: Two features (variables) can be negatively correlated with each other. This occurs when the value of one variable increases and the value of another variable(s) decreases (inversely proportional).
Eg. As the temperature drops, heating costs increase.

* No Correlation: Two features might not have any relationship with each other. This happens when the value of a variable is changed then the value of the other variable is not impacted.
Eg. There is no relationship between the amount of tea drunk and level of intelligence.
* Each of these correlation types exists in a spectrum represented by values from -1 to +1 where slight or high positive correlation features can be like 0.5 or 0.7.
* A very strong and perfect positive correlation is represented by a correlation score of 0.9 or 1.
* If there is a strong negative correlation, it will be represented by a value of -0.9 or -1. Values close to zero indicate no correlation.

In [None]:
# Pairwise correlation matrix of the DataFrame's columns.
red_wine_data.corr()

# Data Visualization

In [None]:
# Annotated heatmap of the pairwise correlation matrix on a 16x12 figure.
correlations = red_wine_data.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlations, annot=True, cmap='bwr')  # annot=True writes each correlation value in its cell


Observations:
* Alcohol has the highest positive correlation with wine quality, followed by the various other variables such as acidity, sulphates, density & chlorides.
* There is a relatively high positive correlation between fixed_acidity and citric_acid, fixed_acidity and density.
* There is a relatively high negative correlation between fixed_acidity and pH.
* Density has a strong positive correlation with fixed_acidity, whereas it has a strong negative correlation with alcohol.
* citric acid & volatile acidity have negative correlation.
* free sulphur dioxide & total sulphur dioxide have positive correlation.

In [None]:
# Count plot: one bar per distinct quality score, bar height = number of wines.

plt.figure(figsize=(12, 8))
# The column is passed as `x=` because newer seaborn releases accept only `data`
# positionally; a positional Series would no longer be read as the x variable.
sns.countplot(x=red_wine_data.quality)

Wines of average quality (5-7) outnumber both bad (1-4) and good (8-10) wines.

In [None]:
# Grid of pairwise plots across the DataFrame's columns.
sns.pairplot(red_wine_data)

A box plot is a great way to get a visual sense of an entire range of data. It can tell you about your outliers and what their values are. It can also tell you if your data is symmetrical, how tightly your data is grouped, and if and how your data is skewed.

Box plots divide data into its quartiles. The “box” shows a user the data set between the first and third quartiles.

The median gets drawn somewhere inside the box and then you see the most extreme non-outliers to finish the plot. Those lines are known as the “whiskers”. If there are any outliers then those can be plotted as well.

With box plots you can answer how diverse or uniform your data might be. You can identify what is normal and what is extreme. Box plots help give a shape to your data that is broad without sacrificing the ability to look at any piece and ask more questions.

It displays the five-number summary of a set of data. The five-number summary is:

* minimum
* first quartile (Q1)
* median
* third quartile (Q3)
* maximum

In [None]:
# sns.boxplot(y, red_wine_data['alcohol'], palette='GnBu_d')
# plt.title("Boxplot of Quality of alcohol")
# plt.show()

* The above plot shows the increase in the quality of wine with an increase in alcohol. The quality of the wine is directly related to the amount of alcohol in the wine. More the alcohol in the wine, the better will be the quality.
* Also, the points lying outside the whiskers(the lines extending from the rectangular box) are the outliers.

Dividing wine into good and bad by setting a threshold on the quality score.

# Preprocessing: Encoding

In [None]:
# Binarise the 'quality' score. pd.cut uses right-closed intervals by default, so with
# these edges scores in (2, 6] are labelled 'bad' and scores in (6, 8] are labelled 'good'.
bins = (2, 6, 8)
group_names = ['bad', 'good']

red_wine_data['quality'] = pd.cut(red_wine_data['quality'], bins=bins, labels=group_names)

label_quality = LabelEncoder()

# LabelEncoder numbers the classes in sorted order, so 'bad' will be 0 and 'good' will be 1.
red_wine_data['quality'] = label_quality.fit_transform(red_wine_data['quality'])
print(red_wine_data['quality'].value_counts())

# The column is passed as `x=` because newer seaborn releases accept only `data`
# positionally; a positional Series would no longer be read as the x variable.
sns.countplot(x=red_wine_data['quality'])
plt.show()
               

# Data Modeling

In [None]:
# 'quality' is the target variable (y); all columns except it are the input variables (x).
target_column = 'quality'
y = red_wine_data[target_column]
x = red_wine_data.drop(columns=[target_column])

In [None]:
# Display the target Series.
y

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25,  random_state = 50)

Scaling

In [None]:
# Standardise the features. The scaler is fitted on the training set only; the test
# set is then transformed with those same training statistics. Re-fitting on the test
# set would leak test information and put the two sets on different scales.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

x_test

# Decision Tree 

In [None]:
# Fit a decision tree (depth capped at 200) on the training set and score it on the test set.
dtc = DecisionTreeClassifier(max_depth=200).fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
dtc_acc = metrics.accuracy_score(y_true=y_test, y_pred=dtc_pred)
dtc_acc

In [None]:
# Sweep max_depth over 1..Ks-1 and record the test-set accuracy of each tree.
Ks = 100
mean_acc = np.zeros(Ks - 1)

for slot, n in enumerate(range(1, Ks)):
    # slot == n - 1: accuracies are stored in order of increasing depth.
    dtc = DecisionTreeClassifier(max_depth=n)
    dtc.fit(x_train, y_train)
    yhat = dtc.predict(x_test)
    mean_acc[slot] = accuracy_score(y_test, yhat)

mean_acc

In [None]:
# Index i of mean_acc holds the accuracy for depth i + 1, hence the +1 on argmax.
best_tree_acc = mean_acc.max()
best_depth = mean_acc.argmax() + 1
print("the best accuracy is", best_tree_acc, "with depth =", best_depth)

Decision Tree Classification Report

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision/recall and the support column are computed against the wrong vector.
# Note: yhat holds the predictions of the last tree fitted in the depth sweep.
cf = metrics.classification_report(y_test, yhat)
print(cf)

# Random Forest Classifier

In [None]:
# Fit a random forest with default settings and print its test-set accuracy as a percentage.
rfc = RandomForestClassifier().fit(x_train, y_train)
rfc_preds = rfc.predict(x_test)
rfc_acc = metrics.accuracy_score(y_test, rfc_preds)
print(100 * rfc_acc)

In [None]:
# Sweep n_estimators over 1..Ks-1 and record the test-set accuracy of each forest.
Ks = 100
mean_acc = np.zeros(Ks - 1)

for slot, n in enumerate(range(1, Ks)):
    # slot == n - 1: accuracies are stored in order of increasing forest size.
    rfc = RandomForestClassifier(n_estimators=n)
    rfc.fit(x_train, y_train)
    yhat = rfc.predict(x_test)
    mean_acc[slot] = accuracy_score(y_test, yhat)

mean_acc

In [None]:
# Index i of mean_acc holds the accuracy for n_estimators = i + 1, hence the +1 on argmax.
best_forest_acc = 100 * mean_acc.max()
best_n_estimators = mean_acc.argmax() + 1
print('The best accuracy is ', best_forest_acc, "with n_estimator = ", best_n_estimators)