In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the red-wine quality CSV into a DataFrame; the path is named so it is
# easy to spot and change.
WINEQUALITY_CSV = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
winequality = pd.read_csv(WINEQUALITY_CSV)

# *Data Visualization*

Let's have a look at the first 5 rows of our dataset.

In [None]:
# Preview the first five rows (5 is also head()'s default).
winequality.head(n=5)

Now let's see if there is any missing data in our dataset. This step is crucial in any data analysis: if many values are missing for a feature, the conclusions we draw from it may be totally wrong. And we do not want to take any risks with wine, do we?

In [None]:
# Print each column's dtype and non-null count to check for missing values.
winequality.info()

We can see from the above output that there are no null values. We can now safely start examining the relations among the different features using data visualization techniques.

In [None]:
# seaborn (conventionally imported as sns) is a very popular Python plotting library; it makes it easy to plot and infer relations between two parameters
import seaborn as sns 
import matplotlib.pyplot as plt

As seen above, all of the data is in numerical form, which makes our analysis much easier as we do not have to deal with strings. We will jump directly into a heatmap.

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the default method).
winequality.corr(method="pearson")

In [None]:
# Draw the correlation matrix as an annotated heatmap on an 8x8-inch figure;
# each cell shows its coefficient rounded to one decimal.
correlations = winequality.corr()
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(data=correlations, ax=ax, annot=True, fmt='.1f')
plt.show()

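The greater the absolute value of the number inside a box, the stronger the dependency between the two features; the sign tells whether they move in the same or in opposite directions.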

With the above understanding, let us see how quality is related to the other factors.

Alcohol, sulphates and citric acid positively influence final wine quality,
while volatile acidity negatively influences it.

A few other conclusions can be derived from the heatmap:

1. Free sulphur dioxide and total sulphur dioxide are related (as expected)
2. Fixed acidity, citric acid and residual sugar influence pH

Let us start to plot each of these variables against quality.

In [None]:
# Scatter plot of quality against alcohol with a fitted regression line,
# drawn on a wide 20x6-inch figure.
fig = plt.figure(figsize=(20, 6))
sns.regplot(data=winequality, x='alcohol', y='quality')

From the scatterplot it can be seen that alcohol positively influences wine quality. An alcohol percentage above 11.5 generally gives us a good review.

In [None]:
# Bar chart of fixed acidity for each quality rating.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=winequality, x='quality', y='fixed acidity')

The bar graph confirms what we observed in the heatmap: there is not much dependency between wine quality and fixed acidity.

In [None]:
# Bar chart of sulphates for each quality rating.
fig = plt.figure(figsize=(20, 6))
sns.barplot(data=winequality, x='quality', y='sulphates')

In [None]:
# Bar chart of how many wines received each quality rating.
# The column is passed through the `x=` keyword: recent seaborn releases accept
# only `data` positionally, so a bare positional Series is no longer read as `x`.
sns.countplot(x=winequality['quality'])

It's clear from the above graph how our wine quality is distributed over the different ratings.

# *Preparing data for machine learning*

Here we categorise wine quality into 3 segments. From the above graph it is clear that most wines are in the rating 5-6, so we consider this as average rating. Anything below 5 will be a bad rating and any other rating will be good.

In [None]:
# Map every numeric quality score to a coarse label:
#   below 5 -> "Bad", exactly 5 or 6 -> "Average", anything else -> "Good".
quality = winequality["quality"].values
category = [
    "Bad" if score < 5 else "Average" if score in (5, 6) else "Good"
    for score in quality
]


With the above assumption, we replace the numerical quality column in our main dataset with the categorised rating.

In [None]:
# Build the dataset used for prediction: wrap the labels in a one-column frame,
# attach it next to the original columns, and drop the numeric "quality" column
# the labels were derived from.
category = pd.DataFrame(category, columns=["category"])
winedata = pd.concat([winequality, category], axis=1).drop(columns=["quality"])

In [None]:
# Preview the first five rows of the prediction dataset (5 is also head()'s default).
winedata.head(n=5)

In [None]:
# Split the frame by position: every column except the last is a feature,
# and the last column holds the target label.
n_features = winedata.shape[1] - 1
X = winedata.iloc[:, :n_features].values
y = winedata.iloc[:, n_features].values

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string labels in y as integer class ids. The fitted encoder stays
# available as `labelencoder_y`.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

# *Machine Learning Models using the scikit-learn library*

Here we first split our data into training and testing. Training data will contain 80% while testing data will be 20% of main dataset

We train the following models on our data:
1. Random Forest Classifier
2. KNN
3. Logistic Regression
4. DecisionTree
5. Naive Bayes

At the end, we compare how each model will perform on our data and finalize on the model based on performance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# the same on every run.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 250-tree random forest on the training split, predict the test split and
# print the per-class classification report.
# random_state is pinned (same seed as the train/test split) because the forest
# is randomized: without it, every run of this cell reports different scores.
random_result = RandomForestClassifier(n_estimators=250, random_state=0)
random_result.fit(X_train, y_train)
res_forest = random_result.predict(X_test)
print(classification_report(y_test, res_forest))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# k-nearest neighbours (default settings) classifies by distance between rows, so
# features with large numeric ranges would dominate the result. The features are
# therefore standardized first. The scaler lives inside the pipeline, so it is
# fitted on the training split only and merely applied to the test split.
knn_result = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_result.fit(X_train,y_train)
res_knn=knn_result.predict(X_test)
print(classification_report(y_test, res_knn))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Logistic regression on standardized features. Scaling helps the solver converge
# within its default iteration budget (unscaled inputs commonly trigger a
# ConvergenceWarning) and keeps the regularization comparable across features.
# The scaler lives inside the pipeline, so it is fitted on the training split only.
lr_result = make_pipeline(StandardScaler(), LogisticRegression())
lr_result.fit(X_train, y_train)
res_logRes = lr_result.predict(X_test)
print(classification_report(y_test, res_logRes))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the training split, predict the test split and print
# the per-class classification report.
# random_state is pinned (same seed as the train/test split) because tree
# construction involves randomness, so unseeded runs can report different scores.
DecTree_res = DecisionTreeClassifier(random_state=0)
DecTree_res.fit(X_train,y_train)
res_DecTree = DecTree_res.predict(X_test)
print(classification_report(y_test, res_DecTree))

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes: fit on the training split (fit() returns the estimator
# itself), predict the test split and print the per-class classification report.
NaiBay_res = GaussianNB().fit(X_train, y_train)
res_NaiBay = NaiBay_res.predict(X_test)
print(classification_report(y_test, res_NaiBay))

In [None]:
# Collect the test-set accuracy of every fitted model into one comparison table,
# with one row per model in the order the models were trained.
predictions_by_model = {
    "Random Forest": res_forest,
    "KNN": res_knn,
    "LogisticRegression": res_logRes,
    "DecisionTree": res_DecTree,
    "NaiveBayes": res_NaiBay,
}
final_result = pd.DataFrame({
    'models': list(predictions_by_model),
    'accuracy_score': [
        accuracy_score(y_test, predicted)
        for predicted in predictions_by_model.values()
    ],
})

In [None]:
# Bar chart comparing the accuracy score of each model.
fig = plt.figure(figsize=(6, 6))
sns.barplot(data=final_result, x='models', y='accuracy_score')

# *Conclusion*

Naive Bayes performed the worst, while KNN and Logistic Regression fared slightly better.

Decision Tree had an accuracy score of 0.81, which made it better than Naive Bayes but not as good as KNN and Logistic Regression.

Random Forest, with an accuracy of 0.89, clearly emerged as the best one.