## Importing the basic packages

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

## Importing the data

In [None]:
# Show the contents of the current working directory.
os.listdir()

In [None]:
# Read the red-wine quality CSV into a DataFrame.
data = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# Independent copy of the freshly loaded frame. `data` is modified later
# (columns renamed, `quality` overwritten); this copy keeps the values as loaded.
data_backup = data.copy()

# Understanding the data and preparation

In [None]:
# First rows of the dataset.
data.head()

In [None]:
# Last rows of the dataset.
data.tail()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for each numeric column.
data.describe()

In [None]:
# Report how many distinct values each column holds.
for column_name, distinct_count in data.nunique(dropna=False).items():
    print(f'{column_name} has {distinct_count} unique values')

In [None]:
# Count the missing entries in every column and report them one column per line.
missing_per_column = data.isna().sum()
for column_name, missing_count in missing_per_column.items():
    print(f'{column_name} has {missing_count} na values')

In [None]:
data.columns

In [None]:
# Replace the spaces in the multi-word column names with underscores.
# The target spellings (including "sulfer") are kept exactly as they are,
# because later cells select columns by these names. Single-word columns
# need no entry: a name that is not in the mapping is left untouched.
renamed_columns = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfer_dioxide',
    'total sulfur dioxide': 'total_sulfer_dioxide',
}
data.rename(columns=renamed_columns, inplace=True)

In [None]:
data.columns

## Data Exploration

In [None]:
# Bar chart of how many wines received each quality rating.
sns.countplot(data=data, x='quality')

# Print the same counts as text, in ascending order of rating.
rating_counts = data.quality.value_counts().sort_index()
for rating, n_wines in rating_counts.items():
    print(f'wine with quality rating of {rating} has {n_wines} instances')
        

In [None]:
# Histograms of the first six columns, stacked vertically in one tall figure.
plt.figure(figsize=(10, 25))

for position, column_name in enumerate(data.columns[:6], start=1):
    plt.subplot(6, 1, position)
    plt.hist(data[column_name], edgecolor='black')
    plt.xlabel(column_name)
    plt.ylabel('frequency of occurance')

# Widen the vertical gap between panels (applies to the whole figure).
plt.subplots_adjust(hspace=0.4)

In [None]:
# Histograms of columns 7 to 12, stacked vertically in one tall figure.
plt.figure(figsize=(10, 25))

for position, column_name in enumerate(data.columns[6:12], start=1):
    plt.subplot(6, 1, position)
    plt.hist(data[column_name], edgecolor='black')
    plt.xlabel(column_name)
    plt.ylabel('frequency of occurance')

# Widen the vertical gap between panels (applies to the whole figure).
plt.subplots_adjust(hspace=0.4)

In [None]:
# Violin plot of each of the first eleven columns against quality, on a 6 x 2 grid.
fig, axes = plt.subplots(6, 2)
fig.set_size_inches(10, 15)
plt.tight_layout(h_pad=2, w_pad=3)

panels = axes.ravel()
# zip stops after the eleven feature columns, leaving the twelfth panel empty.
for feature_name, panel in zip(data.columns[:11], panels):
    sns.violinplot(data=data, x='quality', y=feature_name, ax=panel)

# Remove the unused twelfth panel.
fig.delaxes(panels[-1])


In [None]:

# Pairwise scatter plots of all columns, coloured by quality rating.
sns.pairplot(data=data,hue="quality")

In [None]:
plt.figure(figsize=(10,5))
# Annotated heatmap of the pairwise correlation matrix of the columns.
sns.heatmap(data.corr(),annot=True)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
Model = ExtraTreesClassifier()

In [None]:
Model.fit(data.drop('quality',axis=1),data.loc[:,'quality'])

In [None]:
Model_Series =  pd.Series(Model.feature_importances_,index=data.columns[:11])

In [None]:
Model_Series

In [None]:
Model_Series.plot(kind = 'barh')

Based on the violin plots, pair plots and the correlation matrix, we can see that no variable has a strong relationship with the target variable, i.e. quality. Variables like alcohol, volatile acidity, sulphates, total sulfur dioxide, density and citric acid have only a weak to moderate relationship with the assigned quality rating.

On analysing the importance of each feature for predicting quality, we can see that alcohol, sulphates, total sulfur dioxide, density and volatile acidity play a slightly larger role in predicting the quality rating than the other features.

Hence we should use alcohol, sulphates, density, total sulfur dioxide and volatile acidity as our predictors.

## Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score , classification_report
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score

In [None]:
## Building a model with all variables

In [None]:
# Columns from the 12th-from-last up to (not including) the last one.
# NOTE(review): this matches X_data below only if the frame has exactly 12 columns
# with `quality` last; confirm. `X` is not referenced again in the cells visible here.
X = data.iloc[:,-12:-1]

In [None]:
X.head()

In [None]:
# Target: the last column of the frame.
Y = data.iloc[:,-1]

In [None]:
Y.head()

In [None]:
# Feature matrix: every column except `quality`.
X_data = data.drop('quality',axis=1)

In [None]:
X_data.head()

In [None]:
# Second name for Y (same object, not a copy).
Y_data = Y

In [None]:
RFC = RandomForestClassifier()

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=50)

In [None]:
cross_val_score(RFC,X_data[['volatile_acidity','sulphates','total_sulfer_dioxide','alcohol','density','citric_acid']],Y_data,scoring='accuracy',cv = cv,n_jobs=-1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data[['volatile_acidity','sulphates','total_sulfer_dioxide','alcohol','density','citric_acid']],Y,test_size= 0.2,random_state = 50)

In [None]:
RFC_default = RandomForestClassifier().fit(X_train,y_train)

In [None]:
accuracy_score(y_test,RFC_default.predict(X_test))

In [None]:
print(classification_report(y_test,RFC_default.predict(X_test)))

From the predictions and evaluations we can see that the predictors weren't able to make any accurate predictions for the wines rated 3 and 8. It also made very few predictions for wines of quality 4 and 7, most of which also happen to be inaccurate.

This could be due to the very low number of instances of wines of such quality, or to the fact that our classes are highly imbalanced, i.e. we have a very large number of wines rated 5 and 6 and very few wines rated 3, 4 and 8.

In [None]:
# Searching for best estimators

In [None]:
# Search grid for the random forest.
# max_depth must be an integer: np.linspace(10, 100, 10) yields floats (10.0, 20.0, ...),
# which recent scikit-learn parameter validation rejects. The same 10..100 grid is
# therefore built from plain ints.
parameters = {"max_depth" : list(range(10, 101, 10)),"min_samples_leaf":[1,2,4],'min_samples_split':[2,5,10],'bootstrap':[True,False]}

In [None]:
# Unfitted forest used as the template estimator for the search.
empty = RandomForestClassifier()

In [None]:
# Search over every combination in `parameters` on the training split.
# refit=True refits the best combination, which is then available as Grid.best_estimator_.
Grid = GridSearchCV(empty,parameters,refit=True).fit(X_train,y_train)

In [None]:
# Cross-validated score of the best parameter combination.
Grid.best_score_

In [None]:
# The parameter combination that achieved that score.
Grid.best_params_

In [None]:
# The forest refit with those parameters.
Grid.best_estimator_

In [None]:
RFC = RandomForestClassifier(max_depth=60).fit(X_train,y_train)

In [None]:
accuracy_score(y_test,RFC.predict(X_test))

In [None]:
print(classification_report(y_test,RFC.predict(X_test)))

Despite trying to search for the best parameters to use for the random forest model, we still couldn't get a more accurate model that could also predict wines of quality 3 and 8.

In [None]:
# Lets try to create a new classification that we will use for checking quality of wine

In [None]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3-Q1

In [None]:
lower_outliers = data < Q1-IQR*1.5

In [None]:
higher_outliers = data > Q1+IQR*1.5

In [None]:
data[lower_outliers|higher_outliers].count()

In [None]:
data[lower_outliers|higher_outliers].count()/data.count()

From the above we can conclude that we have a significant number of outliers in each variable, which could distort the mean. Hence, to check how wines of different quality ratings differ from each other, we use the median.

In [None]:
# For each pair of consecutive ratings (3-4, 4-5, ..., 7-8), print the per-column
# median of the higher rating minus that of the lower rating. The median is read
# from the "50%" row of describe().
for lower_rating, higher_rating in zip(range(3, 8), range(4, 9)):
    print(f'difference in medians of variables of wine with quality {lower_rating} and {higher_rating}')
    higher_medians = data[data.quality == higher_rating].describe().loc['50%']
    lower_medians = data[data.quality == lower_rating].describe().loc['50%']
    print(higher_medians - lower_medians)

We can see that there is very little difference in the medians of the variables for wines rated 3 and 4. However, there is some difference between the medians of the variables for wines rated 4 and 5.
- Hence we can try to put wines rated 3 and 4 in one group. 

Also, since wines rated 5 and 6 show very little difference between them,
- We can put them in another group. 

- We can also try to put wines rated 7 and 8 in another group and call them good. 

In [None]:
quality_description = []
for i in data_backup.quality:
    if i <=4 :
        quality_description.append('bad')
    elif i <=6:
        quality_description.append('average-moderately good')
    else:
        quality_description.append('good')

In [None]:
data.quality = quality_description

In [None]:
sns.countplot(x='quality',data= data)

In [None]:
# Violin plot of each of the first eleven columns against the regrouped quality
# label, on a 6 x 2 grid.
fig, axes = plt.subplots(6, 2)
fig.set_size_inches(10, 15)
plt.tight_layout(h_pad=2, w_pad=3)

panels = axes.ravel()
# zip stops after the eleven feature columns, leaving the twelfth panel empty.
for feature_name, panel in zip(data.columns[:11], panels):
    sns.violinplot(data=data, x='quality', y=feature_name, ax=panel)

# Remove the unused twelfth panel.
fig.delaxes(panels[-1])


We can see that good wines differ somewhat from wines rated bad or average in their levels of fixed and volatile acidity, citric acid and alcohol. However, it appears to be difficult to differentiate between wines rated bad and average/moderate.

In [None]:
# Pairwise scatter plots again, now coloured by the regrouped quality labels.
sns.pairplot(data=data,hue="quality")

From the pair-plots, again it can be seen that while it is easy to differentiate a good wine from the remaining 2 based on alcohol levels, sulphates, volatile_acidity and citric_acid levels, its very difficult to differentiate between wines of bad and average quality. We can't find any specific variables to clearly differentiate between bad and average wines .

In [None]:
Model = ExtraTreesClassifier()

In [None]:
Model.fit(data.drop('quality',axis=1),data.loc[:,'quality'])

In [None]:
Model_Series = pd.Series(Model.feature_importances_,index= data.columns[0:11])

In [None]:
Model_Series

In [None]:
Model_Series.plot(kind = 'barh')

On analysing the features that could help in predicting the quality of red wines, we can see that alcohol, sulphates, volatile acidity, citric acid, density and total sulfur dioxide are the top 6 features, playing a larger role than the other features.

Hence, we will use alcohol, sulphates, volatile acidity, citric acid, density and total sulfur dioxide for our new model.



In [None]:
X_data = data.drop('quality',axis=1)

In [None]:
Y = data.iloc[:,-1]

In [None]:
print('quality :','bad\n',data[data.quality=='bad'].describe().iloc[5,],'\n')
print('quality :','good\n',data[data.quality=='good'].describe().iloc[5,],'\n')
print('quality :','average moderately-good\n',data[data.quality=='average-moderately good'].describe().iloc[5,],'\n')

From the above median comparisons, we can see that there are very few differences between wines rated average and bad. This could again create problems.

In [None]:
RFC = RandomForestClassifier()

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=0)

In [None]:
cross_val_score(RFC,X_data[['alcohol','sulphates','volatile_acidity','total_sulfer_dioxide','density','citric_acid']],Y,scoring='accuracy',cv = cv,n_jobs=-1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data[['alcohol','sulphates','volatile_acidity','total_sulfer_dioxide','density','citric_acid']],Y,test_size= 0.2,random_state = 50)

In [None]:
RFC = RandomForestClassifier().fit(X_train,y_train)

In [None]:
accuracy_score(y_test,RFC.predict(X_test))

On using different classification values, our accuracy levels have improved drastically. However, to see how well the model has performed in identifying different classes, lets look at the classification report

In [None]:
# Per-class metrics for the fitted forest on the held-out test split.
print(classification_report(y_test,RFC.predict(X_test)))

This new model with new target categories has recorded high precision and recall values for predicting average-moderately good wines. It also has decent precision and recall values for good wines. However, it failed to predict which red wines were actually bad.

This is due to, again, the highly imbalanced number of instances for the three types of red wine based on quality.

However, We can also say that it is very easy to rate a wine average or somewhat good, but very few wines are getting very bad or very good ratings. People who are unable to make a clear cut decision rate the wines close to 5 or 6. But very few wines make the cut for higher rating of 7 and 8 or even above. Even fewer wines get a rating as low as 4 or 3, implying that the quality of most of the wines made in general is pretty much ok.  

In [None]:
# Using grid search for tuning

In [None]:
# Search grid for the random forest.
# max_depth must be an integer: np.linspace(10, 100, 10) yields floats (10.0, 20.0, ...),
# which recent scikit-learn parameter validation rejects. The same 10..100 grid is
# therefore built from plain ints.
parameters = {"max_depth" : list(range(10, 101, 10)),"min_samples_leaf":[1,2,4],'min_samples_split':[2,5,10],'bootstrap':[True,False]}

In [None]:
# Unfitted forest used as the template estimator for the search.
empty = RandomForestClassifier()

In [None]:
# Search over every combination in `parameters` on the training split.
# refit=True refits the best combination, which is then available as Grid.best_estimator_.
Grid = GridSearchCV(empty,parameters,refit=True).fit(X_train,y_train)

In [None]:
# The parameter combination with the best cross-validated score.
Grid.best_params_

In [None]:
# The forest refit with those parameters.
Grid.best_estimator_

In [None]:
# Cross-validated score of that combination.
Grid.best_score_

In [None]:
# Creating a new model with best parameters

In [None]:
RFC = RandomForestClassifier(max_depth=100).fit(X_train,y_train)

In [None]:
accuracy_score(y_test,RFC.predict(X_test))

In [None]:
print(classification_report(y_test,RFC.predict(X_test)))

After tuning the parameters, we now have a model that, though it has poor recall and precision scores for bad red wines, could make a couple of accurate predictions for them. The precision and recall scores for predicting good and average wines have also slightly improved.

On the whole, from all the comparisons made between wines of different ratings based on various parameters, we can say that the levels of alcohol, sulphates, citric acid, density, total sulfur dioxide and volatile acidity do have some bearing on the quality rating assigned to red wines.