In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1-Import the dataset

The first step is obviously to import the dataset into Python. The column names will not be changed since they are representative of the data in its respective column.

In [None]:
# Load the red-wine quality CSV into a DataFrame, keeping the original column names.
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)
df

# 2-Explore and clean the dataset

The second step is to explore and clean the dataset. I will check first of all the types of variable that we have for each column.

In [None]:
# Print each column's dtype and non-null count to check the variable types.
df.info()

We can see clearly that we are only dealing with continuous numerical features because their dtype is float64. However, for the response variable, we are dealing with a categorical numerical variable, which is the quality of the wine, on a scale of 10. 

Let's check now the summary statistics of each variable in the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

First, we can see that the count for each variable in the dataset is 1599, which means that we will not have to deal with missing values. This is great news, since we want to avoid as much as possible imputing missing values with the mean, median or mode, which can be inaccurate with respect to the observations that we have.

From this table, there does not seem to be any abnormal values. 

Let's now count the number of observations in each category for the quality of the wine.

In [None]:
# Number of observations for each distinct value of the quality score.
df['quality'].value_counts()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Histogram of the quality scores with a KDE overlay, normalised to a density.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is the
# documented replacement. The trailing `;` suppresses the Axes repr, which the
# original `print(...)` wrote to the cell output.
sns.histplot(df["quality"], color='red', kde=True, stat='density');

As we can see, there are a lot of observations in the 5 and 6 range, and a few observations in the 3, 4, 7 and 8 range. There seem to be no values in the other ranges, which will simplify the problem. 

Let's now analyze the outliers of the numerical features of the dataset with boxplots.

In [None]:
# The eleven continuous predictors (every column except the quality score).
numerical_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# One separate figure per feature so each boxplot gets its own scale.
for feature_name in numerical_features:
    fig, ax = plt.subplots()
    df.boxplot(column=[feature_name], grid=False, ax=ax)

We can notice, from the boxplots, that there are many outliers in the dataset. However, I will only try to delete the most exaggerated ones. For example, for the volatile acidity, I will delete only the values that are 1.1 or above, since there are too many values close to 1 to discard them all. I will then keep all the values that are between 1 and 1.1 even if they are outliers. I will repeat the same process for the other variables.

In [None]:
# Exclusive upper bound per feature; a row is kept only if it is strictly below
# every bound (thresholds chosen by eye from the boxplots).
outlier_upper_bounds = {
    'volatile acidity': 1.1,
    'citric acid': 0.9,
    'residual sugar': 10.0,
    'chlorides': 0.3,
    'free sulfur dioxide': 45,
    'total sulfur dioxide': 250.0,
    'pH': 3.7,
    'sulphates': 1.5,
}
within_bounds = (
    df[list(outlier_upper_bounds)]
    .lt(pd.Series(outlier_upper_bounds))
    .all(axis=1)
)
# `.copy()` makes the filtered frame independent of the original, so later
# in-place assignments to it do not raise SettingWithCopyWarning.
df = df[within_bounds].copy()
df

As we can see, there are still 1512 observations remaining in the dataset after having removed the outliers compared to 1599 observations before, which is reasonable to build a machine learning model.

Moreover, to simplify the prediction, I will turn the wine quality into a binary classification target: 0 will be a bad wine (quality 3 to 6) and 1 will be a good wine (quality 7 or 8).

In [None]:
# Recode quality to a binary label: scores of 7 and above become 1, all others 0.
# The mask is computed once, before any value is overwritten.
is_good_wine = df['quality'] >= 7
df.loc[~is_good_wine, 'quality'] = 0
df.loc[is_good_wine, 'quality'] = 1

# 3-Features selection

In this step, I will choose the most important features that I will include in my machine learning model. Since my response variable is now binary (bad or good wine) and all the other variables are continuous, I will perform a Welch two-sample t-test (unequal variances) for each feature, comparing its values between the two quality groups. If the p-value is under 0.05, that means that there is a relationship between the response variable and the feature, which means that I will keep that feature in my model. Otherwise, I will not keep that feature in my model.

In [None]:
import scipy.stats as stats

numerical_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# For each feature, compare its values in the quality == 0 and quality == 1
# groups with a two-sample t-test that does not assume equal variances, and
# report whether the p-value falls under the 0.05 threshold.
for feature_name in numerical_features:
    bad_wine_values = df[feature_name][df['quality'] == 0]
    good_wine_values = df[feature_name][df['quality'] == 1]
    p_value = stats.ttest_ind(bad_wine_values, good_wine_values, equal_var=False).pvalue
    decision = 'We keep the' if p_value < 0.05 else 'We do not keep the'
    print(decision, feature_name, 'in the model', p_value)

From my t-tests, we can clearly see that I will keep all my features except the residual sugar in my model to predict the wine quality.

# 4-Build the Random Forest model

Now that I have chosen all the features to predict the wine quality, I will build my machine learning model. I will first set x1 as my features that I chose earlier and y as my response variable (wine quality). I will thereafter split my dataset into two sets and scale my features to get a better accuracy.

In [None]:
# Features are every column except the target; the target is the binary quality label.
X1=df.drop(columns=['quality'])
y=df['quality']

from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X1,y,test_size=0.3,random_state=0)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only and reuse it for the test set.
# Fitting a second scaler on the test set would standardise it with its own
# mean/std, so train and test would no longer be on the same scale and test
# statistics would leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Now, it is time to fit the model. To get the best accuracy as possible, I will test different number of estimators to see which one gives the best accuracy. I will then choose that number of estimators to predict and to calculate my final accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the forest so the selected n_estimators and the reported accuracy are
# reproducible across runs (the train/test split is already seeded with 0).
RF = RandomForestClassifier(random_state=0)
# Candidate forest sizes evaluated by the grid search.
RF_reg_parameters = { 'n_estimators': [1,5,10,50,100,200,500] }
# 10-fold cross-validated search over the number of trees.
grid_RF_acc = GridSearchCV(RF, param_grid = RF_reg_parameters,cv=10)
grid_RF_acc.fit(X_train, y_train)

My optimal number of estimators, as it is mentioned below, is 200.

In [None]:
# Number of trees selected by the grid search.
print(grid_RF_acc.best_params_['n_estimators'])

It is now the time to calculate the accuracy score.

In [None]:
from sklearn import metrics

# Predict the held-out test set with the refitted best estimator and report accuracy.
y_pred = grid_RF_acc.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print('RF Accuracy =', rf_accuracy)

In [None]:
# Fit the base forest on the training data to read its feature importances.
RF.fit(X_train,y_train)

# X_train is a plain array after scaling, so the labels come from X1 (the frame
# the split was made from) rather than from a hand-copied list that could drift
# out of sync with the actual training columns.
feature_imp = pd.Series(RF.feature_importances_, index=X1.columns).sort_values(ascending=False)
feature_imp

As we can notice above, free sulfur dioxide, pH and chlorides are the least important features to predict the wine quality with the RF model. I will then repeat the steps above to calculate my accuracy with my new Random Forest model.

In [None]:
# Reduced feature matrix: drop the target plus the three least important features.
rf_dropped_features = ['free sulfur dioxide', 'pH', 'chlorides']
X2 = df.drop(columns=['quality'] + rf_dropped_features)
y = df['quality']

In [None]:
# Same 70/30 seeded split as before, now on the reduced feature matrix.
split_options = dict(test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X2, y, **split_options)

In [None]:
# Seed the forest so the selected n_estimators and the reported accuracy are
# reproducible across runs (the train/test split is already seeded with 0).
RF = RandomForestClassifier(random_state=0)
# Candidate forest sizes evaluated by the grid search.
RF_reg_parameters = { 'n_estimators': [1,5,10,50,100,200,500] }
# 10-fold cross-validated search over the number of trees.
grid_RF_acc = GridSearchCV(RF, param_grid = RF_reg_parameters,cv=10)
grid_RF_acc.fit(X_train, y_train)

In [None]:
# Number of trees selected by the grid search on the reduced feature set.
print(grid_RF_acc.best_params_['n_estimators'])

In [None]:
# Predict the held-out test set with the refitted best estimator and report accuracy.
y_pred = grid_RF_acc.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print('RF Accuracy =', rf_accuracy)

As we can see, the accuracy is better when I remove the least important features in my model. I will however try to improve my prediction by choosing another Machine Learning model.

# 5-Build the XGBoost model

In [None]:
from xgboost import XGBClassifier

# Start again from the full feature set for the XGBoost model.
X1=df.drop(columns=['quality'])
y=df['quality']

X_train,X_test,y_train,y_test=train_test_split(X1,y,test_size=0.3,random_state=0)


XGB = XGBClassifier()
XGB_reg_parameters = { 'n_estimators': [1,5,10,50,100,200,500] }
# Search over the XGBoost estimator. Passing `RF` here would tune and score a
# random forest while labelling the results as XGBoost.
grid_XGB_acc = GridSearchCV(XGB, param_grid = XGB_reg_parameters,cv=10)
grid_XGB_acc.fit(X_train, y_train)

In [None]:
# Number of estimators selected by the grid search.
print(grid_XGB_acc.best_params_['n_estimators'])

In [None]:
# Predict the held-out test set with the refitted best estimator and report accuracy.
y_pred = grid_XGB_acc.predict(X_test)
xgb_accuracy = metrics.accuracy_score(y_test, y_pred)
print('XGB Accuracy =', xgb_accuracy)

In [None]:
# Fit the base XGBoost model on the training data to read its feature importances.
XGB.fit(X_train,y_train)

# Label the importances with the columns the model was actually trained on,
# rather than a hand-copied list that could drift out of sync with them.
feature_imp = pd.Series(XGB.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_imp

In [None]:
# Reduced feature matrix for XGBoost: drop the target plus four features.
xgb_dropped_features = ['citric acid', 'pH', 'chlorides', 'residual sugar']
X2 = df.drop(columns=['quality'] + xgb_dropped_features)
y = df['quality']

In [None]:
# Same 70/30 seeded split as before, now on the reduced feature matrix.
split_options = dict(test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X2, y, **split_options)

In [None]:
XGB = XGBClassifier()
XGB_reg_parameters = { 'n_estimators': [1,5,10,50,100,200,500] }
# Search over the XGBoost estimator. Passing `RF` here would tune and score a
# random forest while labelling the results as XGBoost.
grid_XGB_acc = GridSearchCV(XGB, param_grid = XGB_reg_parameters,cv=10)
grid_XGB_acc.fit(X_train, y_train)

In [None]:
# Number of estimators selected by the grid search on the reduced feature set.
print(grid_XGB_acc.best_params_['n_estimators'])

In [None]:
# Predict the held-out test set with the refitted best estimator and report accuracy.
y_pred = grid_XGB_acc.predict(X_test)
xgb_accuracy = metrics.accuracy_score(y_test, y_pred)
print('XGB Accuracy =', xgb_accuracy)

# 6-Conclusion

Even though every tested model gave a good accuracy, the random forest one with a reduced number of features remains the best option for this dataset.