In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Welcome to my notebook! Please give an upvote if you like it, and feel free to share any feedback in the comments. In this notebook we are going to take a deep dive into the red wine quality dataset, study the data, and apply a classification model to predict the quality.

![istockphoto-1301017778-170667a.jpg](attachment:2a9d24d7-e772-44c9-a51d-512715d10fe2.jpg)

# DATA PROPERTIES

In [1]:
# Load the red wine quality CSV from the Kaggle input directory into a DataFrame.
WINE_CSV_PATH = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
df = pd.read_csv(WINE_CSV_PATH)

In [1]:
# Let's take a glimpse of the dataset: display the first rows of the frame.
df.head()

In [1]:
# Descriptive statistics for the columns of the frame.
df.describe()

In [1]:
# Check the data type and non-null count of every column.
df.info()

In [1]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

In [1]:
# Print the dataset's shape as a (rows, columns) tuple.
print(df.shape)

In [1]:
# Target variable: count how many rows fall under each value of 'quality'.
df['quality'].value_counts()

# DATA VISUALIZATION

In [1]:
import plotly.express as px
# Count the rows per quality score and reshape the result into a two-column
# frame ('Winequality', 'counts') that plotly can consume directly.
df_new = (
    df['quality']
    .value_counts()
    .rename_axis('Winequality')
    .reset_index(name='counts')
)

# Pie chart showing the share of each quality score in the dataset.
# (The stray bare `df_new` expression was removed: only the last expression
# of a cell is displayed, so it had no effect in the middle of the cell.)
fig = px.pie(df_new, values='counts', names='Winequality')
fig.show()

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
# Box plot of fixed acidity grouped by the original quality score.
plt.figure(figsize=(16, 10))
sns.boxplot(data=df, x='quality', y='fixed acidity')

# Title and axis labels.
plt.title("Impact of fixed acidity on Quality", fontsize=30)
plt.xlabel('Quality', fontsize=26)
plt.ylabel('Fixed Acidity', fontsize=26)

# Tick styling: y tick labels are rotated to read vertically.
plt.xticks(fontsize=12)
plt.yticks(rotation=90, fontsize=12)
plt.show()

As per the above distribution, let's create a binary feature for wine quality with two values: Good (1) and Bad (0).

In [1]:
# List the column names currently present in the frame.
df.columns

In [1]:
# Derive the binary target: Good = 1 when quality > 5, otherwise 0, then drop
# the original 'quality' column so it cannot leak into the features.
# The guard makes the cell safe to re-run: once 'quality' has been dropped,
# running it again is a no-op instead of raising KeyError.
if 'quality' in df.columns:
    # Vectorized comparison instead of a row-wise apply; int64 matches the
    # dtype produced by the original lambda.
    df['Good'] = (df['quality'] > 5).astype('int64')
    df = df.drop(columns=['quality'])

In [1]:
# Let's see if the classes are balanced: count rows per value of 'Good'.
df['Good'].value_counts()

In [1]:
# Scatter plot of residual sugar against alcohol, coloured by the binary
# target 'Good'.
plt.figure(figsize=(16,10))
sns.scatterplot(x='alcohol',y='residual sugar',data=df,hue='Good')
plt.yticks(rotation =90,fontsize=12)
plt.xticks(fontsize=12)
plt.title("PLOT- A ",fontsize=30)
# Axis label spelling fixed (was 'Alchohol').
plt.xlabel('Alcohol',fontsize=26)
plt.ylabel('Residual Sugar',fontsize=26)
plt.show()


In [1]:
# Box plot of residual sugar for each class of the binary target 'Good'.
plt.figure(figsize=(16, 10))
sns.boxplot(data=df, x='Good', y='residual sugar')

# Title and axis labels.
plt.title("PLOT- B Checking outliers", fontsize=30)
plt.xlabel('Good', fontsize=26)
plt.ylabel('Residual Sugar', fontsize=26)

# Tick styling: y tick labels are rotated to read vertically.
plt.xticks(fontsize=12)
plt.yticks(rotation=90, fontsize=12)
plt.show()


There are certain outliers in our data here

In [1]:
# Box plot of free sulfur dioxide for each class of the binary target 'Good'.
plt.figure(figsize=(16,10))
sns.boxplot(y='free sulfur dioxide',x='Good',data=df)
plt.yticks(rotation =90,fontsize=12)
plt.xticks(fontsize=12)
plt.title("PLOT- C- Checking outliers",fontsize=30)
plt.xlabel('Good',fontsize=26)
# The y-axis label was 'Residual Sugar' (copied from the previous plot);
# it now names the column that is actually plotted.
plt.ylabel('Free Sulphur Dioxide',fontsize=26)
plt.show()

We can see some outliers for Free Sulphur Dioxide.

In [1]:
# Box plot of total sulfur dioxide for each class of the binary target 'Good'.
plt.figure(figsize=(16, 10))
sns.boxplot(data=df, x='Good', y='total sulfur dioxide')

# Title and axis labels.
plt.title("PLOT- D Checking outliers", fontsize=30)
plt.xlabel('Good', fontsize=26)
plt.ylabel('Total Sulphur Dioxide', fontsize=26)

# Tick styling: y tick labels are rotated to read vertically.
plt.xticks(fontsize=12)
plt.yticks(rotation=90, fontsize=12)
plt.show()

We can see some outliers for Total Sulphur Dioxide, for Good Quality Wine.

# DATA PREPARATION

In [1]:
# Data preparation: separate the target column from the feature columns.
Y = df['Good']                  # target vector (1 = good, 0 = bad)
X = df.drop(columns=['Good'])   # feature matrix: every remaining column
# The combined frame is no longer needed once X and Y exist.
del df

In [1]:
#train test split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=90,
)

# DATA MODELLING AND TUNING

To classify wine as good or not we will use the Random Forest algorithm. It is an ensemble model: it considers the outputs from multiple trees trained on the data and reports the output given by the majority of trees.

In [1]:
#model building - Random Forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

***Using Hyperparameter Tuning***<br>
Since there are multiple model parameters, we need to find the best ones. This is where hyperparameter tuning is used. It takes a defined grid of parameters, checks the model performance on different combinations of those parameters, and reports the set of parameters that gives the best model performance.

There are 2 techniques for Hyperparameter tuning :
1. RandomizedSearchCV - Checks on random combinations.
2. GridSearchCV - Checks on all combinations.

We will use Randomized search CV as it is efficient and less time consuming.

In [1]:
from sklearn.model_selection import RandomizedSearchCV
# Search space for the randomized hyperparameter search over the random forest.

# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was dropped: for classifiers it was only an alias of 'sqrt' (so the
# grid contained a duplicate) and it is deprecated/removed in current
# scikit-learn releases, where it raises an error. 'log2' keeps two options.
max_features = ['sqrt', 'log2']
# Split quality criterion
criterion = ['gini','entropy']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]
# Assemble the grid keyed by RandomForestClassifier parameter names
random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [1]:
# Base estimator for the search. It is seeded so that the forests built during
# the search (and the final refit) are reproducible; previously only the
# parameter sampling was seeded, so scores changed from run to run.
rf_tune = RandomForestClassifier(random_state=42)

# Randomized search: evaluate 100 parameter combinations sampled from
# random_grid with 3-fold cross-validation on the training split, using all
# available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(estimator = rf_tune,
                               param_distributions = random_grid,
                               n_iter = 100,
                               cv = 3,
                               verbose=2,
                               random_state=42,
                               n_jobs = -1)
rf_random.fit(X_train,Y_train)


In [1]:
# Show the parameter combination selected by the randomized search.
rf_random.best_params_

In [1]:
# Confusion matrix of the tuned model on the held-out test split.
# plot_confusion_matrix was deprecated and then removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(rf_random, X_test, Y_test)

In [1]:
# Predict labels for the test split with the fitted randomized-search object.
Y_pred = rf_random.predict(X_test)

In [1]:
# Accuracy of the predictions against the true test labels.
accuracy_score(Y_test,Y_pred)

In [1]:
# F1 score of the predictions against the true test labels (labels are the binary 'Good' target).
f1_score(Y_test,Y_pred)

Finally, we got an accuracy of 80.41% with an F1 score of 0.82.