In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## In this notebook, I first explore the data using matplotlib and seaborn. Then I use different classifier models to predict the quality of the wine.

1. Random Forest Classifier

2. Stochastic Gradient Descent Classifier

3. Support Vector Classifier(SVC)

Then I use cross-validation techniques to tune and evaluate the models' performance.

1. Grid Search CV

2. Cross Validation Score

In [None]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory into a DataFrame.
path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
wine = pd.read_csv(filepath_or_buffer=path)
# Show (number of rows, number of columns) as the cell output.
wine.shape

In [None]:
# Explore an independent copy of the raw table. A plain `df = wine` only creates
# a second name for the same object, so the later in-place recoding of
# wine['quality'] would silently change `df` as well.
df = wine.copy()
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
df.info()

In [None]:
# Show summary statistics for the columns of the dataset.
df.describe()

In [None]:
# Collect the column labels and report how many columns the dataset has.
cols = df.columns.tolist()
print("Number of Columns:", len(cols))

In [None]:
# Let's do some plotting to know how the data columns are distributed in the dataset.
# One bar plot per feature column, grouped by the wine's quality score.
feature_cols = [c for c in cols if c != 'quality']  # exclude the target by name, not by position
for feature in feature_cols:
    print(feature)
    # Explicit figure/axes instead of the implicit pyplot state; the axes is
    # handed to seaborn so the plot and its title land on the same figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='quality', y=feature, data=wine, ax=ax)
    ax.set_title("Bar Plot for Quality vs {}".format(feature))
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up across iterations

In [None]:
#Making binary classification for the response variable.
#Dividing wine as good and bad by giving the limit for the quality:
#with pd.cut's default right-closed intervals, scores in (2, 6.5] are labelled
#'bad' and scores in (6.5, 8] are labelled 'good'.
#NOTE(review): this overwrites wine['quality'] in place, so the cell is not safe
#to re-run once the column has already been binned.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [None]:
# Count how many wines fall into each quality class after binning.
wine['quality'].value_counts()

In [None]:
#Now let's create the encoder that assigns numeric labels to our quality variable
label_quality = LabelEncoder()

In [None]:
#Bad becomes 0 and good becomes 1 (LabelEncoder numbers the classes in sorted order).
#The encoded values replace the 'bad'/'good' labels in wine['quality'].
wine['quality'] = label_quality.fit_transform(wine['quality'])

In [None]:
# Class counts again, now on the integer-encoded labels.
wine['quality'].value_counts()

In [None]:
# Bar chart of the class balance. The column is passed by keyword: newer seaborn
# releases accept only `data` positionally, so `sns.countplot(wine['quality'])`
# either warns or is no longer interpreted as the x variable.
sns.countplot(x='quality', data=wine)

In [None]:
# Separate the dataset into the response variable (y: the encoded quality label)
# and the feature variables (X: every remaining column).
y = wine['quality']
X = wine.drop(columns='quality')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Create the StandardScaler used to standardize the features before modelling
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training set only, then apply that same
# transformation to the test set. Calling fit_transform on X_test would re-fit
# the scaler on the test data, so train and test features would be scaled
# differently and test-set statistics would leak into the evaluation.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Our training and testing data are now ready for the machine learning algorithms

### Random Forest Classifier

In [None]:
# Baseline random forest with 200 trees. The seed is fixed (same value as the
# train/test split) so the reported scores are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [None]:
#Let's see how our model performed on the held-out test set

print(classification_report(y_test, pred_rfc))

Random forest gives an accuracy of about 88%.

In [None]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_rfc))

# Overall accuracy. Arguments follow scikit-learn's (y_true, y_pred) order,
# consistent with the other metric calls in this notebook.
print(accuracy_score(y_test, pred_rfc))

In [None]:
# Stochastic Gradient Descent Classifier (trained without a regularization penalty).
# random_state is fixed because SGD is a randomized procedure; without a seed
# the scores below change from run to run.

sgd = SGDClassifier(penalty=None, random_state=42)
sgd.fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)

print(classification_report(y_test, pred_sgd))

print(confusion_matrix(y_test, pred_sgd))

# Arguments follow scikit-learn's (y_true, y_pred) order.
print(accuracy_score(y_test, pred_sgd))

Stochastic Gradient Descent Classifier gives an accuracy of about 85%.

In [None]:
# Support Vector Classifier with scikit-learn's default hyper-parameters,
# evaluated on the held-out test set.

svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

print(classification_report(y_test, pred_svc))

print(confusion_matrix(y_test, pred_svc))

# Arguments follow scikit-learn's (y_true, y_pred) order.
print(accuracy_score(y_test, pred_svc))

Support Vector Classifier gives an accuracy of about 87%.

Let's try to increase the accuracy of our models.

## Grid Search CV

In [None]:
#Finding best parameters for our SVC model
# `gamma` has no effect on the linear kernel, so the grid is split per kernel.
# Crossing gamma with kernel='linear' would refit the identical linear model
# once for every gamma value (8x redundant work under 10-fold CV).
c_values = [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
gamma_values = [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
param = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
]
# 10-fold cross-validated search, selecting the combination with the best accuracy.
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
# Run the cross-validated grid search on the training data
grid_svc.fit(X_train, y_train)

#Best parameters for our svc model
grid_svc.best_params_

In [None]:
#Let's run our SVC again with the best parameters.
# NOTE(review): C, gamma and kernel are hard-coded here, presumably copied from
# grid_svc.best_params_ of an earlier run; confirm they still match after re-running.
svc2 = SVC(C = 1.2, gamma =  0.9, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)

print(classification_report(y_test, pred_svc2))

print(confusion_matrix(y_test, pred_svc2))

# Arguments follow scikit-learn's (y_true, y_pred) order.
print(accuracy_score(y_test, pred_svc2))

## SVC improves from 87% to 89.68% using Grid Search CV

In [None]:
# Hyper-parameter grid for the random forest: tree depth and minimum samples per split
param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}

# 10-fold cross-validated search over the grid, scored by accuracy
grid_rfc = GridSearchCV(rfc, param_grid=param_grid, scoring='accuracy', cv=10)

grid_rfc.fit(X_train, y_train)

#Best parameters for our random forest model
grid_rfc.best_params_

In [None]:
# Display the fitted grid-search object for the random forest
grid_rfc

In [None]:
# Retrain the random forest with tuned hyper-parameters and evaluate on the test set.
# NOTE(review): max_depth and min_samples_split are hard-coded, presumably copied
# from grid_rfc.best_params_ of an earlier run; confirm after re-running.
# random_state is fixed so the reported scores are reproducible.
rfc = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=2, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc2 = rfc.predict(X_test)

print(classification_report(y_test, pred_rfc2))

print(confusion_matrix(y_test, pred_rfc2))

# Arguments follow scikit-learn's (y_true, y_pred) order.
print(accuracy_score(y_test, pred_rfc2))

RFC improves from 88% to 89% using Grid Search CV

In [None]:
# Evaluate the random forest with 10-fold cross-validation on the training data
# and show the mean score across the folds.
rfc_eval = cross_val_score(rfc, X_train, y_train, cv=10)
rfc_eval.mean()

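## Random forest reaches a mean accuracy of about 91% under 10-fold cross-validation, compared with 88% on the held-out test set

Note that cross-validation is an evaluation technique: it gives a more stable estimate of the model's accuracy on the training data, but it does not itself change or improve the model.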