In [None]:
#Importing required packages.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
%matplotlib inline
# Render figures on screen and save them to disk at 100 dpi
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 100
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the plotting / ML libraries --
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
#Loading dataset
wine = pd.read_csv('winequality-red.csv')

In [None]:
#Let's check how the data is distributed
wine.head()

In [None]:
#Information about the data columns
wine.info()

In [None]:
print(wine['quality'].value_counts(normalize=True))

In [None]:
# Fixed acidity shows no clear pattern across quality scores,
# so on its own it does not help to classify the quality.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='fixed acidity', ax=ax)
plt.show()

In [None]:
# Volatile acidity shows a clear downward trend
# as the quality score goes up.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='volatile acidity', ax=ax)
plt.show()

In [None]:
# The citric acid content goes up as the
# quality of the wine goes up.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='citric acid', ax=ax)
plt.show()

In [None]:
# Residual sugar is almost the same at every quality score: no significant behaviour
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='residual sugar', ax=ax)
plt.show()

In [None]:
# The chloride content also goes down as the
# quality of the wine goes up.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='chlorides', ax=ax)
plt.show()

In [None]:
# Bar plot of free sulfur dioxide grouped by quality score
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='free sulfur dioxide', ax=ax)
plt.show()

In [None]:
# Bar plot of total sulfur dioxide grouped by quality score
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='total sulfur dioxide', ax=ax)
plt.show()

In [None]:
# Sulphates level goes higher with the quality of wine
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='sulphates', ax=ax)
# Explicit show(), like the other plot cells, so the cell does not echo the Axes repr
plt.show()

In [None]:
# Alcohol level also goes higher as the quality of wine increases
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='alcohol', ax=ax)
# Explicit show(), like the other plot cells, so the cell does not echo the Axes repr
plt.show()

## Preprocessing the data for machine-learning algorithms

In [None]:
# Making a binary classification target out of the numeric quality score:
# scores up to and including 6.5 are 'bad', scores above 6.5 are 'good'.
# The outer bin edges are open-ended so that a score outside the range seen
# in this data set still gets a label instead of silently becoming NaN
# (which the label encoder would later treat as an extra class).
bins = (-np.inf, 6.5, np.inf)
group_names = ['bad', 'good']
wine['target'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [None]:
wine.drop('quality', axis=1, inplace=True)

In [None]:
wine['target'].value_counts()

In [None]:
# Now lets assign a labels to our quality variable
label_quality = LabelEncoder()

In [None]:
#Bad becomes 0 and good becomes 1 
wine['target'] = label_quality.fit_transform(wine['target'])

In [None]:
wine['target'].value_counts()

In [None]:
# Separate the data set into the response variable and the feature variables
y = wine['target']
X = wine.drop(columns='target')

In [None]:
# Class balance of the target before any resampling.
# The vector is passed as x= because newer seaborn releases no longer
# accept the data vector as a positional argument.
sns.countplot(x = y)
plt.show()

## SMOTE oversampling

In [None]:
from imblearn.over_sampling import SMOTE
X, y = SMOTE().fit_resample(X, y)

In [None]:
sns.countplot(y)
plt.show()

In [None]:
#Train and Test splitting of data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, 
                                                    stratify=y,
                                                    random_state = 42)

In [None]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()

In [None]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Support Vector Classifier

In [None]:
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [None]:
print(classification_report(y_test, pred_svc))
print("Prediction accuracy:{:.2f}".format(accuracy_score(y_test, pred_svc)))

In [None]:
# Confusion matrix for the baseline support vector classifier
cn_svm = confusion_matrix(y_true=y_test, y_pred=pred_svc)
sns.heatmap(data=cn_svm, annot=True, fmt='g')
# Save before show(), so the saved image is taken from the drawn figure
plt.savefig("SVM_Confusion_After_SMOTE.png")
plt.show()


## Hyper Parameter Tuning using GridSearchCV

In [None]:
#Finding best parameters for our SVC model
param = {
    'C': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
}
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
grid_svc.fit(X_train, y_train)

In [None]:
#Best parameters for our svc model
grid_svc.best_params_

In [None]:
#Let's run our SVC again with the best parameters.
svc2 = SVC(C = 1.2, gamma =  0.9, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))
print("Prediction accuracy:{:.2f}".format(accuracy_score(y_test, pred_svc2)))

In [None]:
# Confusion matrix for the tuned support vector classifier
cn_svm = confusion_matrix(y_true=y_test, y_pred=pred_svc2)
sns.heatmap(data=cn_svm, annot=True, fmt='g')
# Save before show(), so the saved image is taken from the drawn figure
plt.savefig("SVM_GridCV_Confusion_After_SMOTE.png")
plt.show()

### SVC improves result using Grid Search CV