## 1. Data Loading and Description

__Importing Packages__

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# To calculate correlation on features
from scipy import stats
from sklearn.metrics import confusion_matrix, classification_report

from warnings import filterwarnings
filterwarnings('ignore')

# allow plots to appear directly in the notebook
%matplotlib inline

#### Importing the Dataset

In [None]:
# The CSV has no dedicated index column: its first column is the
# 'fixed acidity' feature. Passing index_col=0 would turn that feature into
# the row index and silently remove it from the feature set, so let pandas
# build the default integer index instead.
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
data.head()

What are the **features**?
- Fixed Acidity
- Volatile Acidity
- Citric Acid
- Residual Sugar
- Chlorides
- Free Sulfur Dioxide
- Total Sulfur Dioxide
- Density
- pH
- Sulphates
- Alcohol

What is the **response**?
- Quality: the quality rating of the wine.

## 2. Exploratory Data Analysis

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
# Summary statistics, flipped so that every feature is shown on its own row
data.describe().T

There are 1599 **observations**

__Distribution of Features__

In [None]:
# Bar height is seaborn's default aggregate of the feature per quality score.
f, axes = plt.subplots(2, 2, figsize=(10, 7), sharex=True)

# Loop instead of four copy-pasted calls; axes.flat walks the grid row by row,
# so the panel order follows the feature order below. Ending the cell on a
# loop also avoids printing the repr of the last Axes object.
for ax, feature in zip(axes.flat, ['volatile acidity', 'citric acid', 'residual sugar', 'chlorides']):
    sns.barplot(x='quality', y=feature, data=data, ax=ax)

In [None]:
# Bar height is seaborn's default aggregate of the feature per quality score.
f, axes = plt.subplots(2, 2, figsize=(10, 7), sharex=True)

# Loop instead of four copy-pasted calls; axes.flat walks the grid row by row,
# so the panel order follows the feature order below. Ending the cell on a
# loop also avoids printing the repr of the last Axes object.
for ax, feature in zip(axes.flat, ['free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH']):
    sns.barplot(x='quality', y=feature, data=data, ax=ax)

In [None]:
# Bar height is seaborn's default aggregate of the feature per quality score.
f, axes = plt.subplots(1, 2, figsize=(11, 5), sharex=True)

# Loop instead of copy-pasted calls; with a single row, `axes` is a 1-D array
# of the two panels. Ending the cell on a loop also avoids printing the repr
# of the last Axes object.
for ax, feature in zip(axes, ['sulphates', 'alcohol']):
    sns.barplot(x='quality', y=feature, data=data, ax=ax)


In [None]:
# Joint KDE plot of every feature against quality, annotated with the Pearson
# correlation. The `stat_func` argument of sns.jointplot was removed in
# seaborn 0.11, so the statistic is computed explicitly and written onto the
# joint axes instead.
for feature in data.columns:
    if feature == 'quality':
        continue
    grid = sns.jointplot(x=feature, y='quality', data=data, kind='kde')
    r, p = stats.pearsonr(data[feature], data['quality'])
    grid.ax_joint.annotate(
        'pearsonr = {:.2g}; p = {:.2g}'.format(r, p),
        xy=(0.05, 0.95), xycoords='axes fraction', va='top',
    )
    plt.show()

__Observation__<br/>
- Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
- I leave it to you to find which features are highly correlated.

### Visualising Pairwise correlation

In [None]:
# Pairwise scatter plots (distributions on the diagonal), coloured by quality
sns.pairplot(
    data,
    vars=["pH", "density", "alcohol", "free sulfur dioxide"],
    hue="quality",
    height=2,
    aspect=1.5,
)

__Observation__<br/>
- pH and density look about the same across the wine quality levels (look at the diagonal graphs), whereas alcohol varies.

In [None]:
# Pairwise scatter plots (distributions on the diagonal), coloured by quality
sns.pairplot(
    data,
    vars=["residual sugar", "citric acid", "sulphates", "volatile acidity"],
    hue="quality",
    height=2,
    aspect=1.5,
)

##### Note : [you can skip this if you understood the graphs above]
The plots above look messy, so let's divide the wines into good and bad by setting a threshold on the quality.

In [None]:
# Make the response variable binary.
# Divide the wines into 'bad' and 'good' using a threshold on quality: with
# pd.cut's default right-closed bins, (2, 6.5] is labelled 'bad' and
# (6.5, 8] is labelled 'good'.
# NOTE(review): this overwrites data['quality'] in place, so the cell is not
# idempotent -- reload `data` before running it a second time.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
data['quality'] = pd.cut(data['quality'], bins = bins, labels = group_names)

In [None]:
# Encoder used to turn the 'bad'/'good' quality labels into integer codes
label_quality = LabelEncoder()

In [None]:
# Replace the labels with integer codes: 'bad' becomes 0 and 'good' becomes 1
# (LabelEncoder numbers the classes in sorted order).
data['quality'] = label_quality.fit_transform(data['quality'])

In [None]:
# Number of wines in each class of the binary target (0 = bad, 1 = good)
data['quality'].value_counts()

In [None]:
# Same pair plot as before, now coloured by the binary quality label.
# `height` replaces the old `size` keyword (renamed in seaborn 0.9 and later
# removed), matching the other pairplot calls in this notebook.
sns.pairplot(data, vars=["pH", "density", "alcohol", "free sulfur dioxide"], hue="quality", height = 2, aspect = 1.5)

### Calculating and plotting heatmap correlation

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True);

__Observation__

- The diagonal of the above matrix shows the correlation of each variable with itself, which is always 1. You can observe that the correlation between __alcohol and quality is highest i.e. 0.41__ and then between __quality and citric acid i.e. 0.21__.

- Correlations can vary from -1 to +1. Closer to +1 means a strong positive correlation and closer to -1 means a strong negative correlation. Closer to 0 means the variables are not strongly correlated. Variables with __strong correlations__ are the most probable candidates for __model building__.

## 3. Model and Training
 

__Find the Missing Values__

In [None]:
def handle_missing_values(data):
    """Summarise the missing values of every column of ``data``.

    Despite its name, this function only reports missing values; it does not
    modify ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, ordered by descending missing count,
        with the columns ``'total'`` (number of nulls) and ``'percentage'``
        (share of rows that are null, rounded to a whole percent).
    """
    n_rows = data.shape[0]
    missing_counts = data.isnull().sum().sort_values(ascending=False)
    missing_share = (missing_counts / n_rows * 100).round()
    summary = pd.concat(
        [missing_counts, missing_share],
        axis=1,
        keys=['total', 'percentage'],
    )
    return summary
# Show the per-column missing-value summary for the wine data
handle_missing_values(data)

In [None]:
# Separate the features from the target column and hold out 20% for testing.
X = data.loc[:, data.columns != 'quality']
y = data.quality
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Standardise the features. The scaler is fitted on the training split only
# and then re-used on the test split: calling fit_transform on X_test would
# re-estimate the mean/std from the test data, so the two splits would be
# scaled differently and test-set statistics would leak into preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Support Vector Machine Model without GridSearchCV
Note: the Support Vector Machine model with GridSearchCV is implemented after this section.

In [None]:
# Baseline: SVC with default hyper-parameters, trained on the scaled training
# split and evaluated on the held-out test split.
svc = SVC()
pred_svc = svc.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the baseline model on the test split
report_svc = classification_report(y_test, pred_svc)
print(report_svc)

## 4. Support Vector Machine model with GridSearchCV

In [None]:
# Finding the best parameters for our SVC model.
# gamma only influences the rbf kernel (the linear kernel ignores it), so the
# grid is split per kernel: a single flat grid would refit every linear model
# once per gamma value without changing its score.
c_values = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]
param = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': [0.1, 0.8, 0.9, 1, 1.1, 1.2]},
]
# 10-fold cross-validated search, selecting on accuracy
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
# Run the cross-validated grid search on the training split
grid_svc.fit(X_train, y_train)

In [None]:
# Best parameters found by the grid search for our SVC model
grid_svc.best_params_

In [None]:
# Let's run our SVC again with the best parameters.
# They are read from the fitted grid search instead of being hard-coded, so
# this cell stays in sync with whatever the search actually selected.
svc2 = SVC(**grid_svc.best_params_)
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

## __SVC improves from 86% to 89% using Grid Search CV__