In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

# Wine Quality dataset analysis

## Dataset explanation
Modeling wine preferences by data mining from physicochemical properties.

- *fixed.acidity*: most acids involved with wine are fixed or nonvolatile (do not evaporate readily)
- *volatile.acidity*: the amount of acetic acid in wine, which at too high levels can lead to an unpleasant, vinegar taste
- *citric.acid*: found in small quantities, citric acid can add 'freshness' and flavor to wines
- *residual.sugar*: the amount of sugar remaining after fermentation stops, it's rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet
- *chlorides*: the amount of salt in the wine
- *free.sulfur.dioxide*: the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine
- *total.sulfur.dioxide*: amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine
- *density*: the density of wine is close to that of water, depending on the percent alcohol and sugar content
- *pH*: describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale
- *sulphates*: a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which acts as an antimicrobial and antioxidant
- *alcohol*: the percent alcohol content of the wine
- *quality*: score between 0 and 10

## Load libraries

In [None]:
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Read data

In [None]:
# Relative paths to the red- and white-wine CSV files in the "../input" directory.
PATH_RED_WINE = '../input/wineQualityReds.csv'
# NOTE(review): PATH_WHITE_WINE is not referenced in the visible part of this
# notebook — confirm whether the white-wine data is meant to be analysed too.
PATH_WHITE_WINE = '../input/wineQualityWhites.csv'

In [None]:
# Load the red-wine samples; the first CSV column is used as the row index
# rather than as a feature.
reds = pd.read_csv(PATH_RED_WINE, index_col=0)

## EDA
The distribution of samples across the different quality values is not uniform; this will be taken into account when dividing the dataset into training and validation sets.

In [None]:
# Count how many samples fall into each quality score
sns.catplot(
    data=reds,
    x="quality",
    kind="count",
    palette="ch:.25",
);

In [None]:
# Plot features: a pairwise grid over the columns of `reds`, with a kernel
# density estimate on the diagonal and scatter plots in the off-diagonal cells.
g = sns.PairGrid(reds)
g.map_diag(sns.kdeplot)
# The trailing semicolon suppresses the text repr of the returned object.
g.map_offdiag(plt.scatter);

## Normalize features

In [None]:
# Features: every column of the dataset except the quality score
X = reds.drop(columns=['quality'])
# Label: the quality score, taken as a plain array of values
y = reds['quality'].values

In [None]:
# Standardize every feature column to zero mean and unit variance
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training split only.
scaler = StandardScaler()
feature_columns = X.columns
scaled_values = scaler.fit_transform(X[feature_columns])
X[feature_columns] = scaled_values

In [None]:
# Distribution of the 'fixed.acidity' feature after standardization.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it) — migrate once the target environment is confirmed to support them.
sns.distplot(X['fixed.acidity']);

In [None]:
# Distribution of the 'alcohol' feature after standardization.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it) — migrate once the target environment is confirmed to support them.
sns.distplot(X['alcohol']);

In [None]:
# Pairwise correlation between the feature columns
corr = X.corr()

# Boolean mask that hides the upper triangle (diagonal included) so each
# feature pair is drawn only once. The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.set(style="white")

# Setup matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap, centred on zero correlation by `center=0` below
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above instead of relying on the implicit
# "current axes" state.
sns.heatmap(corr, mask=mask, square=True, cmap=cmap, center=0,
            linewidths=0.5, cbar_kws={"shrink": .5}, ax=ax);

## Split dataset into training and validation

In [None]:
# Training and test sets
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on the label keeps the
# proportion of each quality class similar in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

##  Classifier modelling

### Using SVM

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each SVC hyper-parameter; the search evaluates every
# kernel / C / gamma combination.
kernels = ['linear', 'poly', 'rbf']
c_values = [0.001, 0.01, 0.1, 1, 2, 5, 10, 20, 50, 100]
gamma_values = [0.01, 0.1, 0.5, 1]

parameters = {
    'kernel': kernels,
    'C': c_values,
    'gamma': gamma_values,
}

# NOTE(review): the gamma passed to the constructor is overridden by the grid,
# and gamma has no effect for the linear kernel, so those candidates are
# evaluated redundantly.
svc = SVC(gamma='scale', random_state=8)

# 3-fold cross-validated search over the full grid
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=3)
clf.fit(X_train, y_train)

In [None]:
# Best estimator: the SVC configured with the best-scoring parameter
# combination found by the grid search. Left as the last expression so the
# notebook displays it.
clf.best_estimator_

In [None]:
# Score of the best model found by the search on each subset
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print("Train: ", train_score)
print("Test:  ", test_score)

In [None]:
# Mean cross-validation score as a function of C; each point aggregates all
# grid combinations that share the same C value.
cv_results = clf.cv_results_
ax = sns.lineplot(x=cv_results['param_C'], y=cv_results['mean_test_score'])
ax.set_xlabel('C value')
ax.set_ylabel('Score')
plt.show()

In [None]:
# Mean cross-validation score per kernel; each point aggregates all grid
# combinations that share the same kernel.
cv_results = clf.cv_results_
ax = sns.lineplot(x=cv_results['param_kernel'], y=cv_results['mean_test_score'])
ax.set_xlabel('Kernel')
ax.set_ylabel('Score')
plt.show()

In [None]:
# Mean cross-validation score as a function of gamma; each point aggregates
# all grid combinations that share the same gamma value.
cv_results = clf.cv_results_
ax = sns.lineplot(x=cv_results['param_gamma'], y=cv_results['mean_test_score'])
ax.set_xlabel('Gamma')
ax.set_ylabel('Score')
plt.show()