In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Install a blanket "ignore" filter: every warning raised after this point is silenced.
# NOTE(review): this also hides deprecation and numerical warnings from the plotting
# and modelling libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:

# Path of the red-wine quality CSV, relative to the notebook's working directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Show the first 10 rows to get a feel for the raw values.
data.head(10)

In [None]:
# Every column except the last one is treated as a feature; the last column is the label.
all_columns = data.columns
feature_list = all_columns[:-1].values
label = [all_columns[-1]]
print("Feature list:", feature_list)
print("Label:", label)

# **Data Statistics**

- Total entries: 1599
- There are 12 columns in total: 11 features + 1 label
- Label column: quality
- Features: [fixed acidity, volatile acidity, citric acid, residual sugar,
  chlorides, free sulfur dioxide, total sulfur dioxide, density, pH,
  sulphates, alcohol]
- All feature columns are numeric (float64) and the label is an integer.


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Number of wines per quality score, ordered from the rarest score to the most common.
data['quality'].value_counts(ascending=True)

Average-quality wines are the most numerous, while both low-quality and high-quality wines are rare.

# **Plotting histogram to visualise the data**

In [None]:
# Histogram of the label column. The original cell contained the same plotting
# code twice, which drew an identical histogram over the first one on the same
# axes; a single, explicitly-owned Axes is enough.
sns.set()
fig, ax = plt.subplots()
data["quality"].hist(ax=ax)
ax.set_xlabel('Wine Quality')
ax.set_ylabel('Count')
ax.set_title('Distribution of wine quality scores')
plt.show()

# **Dynamically add subplots**

In [None]:

sns.set_theme()

# One histogram (with KDE overlay) per column, filled row by row into a 2x3 grid.
# The original cell plotted "alcohol" in both of the last two panels; the final
# slot now shows a different feature instead of repeating one.
# NOTE(review): "free sulfur dioxide" is a guess at the intended sixth feature —
# swap in whichever column was actually meant.
hist_columns = [
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "free sulfur dioxide",
]

fig, axs = plt.subplots(2, 3, figsize=(15, 8))
panels = axs.ravel()
for ax, column in zip(panels, hist_columns):
    sns.histplot(data=data, x=column, kde=True, ax=ax)

# Hide any panel that has no column assigned, so the grid stays tidy if the
# list above is shortened.
for ax in panels[len(hist_columns):]:
    ax.set_visible(False)

# The original `plt.xlabel('')` only blanked the x-label of the last panel,
# leaving it unidentifiable; every panel now keeps its column name.
fig.tight_layout()
plt.show()

A few observations based on these plots:
1. Features are at different scales.
2. Features have different distributions.

>  # **Splitting the data into training and test set**

# *Splitting the dataset using a function*

In [None]:
def split_train_test(data, test_ratio, random_state=42):
    """Randomly split a DataFrame into a training set and a test set.

    The rows are shuffled with a private random generator, so calling this
    function does not reseed or otherwise disturb NumPy's global random state.
    With the default ``random_state`` the split is identical to the one
    produced by seeding the global generator with 42.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to split. Rows are selected by position (``iloc``).
    test_ratio : float
        Fraction of the rows to place in the test set, between 0 and 1
        inclusive. The test-set size is ``int(len(data) * test_ratio)``.
    random_state : int, optional
        Seed for the shuffle. Defaults to 42.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside the interval [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )

    # A local RandomState gives the same permutation as np.random.seed(seed)
    # followed by np.random.permutation, without touching global RNG state.
    rng = np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(len(data))

    # The first `test_set_size` shuffled positions form the test set, the rest
    # form the training set.
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set using the hand-written splitter.
train_set, test_set = split_train_test(data, test_ratio=0.2)

# *Splitting the data using scikit-learn functions*

# **Data cleaning and Preparation**

In [None]:
# Random sampling: an 80/20 shuffle split with a fixed seed. This rebinds
# train_set and test_set.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stratified sampling: an 80/20 split that preserves the proportion of each
# quality score in both the training and the test set.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(data, data["quality"]))

# split() yields positional row indices, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while `data` carries a default
# RangeIndex and would pick wrong rows (or raise) after any filtering or
# re-indexing.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

# *Comparing stratified vs. random sampling with the overall distribution*

In [None]:
# Share of each quality score within the stratified test set.
strat_counts = strat_test_set["quality"].value_counts()
strat_dist = strat_counts / len(strat_test_set)

In [None]:
# Share of each quality score within the full dataset.
overall_counts = data["quality"].value_counts()
overall_dist = overall_counts / len(data)

In [None]:
# Side-by-side table of quality-score proportions: overall, stratified test set
# and random test set, followed by the absolute and percentage deviations of
# each test set from the overall proportions. Columns are added in order, so
# later entries can read the ones defined before them.
dist_comparison = pd.DataFrame(
    {'overall': overall_dist, 'stratified': strat_dist}
).assign(**{
    'random_dist': test_set["quality"].value_counts() / len(test_set),
    'diff(s-o)': lambda df: df['stratified'] - df['overall'],
    'diff(r-o)': lambda df: df['random_dist'] - df['overall'],
    'diff(s-o)_pct': lambda df: 100 * (df['diff(s-o)'] / df['overall']),
    'diff(r-o)_pct': lambda df: 100 * (df['diff(r-o)'] / df['overall']),
})


In [None]:
# Display the comparison table; the *_pct columns hold each test set's deviation
# from the overall proportion, in percent.
dist_comparison

> *Thus we can see that stratified sampling gives us a test distribution closer to the overall distribution than random sampling does.*