In [2]:
import pandas as pd
import numpy as np
from summarytools import dfSummary
import plotly.express as px 

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# raised by the libraries used below; consider narrowing it to specific categories.
import warnings 
warnings.filterwarnings("ignore")

In [3]:
# Read the wine-quality CSV from the working directory into `data`,
# then show its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="wine_quality_classification.csv")
data.head(n=5)

Unnamed: 0,fixed_acidity,residual_sugar,alcohol,density,quality_label
0,9.3,6.4,13.6,1.0005,high
1,11.2,2.0,14.0,0.9912,medium
2,11.6,0.9,8.2,0.9935,low
3,12.9,6.6,12.7,1.0002,low
4,13.9,13.8,10.4,0.9942,medium


## 2. Data Quality Checks

In [5]:
# Render a per-column summary table of `data` (dtype, descriptive stats,
# value frequencies, missing counts) as the cell output.
dfSummary(data, is_collapsible=False)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,fixed_acidity [float64],Mean (sd) : 10.2 (3.5) min < med < max: 4.0 < 10.4 < 16.0 IQR (CV) : 6.0 (2.9),121 distinct values,,0 (0.0%)
2,residual_sugar [float64],Mean (sd) : 7.7 (4.2) min < med < max: 0.5 < 7.5 < 15.0 IQR (CV) : 7.4 (1.8),145 distinct values,,0 (0.0%)
3,alcohol [float64],Mean (sd) : 11.0 (1.8) min < med < max: 8.0 < 11.0 < 14.0 IQR (CV) : 3.1 (6.3),61 distinct values,,0 (0.0%)
4,density [float64],Mean (sd) : 1.0 (0.0) min < med < max: 1.0 < 1.0 < 1.0 IQR (CV) : 0.0 (232.0),151 distinct values,,0 (0.0%)
5,quality_label [object],1. medium 2. high 3. low,355 (35.5%) 343 (34.3%) 302 (30.2%),,0 (0.0%)


## 3. Exploring the data

In [6]:
# define numerical & categorical columns
# Numeric columns are selected explicitly: testing `dtype != 'O'` would label
# every non-object column (category, bool, datetime, pandas' dedicated string
# dtype) as numerical, and those columns would then be fed to the KDE plots.
numeric_features = data.select_dtypes(include='number').columns.tolist()
# Everything that is not numeric is treated as categorical, preserving column order.
categorical_features = [feature for feature in data.columns if feature not in numeric_features]

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 4 numerical features : ['fixed_acidity', 'residual_sugar', 'alcohol', 'density']

We have 1 categorical features : ['quality_label']


In [8]:
# proportion of count data on categorical columns
# Share of each quality class, expressed as a percentage of all rows
print(data['quality_label'].value_counts(normalize=True).mul(100))

medium    35.5
high      34.3
low       30.2
Name: quality_label, dtype: float64


#### Insights
- Our quality label is fairly balanced in the dataset

In [17]:
# Make 'iframe' the default Plotly renderer so that later `fig.show()` calls
# in this notebook use it.
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy.stats import gaussian_kde

# Setup subplot grid sized to the number of numerical features (at most 3 per
# row). A fixed 4 x 3 grid would leave most panels empty for a handful of
# features and would fail outright for more than 12.
n_features = len(numeric_features)
cols = max(1, min(3, n_features))
rows = max(1, -(-n_features // cols))  # ceiling division; make_subplots requires rows >= 1
fig = make_subplots(rows=rows, cols=cols, subplot_titles=numeric_features)

# Loop through each numerical feature, filling the grid row by row
for idx, feature in enumerate(numeric_features):
    row = idx // cols + 1
    col = idx % cols + 1

    # Compute KDE using scipy (as Plotly doesn't have built-in KDE)
    x = data[feature].dropna()
    if x.nunique() < 2:
        # gaussian_kde cannot estimate a density from fewer than two distinct
        # values (the covariance is singular), so this panel is left empty.
        continue
    kde = gaussian_kde(x)
    x_range = np.linspace(x.min(), x.max(), 200)
    y_kde = kde(x_range)

    # Add KDE line to subplot
    fig.add_trace(
        go.Scatter(x=x_range, y=y_kde, mode='lines', name=feature, line=dict(color='blue')),
        row=row, col=col
    )

# Layout settings: height grows with the number of rows so panels keep a
# readable size (200 px per row, i.e. 400 px for a two-row grid).
fig.update_layout(
    height=200 * rows, width=600,
    title_text="Univariate Analysis of Numerical Features",
    title_font=dict(size=20, family="Arial", color="black"),
    showlegend=False
)

fig.show()