In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/raw/wineq.csv')

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [5]:
df.describe().iloc[1:,:12]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
# great, no null values!
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

In [6]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
Id                        int64
dtype: object

In [10]:
df.shape

(1143, 13)

In [6]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'Id'],
      dtype='object')

In [8]:
# we will train out model on this 6 output classes.
# And try to implement OOD detection.
df['quality'].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [10]:
# dataset is imbalanced
df['quality'].value_counts()

5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64

### Performing Oversampling using SMOTE

In [16]:
import pathlib
curr_dir = pathlib.Path()
home_dir = curr_dir.parent.parent.parent


data = pd.read_csv('E:/MLOps/WineQ/wineq/data/raw/wineq.csv')
df = data.copy()

In [19]:
# dataset is imbalanced, need to do oversampling
df['quality'].value_counts()

5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64

In [20]:
# lets use SMOTE for oversampling
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)
X = df.drop(columns = ['quality'])
y = df['quality']

X_res, y_res = smote.fit_resample(X, y)

In [45]:
print('Before Sampling')
print('X: ', X.shape)
print('y: ', y.shape)

print('\nAfter Sampling')
print('X_res: ', X_res.shape)
print('y_res: ', y_res.shape)

Before Sampling
X:  (1143, 12)
y:  (1143,)

After Sampling
X_res:  (2898, 12)
y_res:  (2898,)


In [46]:
display(y.value_counts())
display(y_res.value_counts())

5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64

5    483
6    483
7    483
4    483
8    483
3    483
Name: quality, dtype: int64

In [24]:
smote.get_params()

{'k_neighbors': 5,
 'n_jobs': None,
 'random_state': 42,
 'sampling_strategy': 'auto'}

In [25]:
smote.get_feature_names_out()

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'Id'], dtype=object)

In [42]:
pd.options.display.float_format = '{:.2f}'.format
print('Before Oversampling')
print('Shape X: ', X.shape)
display(X.describe().iloc[1:, :-1])
print('-' * 140)
print('After Oversampling')
print('Shape X_Res: ', X_res.shape)
display(X_res.describe().iloc[1:, :-1])

Before Oversampling
Shape X:  (1143, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
mean,8.31,0.53,0.27,2.53,0.09,15.62,45.91,1.0,3.31,0.66,10.44
std,1.75,0.18,0.2,1.36,0.05,10.25,32.78,0.0,0.16,0.17,1.08
min,4.6,0.12,0.0,0.9,0.01,1.0,6.0,0.99,2.74,0.33,8.4
25%,7.1,0.39,0.09,1.9,0.07,7.0,21.0,1.0,3.21,0.55,9.5
50%,7.9,0.52,0.25,2.2,0.08,13.0,37.0,1.0,3.31,0.62,10.2
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,1.0,3.4,0.73,11.1
max,15.9,1.58,1.0,15.5,0.61,68.0,289.0,1.0,4.01,2.0,14.9


--------------------------------------------------------------------------------------------------------------------------------------------
After Oversampling
Shape X_Res:  (2898, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
mean,8.45,0.58,0.29,2.61,0.09,13.33,37.32,1.0,3.31,0.67,10.64
std,1.7,0.24,0.21,1.2,0.05,9.16,28.01,0.0,0.16,0.18,1.19
min,4.6,0.12,0.0,0.9,0.01,1.0,6.0,0.99,2.74,0.33,8.4
25%,7.2,0.39,0.08,1.93,0.07,6.0,17.41,1.0,3.21,0.56,9.69
50%,8.1,0.54,0.29,2.2,0.08,10.73,29.0,1.0,3.31,0.63,10.49
75%,9.5,0.71,0.46,2.76,0.09,17.32,47.89,1.0,3.41,0.74,11.5
max,15.9,1.58,1.0,15.5,0.61,68.0,289.0,1.0,4.01,2.0,14.9


In [53]:
# add both columns and return this data as op
X_res['quality'] = y_res

In [54]:
X_res

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Id,quality
0,7.40,0.70,0.00,1.90,0.08,11.00,34.00,1.00,3.51,0.56,9.40,0,5
1,7.80,0.88,0.00,2.60,0.10,25.00,67.00,1.00,3.20,0.68,9.80,1,5
2,7.80,0.76,0.04,2.30,0.09,15.00,54.00,1.00,3.26,0.65,9.80,2,5
3,11.20,0.28,0.56,1.90,0.07,17.00,60.00,1.00,3.16,0.58,9.80,3,6
4,7.40,0.70,0.00,1.90,0.08,11.00,34.00,1.00,3.51,0.56,9.40,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2893,10.70,0.35,0.53,2.60,0.07,5.00,16.00,1.00,3.15,0.65,11.00,496,8
2894,9.58,0.30,0.54,3.52,0.08,5.80,16.20,1.00,3.17,0.90,11.88,440,8
2895,7.69,0.39,0.34,1.93,0.06,11.86,23.06,0.99,3.27,0.74,11.44,1362,8
2896,11.02,0.49,0.60,3.98,0.08,5.53,17.60,1.00,3.19,0.67,12.28,475,8


In [55]:
X['quality'] = y

In [61]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Id,quality
0,7.40,0.70,0.00,1.90,0.08,11.00,34.00,1.00,3.51,0.56,9.40,0,5
1,7.80,0.88,0.00,2.60,0.10,25.00,67.00,1.00,3.20,0.68,9.80,1,5
2,7.80,0.76,0.04,2.30,0.09,15.00,54.00,1.00,3.26,0.65,9.80,2,5
3,11.20,0.28,0.56,1.90,0.07,17.00,60.00,1.00,3.16,0.58,9.80,3,6
4,7.40,0.70,0.00,1.90,0.08,11.00,34.00,1.00,3.51,0.56,9.40,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.30,0.51,0.13,2.30,0.08,29.00,40.00,1.00,3.42,0.75,11.00,1592,6
1139,6.80,0.62,0.08,1.90,0.07,28.00,38.00,1.00,3.42,0.82,9.50,1593,6
1140,6.20,0.60,0.08,2.00,0.09,32.00,44.00,0.99,3.45,0.58,10.50,1594,5
1141,5.90,0.55,0.10,2.20,0.06,39.00,51.00,1.00,3.52,0.76,11.20,1595,6


In [63]:
type(y)

pandas.core.series.Series

In [13]:
for i in df.columns :
     print(f"{i.replace(' ', '_')} : float")

fixed_acidity : float
volatile_acidity : float
citric_acid : float
residual_sugar : float
chlorides : float
free_sulfur_dioxide : float
total_sulfur_dioxide : float
density : float
pH : float
sulphates : float
alcohol : float
quality : float
Id : float


In [24]:
# insights on range of data we may expect as input
df.describe().iloc[[3,7], :-2]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9


In [25]:
df.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [29]:
x = dict()

col = df.columns
row = df.head(1)

for i in zip(col, row) :
     print(i[0], ' - ', i[1])

fixed acidity  -  fixed acidity
volatile acidity  -  volatile acidity
citric acid  -  citric acid
residual sugar  -  residual sugar
chlorides  -  chlorides
free sulfur dioxide  -  free sulfur dioxide
total sulfur dioxide  -  total sulfur dioxide
density  -  density
pH  -  pH
sulphates  -  sulphates
alcohol  -  alcohol
quality  -  quality
Id  -  Id


In [4]:
x = dict(df.iloc[1, :])