In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the red-wine quality CSV produced by the data-ingestion step.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('../artifacts/data_ingestion/winequality-red.csv')

In [5]:
# Preview the first rows to sanity-check that the columns were parsed as expected.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1599, 12)

In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# The 25% / 75% rows are the Q1 / Q3 values that IQR-based bounds are built from.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
# Distinct values present in the 'quality' column.
df['quality'].unique()

array([5, 6, 7, 4, 8, 3])

In [12]:
# Keep a reference to the frame's column Index; the bare name on the last
# line makes the notebook display it.
columns = df.columns
columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [16]:
# Convert a single column to a NumPy array to inspect its raw values.
np.array(df['fixed acidity'])

array([7.4, 7.8, 7.8, ..., 6.3, 5.9, 6. ], shape=(1599,))

In [13]:
# Print each column name on its own line.
for col in columns:
    print(col)

fixed acidity
volatile acidity
citric acid
residual sugar
chlorides
free sulfur dioxide
total sulfur dioxide
density
pH
sulphates
alcohol
quality


In [45]:
def compute_valid_input_ranges(
    df,
    non_negative_cols=('citric acid', 'free sulfur dioxide', 'total sulfur dioxide'),
    decimals=2,
):
    """Derive an IQR-based ``[lower, upper]`` range for every column of ``df``.

    For each column the bounds are the usual Tukey fences::

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

    Both bounds are rounded to ``decimals`` places. Missing values (NaN) are
    ignored when computing the quartiles, so a few gaps in a column no longer
    turn both of its bounds into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric. Every column is processed,
        including a target column if one is present.
    non_negative_cols : collection of str, optional
        Columns whose lower bound is raised to the smallest value observed in
        the data whenever the fence falls below it. Note that the bound is
        raised to the observed minimum, not to zero. Names that are not in
        ``df`` are ignored. Defaults to the three wine columns whose fences
        come out negative.
    decimals : int, optional
        Number of decimal places kept in the bounds. The default of 2 is
        coarse for narrow-valued columns (density ends up as ``[0.99, 1.0]``);
        pass a larger value to keep more precision.

    Returns
    -------
    dict
        ``{column name: [lower_bound, upper_bound]}`` with plain Python floats,
        in the column order of ``df``.
    """
    input_value_range = {}
    for col in df.columns:
        data = df[col].values

        # nanpercentile skips NaN entries; np.percentile would return NaN here.
        q1, q3 = np.nanpercentile(data, [25, 75])
        iqr = q3 - q1

        lower_bound = round(float(q1 - 1.5 * iqr), decimals)
        upper_bound = round(float(q3 + 1.5 * iqr), decimals)

        # The fence is purely statistical and can fall below what is physically
        # possible; for the selected columns fall back to the observed minimum.
        if col in non_negative_cols:
            lower_bound = float(max(lower_bound, df[col].min()))

        input_value_range[col] = [lower_bound, upper_bound]

    return input_value_range

In [46]:
compute_valid_input_ranges(df)

{'fixed acidity': [3.95, 12.35],
 'volatile acidity': [0.02, 1.02],
 'citric acid': [0.0, 0.91],
 'residual sugar': [0.85, 3.65],
 'chlorides': [0.04, 0.12],
 'free sulfur dioxide': [1.0, 42.0],
 'total sulfur dioxide': [6.0, 122.0],
 'density': [0.99, 1.0],
 'pH': [2.92, 3.68],
 'sulphates': [0.28, 1.0],
 'alcohol': [7.1, 13.5],
 'quality': [3.5, 7.5]}

These IQR ranges are derived only from the dataset we have, but in the real world values such as pH or alcohol can fall slightly outside them. How do we handle that?
IQR ranges do NOT represent the true physical limits of wine; they only reflect the distribution of this particular dataset.

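Left uncorrected, some IQR lower bounds come out negative (citric acid: -0.4, free sulfur dioxide: -14.0, total sulfur dioxide: -38.0). These values make no physical sense (a negative acid or sulfur dioxide concentration is not valid). This happens because the IQR method is purely statistical, which is why `compute_valid_input_ranges` raises the lower bound of those columns to the smallest value observed in the data.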

The remedy is to combine the IQR-based ranges with scientific (physical) constraints. That gives us validation that is both robust and realistic.

In [56]:
# Allowed [min, max] interval per model input feature.
# Keys use underscores instead of the spaces found in the CSV column names,
# and the 'quality' target is intentionally not listed.
valid_input_ranges = {
    name: [low, high]
    for name, low, high in [
        ('fixed_acidity', 3.95, 12.35),
        ('volatile_acidity', 0.02, 1.02),
        ('citric_acid', 0.0, 0.91),
        ('residual_sugar', 0.85, 3.65),
        ('chlorides', 0.04, 0.12),
        ('free_sulfur_dioxide', 1.0, 42.0),
        ('total_sulfur_dioxide', 6.0, 122.0),
        ('density', 0.99, 1.0),
        ('pH', 2.92, 3.68),
        ('sulphates', 0.28, 1.0),
        ('alcohol', 7.1, 13.5),
    ]
}

In [57]:
# Sample feature values used to try out the range check.
# pH (0.83) and alcohol (7) lie below their allowed minimums of 2.92 and 7.1,
# presumably on purpose so that the clamping step has something to correct.
inputs = dict(
    fixed_acidity=4,
    volatile_acidity=0.9,
    citric_acid=0.4,
    residual_sugar=2,
    chlorides=0.1,
    free_sulfur_dioxide=41,
    total_sulfur_dioxide=57,
    density=0.993,
    pH=0.83,
    sulphates=0.9,
    alcohol=7,
)

In [58]:
# Clamp every supplied value into its allowed interval, updating `inputs`
# in place. Values already inside [min_val, max_val] are left untouched.
for feature in inputs:
    value = inputs[feature]
    min_val, max_val = valid_input_ranges[feature]
    if value < min_val:
        # Below the allowed range: snap up to the minimum.
        inputs[feature] = min_val
        continue
    if value > max_val:
        # Above the allowed range: snap down to the maximum.
        inputs[feature] = max_val