In [1]:
from datasets import wine_red_dataset, wine_white_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

wine_red = wine_red_dataset().drop('ID', axis=1)
wine_red.dropna(how='all', inplace=True)

wine_white = wine_white_dataset()

# Modeling-Missing Values

Information from the exploration: Missing values in the pH (15 values) and fixed acidity (17 values) data

Fixed acidity is a missing at random value: In our case, missing fixed acidity values can be calculated from the citric acid values.     PH seem to be missing completely at random: There is no major correlation with other features

Fixed acidity missing values are calculated by a multiple imputation because the estimation can provide
more realistic standard errors. The multiple Imputation creates multiple data
with different estimations. The results of different models are
averaged or combined.

In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from datasets import show_mvs

columns = ['fixed acidity','citric acid']
w_red_imp=wine_red[columns]
mvs = show_mvs(w_red_imp)

display(w_red_imp)

imp = IterativeImputer(max_iter=20)
transformed_x= imp.fit_transform(w_red_imp)
df_missing_v=pd.DataFrame(transformed_x, columns=columns)
display("Regression imputed fixed acidity: \n", df_missing_v.loc[mvs.index])
wine_red['fixed acidity'] = df_missing_v['fixed acidity']
show_mvs(wine_red)
print()

Unnamed: 0,fixed acidity,citric acid
22,,0.21
61,,0.49
97,,0.25
187,,0.1
274,,0.18
409,,0.49
410,,0.34
411,,0.35
412,,0.16
753,,0.1


Unnamed: 0,fixed acidity,citric acid
0,7.4,0.00
1,7.8,0.00
2,7.8,0.04
3,11.2,0.56
4,7.4,0.00
...,...,...
1592,6.2,0.08
1593,5.9,0.10
1594,6.3,0.13
1595,5.9,0.12


'Regression imputed fixed acidity: \n'

Unnamed: 0,fixed acidity,citric acid
22,7.956331,0.21
61,9.638865,0.49
97,8.196693,0.25
187,7.295336,0.1
274,7.77606,0.18
409,9.638865,0.49
410,8.737508,0.34
411,8.797598,0.35
412,7.655879,0.16
753,7.295336,0.1


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
11,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,,0.8,0.42,10.5,0.101,5.0
606,9.4,0.41,0.48,4.6,0.07,0.53,10.0,20.0,1.0,,0.79,0.8,12.2,0.088,7.0
607,8.8,0.48,0.41,3.3,0.09,0.53,26.0,52.0,1.0,,0.53,0.56,10.5,0.096,6.0
608,10.1,0.65,0.37,5.1,0.11,0.53,11.0,65.0,1.0,,0.64,0.21,10.4,0.105,6.0
948,8.9,0.12,0.45,1.8,0.08,0.53,10.0,21.0,1.0,,0.76,0.69,11.9,0.086,7.0
1162,8.5,0.32,0.42,2.3,0.08,0.53,12.0,19.0,0.99,,0.71,0.39,11.8,0.093,7.0
1163,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.58,10.0,0.105,5.0
1164,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.79,10.0,0.102,5.0
1369,6.6,0.61,0.0,1.6,0.07,0.53,4.0,8.0,0.99,,0.37,0.05,10.4,0.103,4.0
1370,8.7,0.78,0.51,1.7,0.42,0.53,12.0,66.0,1.0,,1.17,0.91,9.2,0.111,5.0





PH seem to be missing completely at random: There is no major correlation with other features, average imputation is used because it handle missling completely at randoms, ML imputation is not used because the algorithm needs a missing at random, no regression imputation is used because the exploration showed that there is no correlation with other features. The median is used for the average impuation because it's not affected by outliers.

In [3]:
import numpy as np
from sklearn.impute import SimpleImputer
np.set_printoptions(threshold=np.inf)

x= wine_red[['pH']]

#Average Imputation using strategy='median'
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
imp_med.fit(x)
SimpleImputer()
ph_new = imp_med.transform(x)
#display(ph_new)
wine_red['pH']=ph_new
display(wine_red.head(10))

display(wine_red.isna().sum())


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.0,2.6,0.1,0.53,25.0,67.0,1.0,3.2,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.0,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.0,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.91,9.4,0.107,5.0
5,7.4,0.66,0.0,1.8,0.08,0.53,13.0,40.0,1.0,3.51,0.56,0.52,9.4,0.109,5.0
6,7.9,0.6,0.06,1.6,0.07,0.53,15.0,59.0,1.0,3.3,0.46,0.92,9.4,0.108,5.0
7,7.3,0.65,0.0,1.2,0.07,0.53,15.0,21.0,0.99,3.39,0.47,0.89,10.0,0.104,7.0
8,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.1,9.5,0.108,7.0
9,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,3.35,0.8,0.57,10.5,0.1,5.0


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
flavanoids              0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
magnesium               0
alcohol                 0
lightness               0
quality                 0
dtype: int64

# Feature selection

calculate information gain and vif after handling missing values. Compare it to results from exploration phase

In [4]:
from datasets import get_kbest_sorted
from sklearn.feature_selection import mutual_info_regression
x = wine_red.drop('quality', axis=1)
y = wine_red['quality']
get_kbest_sorted(mutual_info_regression, x, y, 25)

Unnamed: 0,score
alcohol,0.187007
lightness,0.130241
volatile acidity,0.117914
sulphates,0.111462
total sulfur dioxide,0.079038
citric acid,0.068521
fixed acidity,0.057839
residual sugar,0.035244
free sulfur dioxide,0.029417
density,0.02491


In [5]:
from datasets import get_vifs

x = wine_red.drop('quality', axis=1)
vif_factors = get_vifs(x)
display(vif_factors)
x = wine_red.drop('quality', axis=1).drop('lightness',axis=1)
vif_factors = get_vifs(x)
display(vif_factors)

fixed acidity                2.32
volatile acidity             1.74
citric acid                  3.14
residual sugar               1.13
chlorides                    1.39
flavanoids              139527.70
free sulfur dioxide          1.93
total sulfur dioxide         2.11
density                      1.65
pH                           1.00
sulphates                    1.36
magnesium                    1.00
alcohol                     10.70
lightness                    9.99
dtype: float64

fixed acidity                2.32
volatile acidity             1.74
citric acid                  3.14
residual sugar               1.13
chlorides                    1.39
flavanoids              132477.09
free sulfur dioxide          1.93
total sulfur dioxide         2.10
density                      1.64
pH                           1.00
sulphates                    1.35
magnesium                    1.00
alcohol                      1.68
dtype: float64

missing values don't seem to have a big effect on information gain and vif

In [4]:
from datasets import get_logist_regression_kfold_score

X = wine_red.drop('quality', axis=1)
y = wine_red['quality']
score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)

selected_features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','magnesium','alcohol','lightness','quality']
selected_features = ['fixed acidity','citric acid','chlorides','total sulfur dioxide','sulphates','alcohol','quality']
wine_red_selected = wine_red[selected_features]

X = wine_red_selected.drop('quality', axis=1)
y = wine_red_selected['quality']

score_after = get_logist_regression_kfold_score(X,y)
print('Score after: ', score_after)

Score before:  0.5535378835316218
Score after:  0.5284909204758923


Compare result to PCA 

In [6]:
import numpy as np
import pandas as pd
from datasets import get_logist_regression_kfold_score
from sklearn import decomposition

#Perform PCA with 3 dimensions
num_components=3
pca = decomposition.PCA(n_components=2)
#Only use features
x=wine_red.drop('quality', axis=1)
pca.fit(x)
x_tran = pca.transform(x)

X = wine_red.drop('quality', axis=1)
y = wine_red['quality']
score_before = get_logist_regression_score(X,y)
print('Score before: ', score_before)
score_after = get_logist_regression_score(x_tran,y)
print('Score after: ', score_after)

Score before:  0.5833333333333334
Score after:  0.48125


# Outliers

Outlier Detection

In [21]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import IsolationForest
from datasets import get_logist_regression_score, show_mvs

display(wine_red.shape)
X = wine_red.drop('quality', axis=1)
y = wine_red['quality']

iso = IsolationForest(contamination=0.05)
y_out = iso.fit_predict(X)

score_before = get_logist_regression_score(X,y)
print('Score before: ', score_before)

# build a mask to select all rows that are not outliers (inlier=1, outlier=-1)
mask = y_out != -1
X_inlier, y_inlier = X[mask], y[mask]
X_outliers, y_outliers = X[~mask], y[~mask]

score_after = get_logist_regression_score(X_inlier,y_inlier)
print('Score after: ', score_after)

# Inliers vs. Outliers
print("Inliers: ",X_inlier.shape[0],"Outliers:",X_outliers.shape)
display('Outliers data:', X_outliers)
display('Inliers data:', X_inlier)

# wine_red = X_inlier
# wine_red['quality'] = y_inlier

(1597, 15)

Score before:  0.5833333333333334
Score after:  0.6293859649122807
Inliers:  1517 Outliers: (80, 14)


'Outliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
14,8.9,0.62,0.18,3.8,0.18,0.53,52.0,145.0,1.00,3.16,0.88,0.90,9.2,0.111
15,8.9,0.62,0.19,3.9,0.17,0.53,51.0,148.0,1.00,3.17,0.93,0.91,9.2,0.115
38,5.7,1.13,0.09,1.5,0.17,0.53,7.0,19.0,0.99,3.50,0.48,0.94,9.8,0.107
45,4.6,0.52,0.15,2.1,0.05,0.53,8.0,65.0,0.99,3.90,0.56,0.94,13.1,0.084
81,7.8,0.43,0.70,1.9,0.46,0.53,22.0,67.0,1.00,3.13,1.28,0.27,9.4,0.109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476,9.9,0.50,0.50,13.8,0.21,0.53,48.0,82.0,1.00,3.16,0.75,0.26,8.8,0.122
1477,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.90,13.6,0.081
1558,6.9,0.63,0.33,6.7,0.24,0.53,66.0,115.0,1.00,3.22,0.56,0.22,9.5,0.112
1570,6.4,0.36,0.53,2.2,0.23,0.53,19.0,35.0,0.99,3.37,0.93,0.88,12.4,0.088


'Inliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
0,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.86,9.4,0.109
1,7.8,0.88,0.00,2.6,0.10,0.53,25.0,67.0,1.00,3.20,0.68,0.56,9.8,0.107
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.00,3.26,0.65,0.47,9.8,0.106
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.00,3.16,0.58,0.33,9.8,0.111
4,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.91,9.4,0.107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,32.0,44.0,0.99,3.31,0.58,0.98,10.5,0.102
1593,5.9,0.55,0.10,2.2,0.06,0.53,39.0,51.0,1.00,3.52,0.76,0.82,11.2,0.090
1594,6.3,0.51,0.13,2.3,0.08,0.53,29.0,40.0,1.00,3.42,0.75,0.71,11.0,0.095
1595,5.9,0.65,0.12,2.0,0.08,0.53,32.0,44.0,1.00,3.57,0.71,0.33,10.2,0.104


https://www.pluralsight.com/guides/cleaning-up-data-from-outliers

# Noise Handling

Our assumption: The found noise data are feature noises --> jedes feature nach diesem Vorgehen bearbeiten? Oder die 58 einfach Instance löschen oder???

In [23]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict

# Define 3 classifiers
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# define data
x = wine_red.drop('quality', axis=1)
y = wine_red['quality']

# merge 3 classifiers into one voting classifier
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# train voting classifier with k-fold method
y_pred = cross_val_predict(eclf1, x, y, cv=3)

# save predictions, original quality and correct prediction boolean in data frame
result = pd.DataFrame(y_pred, columns=['Prediction'], index=x.index)
result['quality'] = y
# mindestens 2 Abweichung
delta_treshold = 2
result['Correct Prediction']= abs(result['Prediction'] - result['quality']) < delta_treshold

display(result.shape)
display(result)

# select all incorrect predicted data
print('False predictions')
delta_result = result[result['Correct Prediction'] == False]
display(delta_result.shape)
display(delta_result)

display('Outliers data', X_outliers)

outlier_to_delete = []

for outlier in X_outliers.index:
    if not outlier in delta_result.index:
        outlier_to_delete.append(outlier)
display(outlier_to_delete)
wine_red.drop(index=outlier_to_delete, inplace=True)
display(wine_red)
# handle noise

(1597, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
0,5.0,5.0,True
1,5.0,5.0,True
2,5.0,5.0,True
3,5.0,6.0,True
4,5.0,5.0,True
...,...,...,...
1592,6.0,5.0,True
1593,6.0,6.0,True
1594,6.0,6.0,True
1595,5.0,5.0,True


False predictions


(88, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
7,5.0,7.0,False
8,5.0,7.0,False
16,5.0,7.0,False
36,4.0,6.0,False
37,5.0,7.0,False
...,...,...,...
1478,5.0,3.0,False
1481,7.0,5.0,False
1505,5.0,3.0,False
1537,4.0,6.0,False


'Outliers data'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
14,8.9,0.62,0.18,3.8,0.18,0.53,52.0,145.0,1.00,3.16,0.88,0.90,9.2,0.111
15,8.9,0.62,0.19,3.9,0.17,0.53,51.0,148.0,1.00,3.17,0.93,0.91,9.2,0.115
38,5.7,1.13,0.09,1.5,0.17,0.53,7.0,19.0,0.99,3.50,0.48,0.94,9.8,0.107
45,4.6,0.52,0.15,2.1,0.05,0.53,8.0,65.0,0.99,3.90,0.56,0.94,13.1,0.084
81,7.8,0.43,0.70,1.9,0.46,0.53,22.0,67.0,1.00,3.13,1.28,0.27,9.4,0.109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476,9.9,0.50,0.50,13.8,0.21,0.53,48.0,82.0,1.00,3.16,0.75,0.26,8.8,0.122
1477,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.90,13.6,0.081
1558,6.9,0.63,0.33,6.7,0.24,0.53,66.0,115.0,1.00,3.22,0.56,0.22,9.5,0.112
1570,6.4,0.36,0.53,2.2,0.23,0.53,19.0,35.0,0.99,3.37,0.93,0.88,12.4,0.088


[14,
 15,
 38,
 81,
 86,
 91,
 92,
 94,
 106,
 142,
 144,
 151,
 169,
 198,
 258,
 353,
 354,
 378,
 395,
 396,
 400,
 415,
 451,
 501,
 502,
 515,
 553,
 591,
 649,
 652,
 692,
 695,
 723,
 802,
 821,
 836,
 837,
 941,
 982,
 1017,
 1018,
 1026,
 1068,
 1098,
 1114,
 1131,
 1228,
 1234,
 1235,
 1244,
 1260,
 1270,
 1316,
 1319,
 1321,
 1370,
 1372,
 1434,
 1435,
 1474,
 1475,
 1476,
 1477,
 1558,
 1570,
 1574]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.00,2.6,0.10,0.53,25.0,67.0,1.00,3.20,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.00,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.00,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.91,9.4,0.107,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,32.0,44.0,0.99,3.31,0.58,0.98,10.5,0.102,5.0
1593,5.9,0.55,0.10,2.2,0.06,0.53,39.0,51.0,1.00,3.52,0.76,0.82,11.2,0.090,6.0
1594,6.3,0.51,0.13,2.3,0.08,0.53,29.0,40.0,1.00,3.42,0.75,0.71,11.0,0.095,6.0
1595,5.9,0.65,0.12,2.0,0.08,0.53,32.0,44.0,1.00,3.57,0.71,0.33,10.2,0.104,5.0


In [24]:
pd.set_option('display.max_rows', None)
display(wine_red)
pd.set_option('display.max_rows', 32)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.0,2.6,0.1,0.53,25.0,67.0,1.0,3.2,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.0,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.0,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.91,9.4,0.107,5.0
5,7.4,0.66,0.0,1.8,0.08,0.53,13.0,40.0,1.0,3.51,0.56,0.52,9.4,0.109,5.0
6,7.9,0.6,0.06,1.6,0.07,0.53,15.0,59.0,1.0,3.3,0.46,0.92,9.4,0.108,5.0
7,7.3,0.65,0.0,1.2,0.07,0.53,15.0,21.0,0.99,3.39,0.47,0.89,10.0,0.104,7.0
8,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.1,9.5,0.108,7.0
9,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,3.35,0.8,0.57,10.5,0.1,5.0


In [62]:
from sklearn.preprocessing import MinMaxScaler
# normalize data without noise as a reference value
scaler = MinMaxScaler()
without_noise = wine_red.drop(index=delta_result.index)
# calculate medians for features of the normalized data without noise
medians_all = pd.DataFrame(scaler.fit_transform(without_noise), columns = without_noise.columns, index=without_noise.index).describe().loc['50%']

# set minimum threshold for noise identification as difference between value and median
threshold = 0.5

# normalize noise data for comparison with median
scaler = MinMaxScaler()
noise_data = wine_red.loc[delta_result.index]
noise_data_normalized = pd.DataFrame(scaler.fit_transform(noise_data), columns=noise_data.columns, index=noise_data.index)
for row in noise_data_normalized.index:
    for column in noise_data_normalized.columns:
        col_row_value = noise_data_normalized[column][row]
        if abs(medians_all.loc[column] - col_row_value) > threshold:
            noise_data_normalized.at[row, column] = 1
        else:
            noise_data_normalized.at[row, column] = 0

# display sum of noisy identified data by features
display(noise_data_normalized.sum())

# display noisy features
display(noise_data_normalized)

fixed acidity            3.0
volatile acidity         1.0
citric acid             17.0
residual sugar           5.0
chlorides                2.0
flavanoids               0.0
free sulfur dioxide      7.0
total sulfur dioxide     2.0
density                 26.0
pH                      21.0
sulphates                4.0
magnesium                0.0
alcohol                  8.0
lightness                2.0
quality                  9.0
dtype: float64

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


# Feature Scaling

In [134]:
from datasets import normalize_feature

wine_red = normalize_feature(wine_red, 'total sulfur dioxide')
display(wine_red)

Unnamed: 0,alcohol,sulphates,citric acid,volatile acidity,total sulfur dioxide,quality
0,9.4,0.56,0.00,0.70,0.181818,5.0
1,9.8,0.68,0.00,0.88,0.396104,5.0
2,9.8,0.65,0.04,0.76,0.311688,5.0
3,9.8,0.58,0.56,0.28,0.350649,6.0
4,9.4,0.56,0.00,0.70,0.181818,5.0
...,...,...,...,...,...,...
1592,10.5,0.58,0.08,0.60,0.246753,5.0
1593,11.2,0.76,0.10,0.55,0.292208,6.0
1594,11.0,0.75,0.13,0.51,0.220779,6.0
1595,10.2,0.71,0.12,0.65,0.246753,5.0
