In [18]:
from datasets import wine_red_dataset, wine_white_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

wine_red = wine_red_dataset().drop('ID', axis=1)
wine_red.dropna(how='all', inplace=True)

wine_white = wine_white_dataset()

# Modeling-Missing Values

Information from the exploration: Missing values in the pH (15 values) and fixed acidity (17 values) data

Fixed acidity is a missing at random value: In our case, missing fixed acidity values can be calculated from the citric acid values.     PH seem to be missing completely at random: There is no major correlation with other features

Fixed acidity missing values are calculated by a multiple imputation because the estimation can provide
more realistic standard errors. The multiple Imputation creates multiple data
with different estimations. The results of different models are
averaged or combined.

In [19]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from datasets import show_mvs

columns = ['fixed acidity','citric acid']
w_red_imp=wine_red[columns]
mvs = show_mvs(w_red_imp)

display(w_red_imp)

imp = IterativeImputer(max_iter=20)
transformed_x= imp.fit_transform(w_red_imp)
df_missing_v=pd.DataFrame(transformed_x, columns=columns)
display("Regression imputed fixed acidity: \n", df_missing_v.loc[mvs.index])
wine_red['fixed acidity'] = df_missing_v['fixed acidity']
show_mvs(wine_red)
print()

Unnamed: 0,fixed acidity,citric acid
22,,0.21
61,,0.49
97,,0.25
187,,0.1
274,,0.18
409,,0.49
410,,0.34
411,,0.35
412,,0.16
753,,0.1


Unnamed: 0,fixed acidity,citric acid
0,7.4,0.00
1,7.8,0.00
2,7.8,0.04
3,11.2,0.56
4,7.4,0.00
...,...,...
1592,6.2,0.08
1593,5.9,0.10
1594,6.3,0.13
1595,5.9,0.12


'Regression imputed fixed acidity: \n'

Unnamed: 0,fixed acidity,citric acid
22,7.956331,0.21
61,9.638865,0.49
97,8.196693,0.25
187,7.295336,0.1
274,7.77606,0.18
409,9.638865,0.49
410,8.737508,0.34
411,8.797598,0.35
412,7.655879,0.16
753,7.295336,0.1


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
11,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,,0.8,0.42,10.5,0.101,5.0
606,9.4,0.41,0.48,4.6,0.07,0.53,10.0,20.0,1.0,,0.79,0.8,12.2,0.088,7.0
607,8.8,0.48,0.41,3.3,0.09,0.53,26.0,52.0,1.0,,0.53,0.56,10.5,0.096,6.0
608,10.1,0.65,0.37,5.1,0.11,0.53,11.0,65.0,1.0,,0.64,0.21,10.4,0.105,6.0
948,8.9,0.12,0.45,1.8,0.08,0.53,10.0,21.0,1.0,,0.76,0.69,11.9,0.086,7.0
1162,8.5,0.32,0.42,2.3,0.08,0.53,12.0,19.0,0.99,,0.71,0.39,11.8,0.093,7.0
1163,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.58,10.0,0.105,5.0
1164,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.79,10.0,0.102,5.0
1369,6.6,0.61,0.0,1.6,0.07,0.53,4.0,8.0,0.99,,0.37,0.05,10.4,0.103,4.0
1370,8.7,0.78,0.51,1.7,0.42,0.53,12.0,66.0,1.0,,1.17,0.91,9.2,0.111,5.0





PH seem to be missing completely at random: There is no major correlation with other features, average imputation is used because it handle missling completely at randoms, ML imputation is not used because the algorithm needs a missing at random, no regression imputation is used because the exploration showed that there is no correlation with other features. The median is used for the average impuation because it's not affected by outliers.

In [20]:
import numpy as np
from sklearn.impute import SimpleImputer
np.set_printoptions(threshold=np.inf)

x= wine_red[['pH']]

#Average Imputation using strategy='median'
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
imp_med.fit(x)
SimpleImputer()
ph_new = imp_med.transform(x)
#display(ph_new)
wine_red['pH']=ph_new
display(wine_red.head(10))

display(wine_red.isna().sum())


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.0,2.6,0.1,0.53,25.0,67.0,1.0,3.2,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.0,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.0,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.91,9.4,0.107,5.0
5,7.4,0.66,0.0,1.8,0.08,0.53,13.0,40.0,1.0,3.51,0.56,0.52,9.4,0.109,5.0
6,7.9,0.6,0.06,1.6,0.07,0.53,15.0,59.0,1.0,3.3,0.46,0.92,9.4,0.108,5.0
7,7.3,0.65,0.0,1.2,0.07,0.53,15.0,21.0,0.99,3.39,0.47,0.89,10.0,0.104,7.0
8,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.1,9.5,0.108,7.0
9,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,3.35,0.8,0.57,10.5,0.1,5.0


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
flavanoids              0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
magnesium               0
alcohol                 0
lightness               0
quality                 0
dtype: int64

# Outliers

Outlier Detection

In [21]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import IsolationForest
from datasets import get_logist_regression_score, show_mvs

display(wine_red.shape)
X = wine_red.drop('quality', axis=1)
y = wine_red['quality']

iso = IsolationForest(contamination=0.05)
y_out = iso.fit_predict(X)

score_before = get_logist_regression_score(X,y)
print('Score before: ', score_before)

# build a mask to select all rows that are not outliers (inlier=1, outlier=-1)
mask = y_out != -1
X_inlier, y_inlier = X[mask], y[mask]
X_outliers, y_outliers = X[~mask], y[~mask]

score_after = get_logist_regression_score(X_inlier,y_inlier)
print('Score after: ', score_after)

# Inliers vs. Outliers
print("Inliers: ",X_inlier.shape[0],"Outliers:",X_outliers.shape)
display('Outliers data:', X_outliers)
display('Inliers data:', X_inlier)

# wine_red = X_inlier
# wine_red['quality'] = y_inlier

(1597, 15)

Score before:  -0.03980582548197842
Score after:  0.3368836467586014
Inliers:  1517 Outliers: (80, 14)


'Outliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
14,8.9,0.62,0.18,3.8,0.18,0.53,52.0,145.0,1.00,3.16,0.88,0.90,9.20,0.111
15,8.9,0.62,0.19,3.9,0.17,0.53,51.0,148.0,1.00,3.17,0.93,0.91,9.20,0.115
19,7.9,0.32,0.51,1.8,0.34,0.53,17.0,56.0,1.00,3.04,1.08,0.08,9.20,0.117
38,5.7,1.13,0.09,1.5,0.17,0.53,7.0,19.0,0.99,3.50,0.48,0.94,9.80,0.107
45,4.6,0.52,0.15,2.1,0.05,0.53,8.0,65.0,0.99,3.90,0.56,0.94,13.10,0.084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.71,13.57,0.078
1476,9.9,0.50,0.50,13.8,0.21,0.53,48.0,82.0,1.00,3.16,0.75,0.26,8.80,0.122
1477,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.90,13.60,0.081
1558,6.9,0.63,0.33,6.7,0.24,0.53,66.0,115.0,1.00,3.22,0.56,0.22,9.50,0.112


'Inliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
0,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.86,9.4,0.109
1,7.8,0.88,0.00,2.6,0.10,0.53,25.0,67.0,1.00,3.20,0.68,0.56,9.8,0.107
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.00,3.26,0.65,0.47,9.8,0.106
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.00,3.16,0.58,0.33,9.8,0.111
4,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.91,9.4,0.107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,32.0,44.0,0.99,3.31,0.58,0.98,10.5,0.102
1593,5.9,0.55,0.10,2.2,0.06,0.53,39.0,51.0,1.00,3.52,0.76,0.82,11.2,0.090
1594,6.3,0.51,0.13,2.3,0.08,0.53,29.0,40.0,1.00,3.42,0.75,0.71,11.0,0.095
1595,5.9,0.65,0.12,2.0,0.08,0.53,32.0,44.0,1.00,3.57,0.71,0.33,10.2,0.104


https://www.pluralsight.com/guides/cleaning-up-data-from-outliers

# Noise Handling

Our assumption: The found noise data are feature noises --> jedes feature nach diesem Vorgehen bearbeiten? Oder die 58 einfach Instance löschen oder???

In [22]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict

# Define 3 classifiers
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# define data
x = wine_red.drop('quality', axis=1)
y = wine_red['quality']

# merge 3 classifiers into one voting classifier
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# train voting classifier with k-fold method
y_pred = cross_val_predict(eclf1, x, y, cv=3)

# save predictions, original quality and correct prediction boolean in data frame
result = pd.DataFrame(y_pred, columns=['Prediction'], index=x.index)
result['quality'] = y
# mindestens 2 Abweichung
delta_treshold = 2
result['Correct Prediction']= abs(result['Prediction'] - result['quality']) < delta_treshold

display(result.shape)
display(result)

# select all incorrect predicted data
print('False predictions')
delta_result = result[result['Correct Prediction'] == False]
display(delta_result.shape)
display(delta_result)

display('Outliers data', X_outliers)

outlier_to_delete = []

for outlier in X_outliers.index:
    if not outlier in delta_result.index:
        outlier_to_delete.append(outlier)
display(outlier_to_delete)
wine_red.drop(index=outlier_to_delete, inplace=True)
display(wine_red)
# handle noise

(1597, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
0,5.0,5.0,True
1,5.0,5.0,True
2,5.0,5.0,True
3,5.0,6.0,True
4,5.0,5.0,True
...,...,...,...
1592,6.0,5.0,True
1593,6.0,6.0,True
1594,6.0,6.0,True
1595,5.0,5.0,True


False predictions


(88, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
7,5.0,7.0,False
8,5.0,7.0,False
16,5.0,7.0,False
36,4.0,6.0,False
37,5.0,7.0,False
...,...,...,...
1478,5.0,3.0,False
1481,7.0,5.0,False
1505,5.0,3.0,False
1537,4.0,6.0,False


'Outliers data'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
14,8.9,0.62,0.18,3.8,0.18,0.53,52.0,145.0,1.00,3.16,0.88,0.90,9.20,0.111
15,8.9,0.62,0.19,3.9,0.17,0.53,51.0,148.0,1.00,3.17,0.93,0.91,9.20,0.115
19,7.9,0.32,0.51,1.8,0.34,0.53,17.0,56.0,1.00,3.04,1.08,0.08,9.20,0.117
38,5.7,1.13,0.09,1.5,0.17,0.53,7.0,19.0,0.99,3.50,0.48,0.94,9.80,0.107
45,4.6,0.52,0.15,2.1,0.05,0.53,8.0,65.0,0.99,3.90,0.56,0.94,13.10,0.084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.71,13.57,0.078
1476,9.9,0.50,0.50,13.8,0.21,0.53,48.0,82.0,1.00,3.16,0.75,0.26,8.80,0.122
1477,5.3,0.47,0.11,2.2,0.05,0.53,16.0,89.0,0.99,3.54,0.88,0.90,13.60,0.081
1558,6.9,0.63,0.33,6.7,0.24,0.53,66.0,115.0,1.00,3.22,0.56,0.22,9.50,0.112


[14,
 15,
 19,
 38,
 81,
 94,
 95,
 106,
 132,
 142,
 144,
 151,
 169,
 198,
 226,
 258,
 353,
 354,
 378,
 396,
 400,
 415,
 451,
 467,
 501,
 502,
 515,
 553,
 649,
 652,
 672,
 684,
 692,
 695,
 802,
 821,
 836,
 837,
 941,
 982,
 997,
 1017,
 1018,
 1051,
 1090,
 1098,
 1114,
 1131,
 1157,
 1178,
 1228,
 1235,
 1244,
 1256,
 1260,
 1270,
 1316,
 1319,
 1321,
 1370,
 1372,
 1434,
 1435,
 1474,
 1475,
 1476,
 1477,
 1558,
 1574]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.00,2.6,0.10,0.53,25.0,67.0,1.00,3.20,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.00,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.00,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.91,9.4,0.107,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,32.0,44.0,0.99,3.31,0.58,0.98,10.5,0.102,5.0
1593,5.9,0.55,0.10,2.2,0.06,0.53,39.0,51.0,1.00,3.52,0.76,0.82,11.2,0.090,6.0
1594,6.3,0.51,0.13,2.3,0.08,0.53,29.0,40.0,1.00,3.42,0.75,0.71,11.0,0.095,6.0
1595,5.9,0.65,0.12,2.0,0.08,0.53,32.0,44.0,1.00,3.57,0.71,0.33,10.2,0.104,5.0


In [23]:
# pd.set_option('display.max_rows', None)
display(wine_red)
# pd.set_option('display.max_rows', 32)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.00,2.6,0.10,0.53,25.0,67.0,1.00,3.20,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.00,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.00,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.70,0.00,1.9,0.08,0.53,11.0,34.0,1.00,3.51,0.56,0.91,9.4,0.107,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,32.0,44.0,0.99,3.31,0.58,0.98,10.5,0.102,5.0
1593,5.9,0.55,0.10,2.2,0.06,0.53,39.0,51.0,1.00,3.52,0.76,0.82,11.2,0.090,6.0
1594,6.3,0.51,0.13,2.3,0.08,0.53,29.0,40.0,1.00,3.42,0.75,0.71,11.0,0.095,6.0
1595,5.9,0.65,0.12,2.0,0.08,0.53,32.0,44.0,1.00,3.57,0.71,0.33,10.2,0.104,5.0


In [24]:
from sklearn.preprocessing import MinMaxScaler
# normalize data without noise as a reference value
scaler = MinMaxScaler()
without_noise = wine_red.drop('quality', axis=1).drop(index=delta_result.index)
# calculate medians for features of the normalized data without noise
medians_all = pd.DataFrame(scaler.fit_transform(without_noise), columns = without_noise.columns, index=without_noise.index).describe().loc['50%']
medians_all_data = without_noise.describe().loc['50%']

# set minimum threshold for noise identification as difference between value and median
threshold = 0.5

# normalize noise data for comparison with median
scaler = MinMaxScaler()
noise_data = wine_red.loc[delta_result.index].drop('quality', axis=1)
noise_data_normalized = pd.DataFrame(scaler.fit_transform(noise_data), columns=noise_data.columns, index=noise_data.index)
for row in noise_data_normalized.index:
    for column in noise_data_normalized.columns:
        col_row_value = noise_data_normalized[column][row]
        if abs(medians_all.loc[column] - col_row_value) > threshold:
            noise_data_normalized.at[row, column] = medians_all_data.loc[column]
        else:
            noise_data_normalized.at[row, column] = noise_data.at[row, column]
# drop all data that are no noise
noise_data_normalized = noise_data_normalized.dropna(how='all')

# display sum of noisy identified data by features
display(noise_data_normalized.count())

# display noisy features
display(noise_data_normalized)

noise_data_normalized['quality'] = wine_red.loc[delta_result.index]['quality']

fixed acidity           88
volatile acidity        88
citric acid             88
residual sugar          88
chlorides               88
flavanoids              88
free sulfur dioxide     88
total sulfur dioxide    88
density                 88
pH                      88
sulphates               88
magnesium               88
alcohol                 88
lightness               88
dtype: int64

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness
7,7.3,0.65,0.00,1.2,0.07,0.53,15.0,21.0,1.0,3.39,0.47,0.89,10.00,0.104
8,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.10,9.50,0.108
16,8.5,0.28,0.56,1.8,0.09,0.53,13.0,103.0,1.0,3.30,0.75,0.93,10.50,0.096
36,7.8,0.60,0.14,2.4,0.09,0.53,3.0,15.0,1.0,3.31,0.60,0.75,10.80,0.098
37,8.1,0.38,0.28,2.1,0.07,0.53,13.0,30.0,1.0,3.23,0.73,0.97,9.70,0.106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,7.1,0.88,0.05,5.7,0.08,0.53,3.0,14.0,1.0,3.31,0.52,0.96,10.20,0.101
1481,8.2,0.28,0.60,3.0,0.10,0.53,10.0,22.0,1.0,3.39,0.68,0.01,10.60,0.097
1505,6.7,0.76,0.02,1.8,0.08,0.53,6.0,12.0,1.0,3.31,0.63,0.18,9.95,0.102
1537,5.4,0.58,0.08,1.9,0.06,0.53,20.0,31.0,1.0,3.31,0.64,0.82,10.20,0.108


## REPLACE NOISES WITH CALCULATED VALUES

In [25]:
wine_red_with_calculated_noise = wine_red.copy(deep=True)
wine_red_with_calculated_noise.loc[noise_data_normalized.index]=noise_data_normalized

display(wine_red_with_calculated_noise.loc[noise_data_normalized.index])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
7,7.3,0.65,0.00,1.2,0.07,0.53,15.0,21.0,1.0,3.39,0.47,0.89,10.00,0.104,7.0
8,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.10,9.50,0.108,7.0
16,8.5,0.28,0.56,1.8,0.09,0.53,13.0,103.0,1.0,3.30,0.75,0.93,10.50,0.096,7.0
36,7.8,0.60,0.14,2.4,0.09,0.53,3.0,15.0,1.0,3.31,0.60,0.75,10.80,0.098,6.0
37,8.1,0.38,0.28,2.1,0.07,0.53,13.0,30.0,1.0,3.23,0.73,0.97,9.70,0.106,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,7.1,0.88,0.05,5.7,0.08,0.53,3.0,14.0,1.0,3.31,0.52,0.96,10.20,0.101,3.0
1481,8.2,0.28,0.60,3.0,0.10,0.53,10.0,22.0,1.0,3.39,0.68,0.01,10.60,0.097,5.0
1505,6.7,0.76,0.02,1.8,0.08,0.53,6.0,12.0,1.0,3.31,0.63,0.18,9.95,0.102,3.0
1537,5.4,0.58,0.08,1.9,0.06,0.53,20.0,31.0,1.0,3.31,0.64,0.82,10.20,0.108,6.0


## REMOVE ALL NOISES

In [26]:
wine_red_without_noise = wine_red.drop(index=noise_data.index)

# Feature Scaling

In [27]:
from datasets import normalize_feature

wine_red_with_calculated_noise = normalize_feature(wine_red_with_calculated_noise, 'total sulfur dioxide')
wine_red_without_noise = normalize_feature(wine_red_without_noise, 'total sulfur dioxide')
wine_red_with_calculated_noise = normalize_feature(wine_red_with_calculated_noise, 'free sulfur dioxide')
wine_red_without_noise = normalize_feature(wine_red_without_noise, 'free sulfur dioxide')

display(wine_red_with_calculated_noise)
display(wine_red_without_noise)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.70,0.00,1.9,0.08,0.53,0.188679,0.187919,1.00,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.00,2.6,0.10,0.53,0.452830,0.409396,1.00,3.20,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,0.264151,0.322148,1.00,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,0.301887,0.362416,1.00,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.70,0.00,1.9,0.08,0.53,0.188679,0.187919,1.00,3.51,0.56,0.91,9.4,0.107,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,0.584906,0.255034,0.99,3.31,0.58,0.98,10.5,0.102,5.0
1593,5.9,0.55,0.10,2.2,0.06,0.53,0.716981,0.302013,1.00,3.52,0.76,0.82,11.2,0.090,6.0
1594,6.3,0.51,0.13,2.3,0.08,0.53,0.528302,0.228188,1.00,3.42,0.75,0.71,11.0,0.095,6.0
1595,5.9,0.65,0.12,2.0,0.08,0.53,0.584906,0.255034,1.00,3.57,0.71,0.33,10.2,0.104,5.0


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,7.4,0.70,0.00,1.9,0.08,0.53,0.188679,0.187919,1.00,3.51,0.56,0.86,9.4,0.109,5.0
1,7.8,0.88,0.00,2.6,0.10,0.53,0.452830,0.409396,1.00,3.20,0.68,0.56,9.8,0.107,5.0
2,7.8,0.76,0.04,2.3,0.09,0.53,0.264151,0.322148,1.00,3.26,0.65,0.47,9.8,0.106,5.0
3,11.2,0.28,0.56,1.9,0.08,0.53,0.301887,0.362416,1.00,3.16,0.58,0.33,9.8,0.111,6.0
4,7.4,0.70,0.00,1.9,0.08,0.53,0.188679,0.187919,1.00,3.51,0.56,0.91,9.4,0.107,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.2,0.60,0.08,2.0,0.09,0.53,0.584906,0.255034,0.99,3.31,0.58,0.98,10.5,0.102,5.0
1593,5.9,0.55,0.10,2.2,0.06,0.53,0.716981,0.302013,1.00,3.52,0.76,0.82,11.2,0.090,6.0
1594,6.3,0.51,0.13,2.3,0.08,0.53,0.528302,0.228188,1.00,3.42,0.75,0.71,11.0,0.095,6.0
1595,5.9,0.65,0.12,2.0,0.08,0.53,0.584906,0.255034,1.00,3.57,0.71,0.33,10.2,0.104,5.0


In [None]:

wine_red_with_calculated_noise = normalize_feature(wine_red_with_calculated_noise, 'total sulfur dioxide')
wine_red_without_noise = normalize_feature(wine_red_without_noise, 'total sulfur dioxide')
wine_red_with_calculated_noise = normalize_feature(wine_red_with_calculated_noise, 'free sulfur dioxide')
wine_red_without_noise = normalize_feature(wine_red_without_noise, 'free sulfur dioxide')

display(wine_red_with_calculated_noise)
display(wine_red_without_noise)

# Feature selection

calculate information gain and vif after handling missing values. Compare it to results from exploration phase

In [11]:
from datasets import get_kbest_sorted
from sklearn.feature_selection import mutual_info_regression

print('Information Gain without noise')
x = wine_red_without_noise.drop('quality', axis=1)
y = wine_red_without_noise['quality']
display(get_kbest_sorted(mutual_info_regression, x, y, 25))

print('Information Gain with calculated noise')
x = wine_red_with_calculated_noise.drop('quality', axis=1)
y = wine_red_with_calculated_noise['quality']
display(get_kbest_sorted(mutual_info_regression, x, y, 25))

Information Gain without noise


Unnamed: 0,score
alcohol,0.189062
lightness,0.159752
sulphates,0.114325
volatile acidity,0.110003
total sulfur dioxide,0.087028
citric acid,0.077988
fixed acidity,0.045807
chlorides,0.027774
pH,0.025088
density,0.022541


Information Gain with calculated noise


Unnamed: 0,score
alcohol,0.182394
lightness,0.128375
sulphates,0.117595
volatile acidity,0.114171
total sulfur dioxide,0.085487
citric acid,0.069958
fixed acidity,0.058059
pH,0.057919
density,0.024662
residual sugar,0.022913


In [12]:
from datasets import get_vifs

x = wine_red_without_noise.drop('quality', axis=1)
vif_factors = get_vifs(x)
display('VIF Factors without noise', vif_factors)
x = wine_red_with_calculated_noise.drop('quality', axis=1)
vif_factors = get_vifs(x)
display('VIF Factors with calculated noise', vif_factors)

'VIF Factors without noise'

fixed acidity                2.33
volatile acidity             1.83
citric acid                  3.20
residual sugar               1.10
chlorides                    1.11
flavanoids              150997.92
free sulfur dioxide          1.92
total sulfur dioxide         2.18
density                      1.56
pH                           1.00
sulphates                    1.23
magnesium                    1.01
alcohol                     10.01
lightness                    9.37
dtype: float64

'VIF Factors with calculated noise'

fixed acidity                2.21
volatile acidity             1.81
citric acid                  3.05
residual sugar               1.10
chlorides                    1.11
flavanoids              151523.19
free sulfur dioxide          1.92
total sulfur dioxide         2.17
density                      1.50
pH                           1.00
sulphates                    1.22
magnesium                    1.00
alcohol                      8.45
lightness                    7.85
dtype: float64

missing values don't seem to have a big effect on information gain and vif

In [13]:
from datasets import get_logist_regression_kfold_score

X = wine_red.drop('quality', axis=1)
y = wine_red['quality']
score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)

selected_features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','magnesium','alcohol','lightness','quality']
selected_features = ['fixed acidity','citric acid','chlorides','total sulfur dioxide','sulphates','alcohol','quality']
wine_red_without_noise_selected = wine_red_without_noise[selected_features]
wine_red_with_calculated_noise_selected = wine_red_with_calculated_noise[selected_features]

X = wine_red_without_noise_selected.drop('quality', axis=1)
y = wine_red_without_noise_selected['quality']

score_after = get_logist_regression_kfold_score(X,y)
print('Score after without noise: ', score_after)

X = wine_red_with_calculated_noise_selected.drop('quality', axis=1)
y = wine_red_with_calculated_noise_selected['quality']

score_after = get_logist_regression_kfold_score(X,y)
print('Score after with calculated noise: ', score_after)

wine_red_without_noise = wine_red_without_noise[selected_features]
wine_red_with_calculated_noise = wine_red_with_calculated_noise[selected_features]

Score before:  0.5673202614379085
Score after without noise:  0.6040221914008321
Score after with calculated noise:  0.5777777777777777


Compare result to PCA 

In [14]:
import numpy as np
import pandas as pd
from datasets import get_logist_regression_kfold_score
from sklearn import decomposition

#Perform PCA with 3 dimensions
num_components=3
pca = decomposition.PCA(n_components=2)
#Only use features
x=wine_red_without_noise.drop('quality', axis=1)
pca.fit(x)
x_tran = pca.transform(x)
y_tran = wine_red_without_noise['quality']

X = wine_red.drop('quality', axis=1)
y = wine_red['quality']
score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)
score_after = get_logist_regression_kfold_score(x_tran,y_tran)
print('Score after without noise: ', score_after)

pca = decomposition.PCA(n_components=2)
#Only use features
x=wine_red_with_calculated_noise.drop('quality', axis=1)
pca.fit(x)
x_tran = pca.transform(x)
y_tran = wine_red_with_calculated_noise['quality']

score_after = get_logist_regression_kfold_score(x_tran,y_tran)
print('Score after with calculated noise: ', score_after)

Score before:  0.5673202614379085
Score after without noise:  0.5880721220527045
Score after with calculated noise:  0.5581699346405229


## Forward Selection

In [67]:
from datasets import get_logist_regression_kfold_score

def perform_forward_selection(data, columns):
    score = 0
    columns_to_use = []
    for column in columns:
        columns_to_use.append(column)
        x = data[columns_to_use] 
        y = data['quality']
        score_temp = get_logist_regression_kfold_score(x,y)
        if score_temp - score < 0:
            columns_to_use.remove(column)
            print(f"skipping feature {column}")
            continue
        score = score_temp
        print(f"Score with features {columns_to_use}: {score}")
        return columns_to_use

columns = wine_red_without_noise.columns.tolist()
columns.remove('quality')
perform_forward_selection(wine_red_without_noise, columns)

columns = wine_red_with_calculated_noise.columns.tolist()
columns.remove('quality')
perform_forward_selection(wine_red_with_calculated_noise, columns)

Score with features ['fixed acidity']: 0.44375
Score with features ['fixed acidity', 'volatile acidity']: 0.5090277777777777
skipping feature citric acid
skipping feature residual sugar
skipping feature chlorides
skipping feature flavanoids
skipping feature free sulfur dioxide
Score with features ['fixed acidity', 'volatile acidity', 'total sulfur dioxide']: 0.5375
skipping feature density
skipping feature pH
Score with features ['fixed acidity', 'volatile acidity', 'total sulfur dioxide', 'sulphates']: 0.5715277777777777
skipping feature magnesium
Score with features ['fixed acidity', 'volatile acidity', 'total sulfur dioxide', 'sulphates', 'alcohol']: 0.6145833333333334
Score with features ['fixed acidity', 'volatile acidity', 'total sulfur dioxide', 'sulphates', 'alcohol', 'lightness']: 0.6173611111111111
Score with features ['fixed acidity']: 0.4181937172774869
Score with features ['fixed acidity', 'volatile acidity']: 0.4829842931937173
skipping feature citric acid
Score with feat

# Evaluation

In [17]:
from datasets import wine_red_dataset, get_logist_regression_kfold_score

wine_red_pure = wine_red_dataset().dropna()

x = wine_red_pure.drop('quality', axis=1).drop('ID', axis=1)
y = wine_red_pure['quality']
score_before = get_logist_regression_kfold_score(x,y)
print('Score before preprocessing', score_before)

x = wine_red_without_noise.drop('quality', axis=1)
y = wine_red_without_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing without noise', score_after)

x = wine_red_with_calculated_noise.drop('quality', axis=1)
y = wine_red_with_calculated_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing with calculated noise', score_after)

Score before preprocessing 0.5533546325878594
Score after preprocessing without noise 0.6040221914008321
Score after preprocessing with calculated noise 0.5777777777777777
