In [80]:
from datasets import wine_white_dataset, wine_white_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

wine_white = wine_white_dataset().drop('ID', axis=1)

wine_white.dropna(how='all', inplace=True)

# Modeling-Missing Values

Information from the exploration: Missing values in the pH (15 values) and fixed acidity (17 values) data

Fixed acidity is a missing at random value: In our case, missing fixed acidity values can be calculated from the citric acid values.     PH seem to be missing completely at random: There is no major correlation with other features

Fixed acidity missing values are calculated by a multiple imputation because the estimation can provide
more realistic standard errors. The multiple Imputation creates multiple data
with different estimations. The results of different models are
averaged or combined.

In [81]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from datasets import show_mvs

columns = ['fixed acidity','pH']
w_white_imp=wine_white[columns]
mvs = show_mvs(w_white_imp)

display(w_white_imp)

imp = IterativeImputer(max_iter=20)
transformed_x= imp.fit_transform(w_white_imp)
df_missing_v=pd.DataFrame(transformed_x, columns=columns)
display("Regression imputed pH: \n", df_missing_v.loc[mvs.index])
wine_white['pH'] = df_missing_v['pH']
show_mvs(wine_white)
print()

Unnamed: 0,fixed acidity,pH
75,7.4,
76,7.1,
77,7.0,
152,6.9,
263,7.2,
264,6.2,
330,6.4,
331,6.8,
439,6.2,
520,7.9,


Unnamed: 0,fixed acidity,pH
0,7.0,3.00
1,6.3,3.30
2,8.1,3.26
3,7.2,3.19
4,7.2,3.19
...,...,...
4891,6.2,3.27
4892,6.6,3.15
4893,6.5,2.99
4894,5.5,3.34


'Regression imputed pH: \n'

Unnamed: 0,fixed acidity,pH
75,7.4,3.146776
76,7.1,3.169643
77,7.0,3.177265
152,6.9,3.184887
263,7.2,3.16202
264,6.2,3.238244
330,6.4,3.222999
331,6.8,3.19251
439,6.2,3.238244
520,7.9,3.108664


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality





# Outliers

Outlier Detection

In [82]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import IsolationForest
from datasets import get_logist_regression_kfold_score, show_mvs

display(wine_white.shape)
X = wine_white.drop('quality', axis=1)
y = wine_white['quality']

iso = IsolationForest(contamination=0.05)
y_out = iso.fit_predict(X)

score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)

# build a mask to select all rows that are not outliers (inlier=1, outlier=-1)
mask = y_out != -1
X_inlier, y_inlier = X[mask], y[mask]
X_outliers, y_outliers = X[~mask], y[~mask]

score_after = get_logist_regression_kfold_score(X_inlier,y_inlier)
print('Score after: ', score_after)

# Inliers vs. Outliers
print("Inliers: ",X_inlier.shape[0],"Outliers:",X_outliers.shape)
display('Outliers data:', X_outliers)
display('Inliers data:', X_inlier)

# wine_white = X_inlier
# wine_white['quality'] = y_inlier

(4896, 15)

Score before:  0.45383986928104575
Score after:  0.4631262094173296
Inliers:  4651 Outliers: (245, 14)


'Outliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
14,8.3,0.42,0.62,19.25,0.040,0.38,41.0,172.0,1.00020,2.98,0.46,0.67,9.7,0.12
17,6.2,0.66,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.53,0.39,12.8,0.09
20,6.2,0.66,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.46,0.39,12.8,0.08
23,7.6,0.67,0.14,1.50,0.074,0.38,25.0,168.0,0.99370,3.05,0.92,0.51,9.3,0.13
98,9.8,0.36,0.46,10.50,0.038,0.38,4.0,83.0,0.99560,2.89,0.27,0.30,10.1,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4845,4.4,0.54,0.09,5.10,0.038,0.38,52.0,97.0,0.99022,3.41,0.75,0.40,12.2,0.09
4865,5.3,0.60,0.34,1.40,0.031,0.38,3.0,60.0,0.98854,3.27,0.27,0.38,13.0,0.10
4875,5.9,0.54,0.00,0.80,0.032,0.38,12.0,82.0,0.99286,3.25,0.75,0.36,8.8,0.12
4881,4.9,0.47,0.17,1.90,0.035,0.38,60.0,148.0,0.98964,3.27,0.43,0.35,11.5,0.09


'Inliers data:'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
0,7.0,0.27,0.36,20.7,0.045,0.38,45.0,170.0,1.00100,3.00,0.46,0.45,8.8,0.13
1,6.3,0.30,0.34,1.6,0.049,0.38,14.0,132.0,0.99400,3.30,0.56,0.49,9.5,0.11
2,8.1,0.28,0.40,6.9,0.050,0.38,30.0,97.0,0.99510,3.26,0.56,0.44,10.1,0.11
3,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.53,0.40,9.9,0.12
4,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.52,0.40,9.9,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,6.2,0.21,0.29,1.6,0.039,0.38,24.0,92.0,0.99114,3.27,0.26,0.50,11.2,0.10
4892,6.6,0.32,0.36,8.0,0.047,0.38,57.0,168.0,0.99490,3.15,0.85,0.46,9.6,0.12
4893,6.5,0.24,0.19,1.2,0.041,0.38,30.0,111.0,0.99254,2.99,0.91,0.46,9.4,0.12
4894,5.5,0.29,0.30,1.1,0.022,0.38,20.0,110.0,0.98869,3.34,0.24,0.38,12.8,0.08


https://www.pluralsight.com/guides/cleaning-up-data-from-outliers

# Noise Handling

Our assumption: The found noise data are feature noises --> jedes feature nach diesem Vorgehen bearbeiten? Oder die 58 einfach Instance löschen oder???

In [83]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict

# Define 3 classifiers
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# define data
x = wine_white.drop('quality', axis=1)
y = wine_white['quality']

# merge 3 classifiers into one voting classifier
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# train voting classifier with k-fold method
y_pred = cross_val_predict(eclf1, x, y, cv=3)

# save predictions, original quality and correct prediction boolean in data frame
result = pd.DataFrame(y_pred, columns=['Prediction'], index=x.index)
result['quality'] = y
# mindestens 2 Abweichung
delta_treshold = 2
result['Correct Prediction']= abs(result['Prediction'] - result['quality']) < delta_treshold

display(result.shape)
display(result)

# select all incorrect predicted data
print('False predictions')
delta_result = result[result['Correct Prediction'] == False]
display(delta_result.shape)
display(delta_result)

display('Outliers data', X_outliers)

outlier_to_delete = []

for outlier in X_outliers.index:
    if not outlier in delta_result.index:
        outlier_to_delete.append(outlier)
display(outlier_to_delete)
wine_white.drop(index=outlier_to_delete, inplace=True)
display(wine_white)
# handle noise

(4896, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
0,5,6,True
1,5,6,True
2,6,6,True
3,5,6,True
4,5,6,True
...,...,...,...
4891,6,6,True
4892,6,5,True
4893,6,6,True
4894,7,7,True


False predictions


(363, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
16,4,6,False
17,4,8,False
20,4,8,False
22,6,8,False
31,4,6,False
...,...,...,...
4772,6,4,False
4802,7,4,False
4837,7,4,False
4850,7,5,False


'Outliers data'

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
14,8.3,0.42,0.62,19.25,0.040,0.38,41.0,172.0,1.00020,2.98,0.46,0.67,9.7,0.12
17,6.2,0.66,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.53,0.39,12.8,0.09
20,6.2,0.66,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.46,0.39,12.8,0.08
23,7.6,0.67,0.14,1.50,0.074,0.38,25.0,168.0,0.99370,3.05,0.92,0.51,9.3,0.13
98,9.8,0.36,0.46,10.50,0.038,0.38,4.0,83.0,0.99560,2.89,0.27,0.30,10.1,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4845,4.4,0.54,0.09,5.10,0.038,0.38,52.0,97.0,0.99022,3.41,0.75,0.40,12.2,0.09
4865,5.3,0.60,0.34,1.40,0.031,0.38,3.0,60.0,0.98854,3.27,0.27,0.38,13.0,0.10
4875,5.9,0.54,0.00,0.80,0.032,0.38,12.0,82.0,0.99286,3.25,0.75,0.36,8.8,0.12
4881,4.9,0.47,0.17,1.90,0.035,0.38,60.0,148.0,0.98964,3.27,0.43,0.35,11.5,0.09


[14,
 23,
 110,
 115,
 147,
 169,
 178,
 182,
 194,
 195,
 196,
 207,
 403,
 433,
 459,
 484,
 506,
 528,
 620,
 686,
 700,
 728,
 756,
 761,
 762,
 765,
 807,
 872,
 877,
 914,
 925,
 945,
 969,
 978,
 979,
 981,
 1023,
 1035,
 1042,
 1050,
 1098,
 1113,
 1151,
 1157,
 1162,
 1177,
 1216,
 1244,
 1253,
 1303,
 1368,
 1371,
 1385,
 1393,
 1422,
 1435,
 1453,
 1486,
 1533,
 1535,
 1539,
 1543,
 1550,
 1579,
 1589,
 1595,
 1597,
 1602,
 1607,
 1637,
 1652,
 1662,
 1663,
 1721,
 1726,
 1731,
 1774,
 1806,
 1808,
 1830,
 1834,
 1847,
 1855,
 1899,
 1924,
 1925,
 1939,
 1941,
 1950,
 1957,
 1962,
 1972,
 1994,
 1996,
 1997,
 2005,
 2023,
 2024,
 2049,
 2091,
 2097,
 2107,
 2153,
 2161,
 2163,
 2185,
 2249,
 2268,
 2278,
 2320,
 2348,
 2370,
 2377,
 2393,
 2413,
 2418,
 2421,
 2423,
 2593,
 2624,
 2628,
 2631,
 2633,
 2636,
 2653,
 2720,
 2780,
 2819,
 2848,
 2871,
 2873,
 2892,
 2944,
 2961,
 2983,
 2989,
 3006,
 3007,
 3041,
 3148,
 3218,
 3305,
 3411,
 3418,
 3456,
 3495,
 3554,
 3569,
 3

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,7.0,0.27,0.36,20.7,0.045,0.38,45.0,170.0,1.00100,3.00,0.46,0.45,8.8,0.13,6
1,6.3,0.30,0.34,1.6,0.049,0.38,14.0,132.0,0.99400,3.30,0.56,0.49,9.5,0.11,6
2,8.1,0.28,0.40,6.9,0.050,0.38,30.0,97.0,0.99510,3.26,0.56,0.44,10.1,0.11,6
3,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.53,0.40,9.9,0.12,6
4,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.52,0.40,9.9,0.10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,6.2,0.21,0.29,1.6,0.039,0.38,24.0,92.0,0.99114,3.27,0.26,0.50,11.2,0.10,6
4892,6.6,0.32,0.36,8.0,0.047,0.38,57.0,168.0,0.99490,3.15,0.85,0.46,9.6,0.12,5
4893,6.5,0.24,0.19,1.2,0.041,0.38,30.0,111.0,0.99254,2.99,0.91,0.46,9.4,0.12,6
4894,5.5,0.29,0.30,1.1,0.022,0.38,20.0,110.0,0.98869,3.34,0.24,0.38,12.8,0.08,7


In [72]:
# pd.set_option('display.max_rows', None)
display(wine_white)
# pd.set_option('display.max_rows', 32)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,7.0,0.27,0.36,20.7,0.045,0.38,45.0,170.0,1.00100,3.00,0.46,0.45,8.8,0.13,6
1,6.3,0.30,0.34,1.6,0.049,0.38,14.0,132.0,0.99400,3.30,0.56,0.49,9.5,0.11,6
2,8.1,0.28,0.40,6.9,0.050,0.38,30.0,97.0,0.99510,3.26,0.56,0.44,10.1,0.11,6
3,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.53,0.40,9.9,0.12,6
4,7.2,0.23,0.32,8.5,0.058,0.38,47.0,186.0,0.99560,3.19,0.52,0.40,9.9,0.10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,6.2,0.21,0.29,1.6,0.039,0.38,24.0,92.0,0.99114,3.27,0.26,0.50,11.2,0.10,6
4892,6.6,0.32,0.36,8.0,0.047,0.38,57.0,168.0,0.99490,3.15,0.85,0.46,9.6,0.12,5
4893,6.5,0.24,0.19,1.2,0.041,0.38,30.0,111.0,0.99254,2.99,0.91,0.46,9.4,0.12,6
4894,5.5,0.29,0.30,1.1,0.022,0.38,20.0,110.0,0.98869,3.34,0.24,0.38,12.8,0.08,7


In [84]:
from sklearn.preprocessing import MinMaxScaler
# normalize data without noise as a reference value
scaler = MinMaxScaler()
without_noise = wine_white.drop('quality', axis=1).drop(index=delta_result.index)
# calculate medians for features of the normalized data without noise
medians_all = pd.DataFrame(scaler.fit_transform(without_noise), columns = without_noise.columns, index=without_noise.index).describe().loc['50%']
medians_all_data = without_noise.describe().loc['50%']

# set minimum threshold for noise identification as difference between value and median
threshold = 0.5

# normalize noise data for comparison with median
scaler = MinMaxScaler()
noise_data = wine_white.loc[delta_result.index].drop('quality', axis=1)
noise_data_normalized = pd.DataFrame(scaler.fit_transform(noise_data), columns=noise_data.columns, index=noise_data.index)
for row in noise_data_normalized.index:
    for column in noise_data_normalized.columns:
        col_row_value = noise_data_normalized[column][row]
        if abs(medians_all.loc[column] - col_row_value) > threshold:
            noise_data_normalized.at[row, column] = medians_all_data.loc[column]
        else:
            noise_data_normalized.at[row, column] = noise_data.at[row, column]

# drop all data that are no noise
noise_data_normalized = noise_data_normalized.dropna(how='all')

# display sum of noisy identified data by features
# display(noise_data_normalized.count())

# display noisy features
display(noise_data_normalized)

noise_data_normalized['quality'] = wine_white.loc[delta_result.index]['quality']

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
16,6.3,0.480,0.04,1.10,0.046,0.38,30.0,99.0,0.99280,3.24,0.56,0.36,9.6,0.11
17,6.2,0.260,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.53,0.39,10.4,0.09
20,6.2,0.260,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.46,0.39,10.4,0.11
22,6.8,0.260,0.42,1.70,0.049,0.38,41.0,122.0,0.99300,3.47,0.52,0.48,10.5,0.10
31,8.3,0.140,0.34,1.10,0.042,0.38,7.0,47.0,0.99340,3.47,0.90,0.40,10.2,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4772,7.1,0.240,0.34,1.20,0.045,0.38,6.0,132.0,0.99132,3.16,0.42,0.46,11.2,0.10
4802,6.0,0.350,0.46,0.90,0.033,0.38,9.0,65.0,0.98934,3.24,0.75,0.35,12.1,0.09
4837,5.2,0.405,0.15,1.45,0.038,0.38,10.0,44.0,0.99125,3.52,0.23,0.40,11.6,0.09
4850,6.2,0.230,0.38,1.60,0.044,0.38,12.0,113.0,0.99176,3.30,0.58,0.73,11.4,0.10


## REPLACE NOISES WITH CALCULATED VALUES

In [85]:
wine_white_with_calculated_noise = wine_white.copy(deep=True)
wine_white_with_calculated_noise.loc[noise_data_normalized.index]=noise_data_normalized

display(wine_white_with_calculated_noise.loc[noise_data_normalized.index])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
16,6.3,0.480,0.04,1.10,0.046,0.38,30.0,99.0,0.99280,3.24,0.56,0.36,9.6,0.11,6
17,6.2,0.260,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.53,0.39,10.4,0.09,8
20,6.2,0.260,0.48,1.20,0.029,0.38,29.0,75.0,0.98920,3.33,0.46,0.39,10.4,0.11,8
22,6.8,0.260,0.42,1.70,0.049,0.38,41.0,122.0,0.99300,3.47,0.52,0.48,10.5,0.10,8
31,8.3,0.140,0.34,1.10,0.042,0.38,7.0,47.0,0.99340,3.47,0.90,0.40,10.2,0.11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4772,7.1,0.240,0.34,1.20,0.045,0.38,6.0,132.0,0.99132,3.16,0.42,0.46,11.2,0.10,4
4802,6.0,0.350,0.46,0.90,0.033,0.38,9.0,65.0,0.98934,3.24,0.75,0.35,12.1,0.09,4
4837,5.2,0.405,0.15,1.45,0.038,0.38,10.0,44.0,0.99125,3.52,0.23,0.40,11.6,0.09,4
4850,6.2,0.230,0.38,1.60,0.044,0.38,12.0,113.0,0.99176,3.30,0.58,0.73,11.4,0.10,5


## REMOVE ALL NOISES

In [86]:
wine_white_without_noise = wine_white.drop(index=noise_data.index)

# Feature Scaling

Normalize only total sulfur dioxide and free sulfur dioxide

In [87]:
from datasets import normalize_feature


wine_white_with_calculated_noise = normalize_feature(wine_white_with_calculated_noise, 'total sulfur dioxide')
wine_white_without_noise = normalize_feature(wine_white_without_noise, 'total sulfur dioxide')
wine_white_with_calculated_noise = normalize_feature(wine_white_with_calculated_noise, 'free sulfur dioxide')
wine_white_without_noise = normalize_feature(wine_white_without_noise, 'free sulfur dioxide')

display(wine_white_with_calculated_noise)
display(wine_white_without_noise)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,7.0,0.27,0.36,20.7,0.045,0.38,0.356846,0.480597,1.00100,3.00,0.46,0.45,8.8,0.13,6
1,6.3,0.30,0.34,1.6,0.049,0.38,0.099585,0.367164,0.99400,3.30,0.56,0.49,9.5,0.11,6
2,8.1,0.28,0.40,6.9,0.050,0.38,0.232365,0.262687,0.99510,3.26,0.56,0.44,10.1,0.11,6
3,7.2,0.23,0.32,8.5,0.058,0.38,0.373444,0.528358,0.99560,3.19,0.53,0.40,9.9,0.12,6
4,7.2,0.23,0.32,8.5,0.058,0.38,0.373444,0.528358,0.99560,3.19,0.52,0.40,9.9,0.10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,6.2,0.21,0.29,1.6,0.039,0.38,0.182573,0.247761,0.99114,3.27,0.26,0.50,11.2,0.10,6
4892,6.6,0.32,0.36,8.0,0.047,0.38,0.456432,0.474627,0.99490,3.15,0.85,0.46,9.6,0.12,5
4893,6.5,0.24,0.19,1.2,0.041,0.38,0.232365,0.304478,0.99254,2.99,0.91,0.46,9.4,0.12,6
4894,5.5,0.29,0.30,1.1,0.022,0.38,0.149378,0.301493,0.98869,3.34,0.24,0.38,12.8,0.08,7


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,7.0,0.27,0.36,20.7,0.045,0.38,0.356846,0.449367,1.00100,3.00,0.46,0.45,8.8,0.13,6
1,6.3,0.30,0.34,1.6,0.049,0.38,0.099585,0.329114,0.99400,3.30,0.56,0.49,9.5,0.11,6
2,8.1,0.28,0.40,6.9,0.050,0.38,0.232365,0.218354,0.99510,3.26,0.56,0.44,10.1,0.11,6
3,7.2,0.23,0.32,8.5,0.058,0.38,0.373444,0.500000,0.99560,3.19,0.53,0.40,9.9,0.12,6
4,7.2,0.23,0.32,8.5,0.058,0.38,0.373444,0.500000,0.99560,3.19,0.52,0.40,9.9,0.10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,6.2,0.21,0.29,1.6,0.039,0.38,0.182573,0.202532,0.99114,3.27,0.26,0.50,11.2,0.10,6
4892,6.6,0.32,0.36,8.0,0.047,0.38,0.456432,0.443038,0.99490,3.15,0.85,0.46,9.6,0.12,5
4893,6.5,0.24,0.19,1.2,0.041,0.38,0.232365,0.262658,0.99254,2.99,0.91,0.46,9.4,0.12,6
4894,5.5,0.29,0.30,1.1,0.022,0.38,0.149378,0.259494,0.98869,3.34,0.24,0.38,12.8,0.08,7


Normalize all features

In [76]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
wine_white_normalized = pd.DataFrame(scaler.fit_transform(wine_white_without_noise.drop('quality',axis=1)), columns=wine_white_without_noise.columns.drop('quality'), index=wine_white_without_noise.index)
wine_white_without_noise[wine_white_normalized.columns] = wine_white_normalized
display(wine_white_normalized)
display(wine_white_without_noise)

scaler = MinMaxScaler()
wine_white_normalized = pd.DataFrame(scaler.fit_transform(wine_white_with_calculated_noise.drop('quality',axis=1)), columns=wine_white_with_calculated_noise.columns.drop('quality'), index=wine_white_with_calculated_noise.index)
wine_white_with_calculated_noise[wine_white_normalized.columns] = wine_white_normalized
display(wine_white_normalized)
display(wine_white_with_calculated_noise)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
0,0.265306,0.246753,0.395604,0.789784,0.144737,0.0,0.356846,0.449367,0.876894,0.203883,0.46,0.267442,0.068966,0.857143
1,0.193878,0.285714,0.373626,0.039293,0.162281,0.0,0.099585,0.329114,0.434975,0.495146,0.56,0.313953,0.189655,0.571429
2,0.377551,0.259740,0.439560,0.247544,0.166667,0.0,0.232365,0.218354,0.504419,0.456311,0.56,0.255814,0.293103,0.571429
3,0.285714,0.194805,0.351648,0.310413,0.201754,0.0,0.373444,0.500000,0.535985,0.388350,0.53,0.209302,0.258621,0.714286
4,0.285714,0.194805,0.351648,0.310413,0.201754,0.0,0.373444,0.500000,0.535985,0.388350,0.52,0.209302,0.258621,0.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,0.183673,0.168831,0.318681,0.039293,0.118421,0.0,0.182573,0.202532,0.254419,0.466019,0.26,0.325581,0.482759,0.428571
4892,0.224490,0.311688,0.395604,0.290766,0.153509,0.0,0.456432,0.443038,0.491793,0.349515,0.85,0.279070,0.206897,0.714286
4893,0.214286,0.207792,0.208791,0.023576,0.127193,0.0,0.232365,0.262658,0.342803,0.194175,0.91,0.279070,0.172414,0.714286
4894,0.112245,0.272727,0.329670,0.019646,0.043860,0.0,0.149378,0.259494,0.099747,0.533981,0.24,0.186047,0.758621,0.142857


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,0.265306,0.246753,0.395604,0.789784,0.144737,0.0,0.356846,0.449367,0.876894,0.203883,0.46,0.267442,0.068966,0.857143,6
1,0.193878,0.285714,0.373626,0.039293,0.162281,0.0,0.099585,0.329114,0.434975,0.495146,0.56,0.313953,0.189655,0.571429,6
2,0.377551,0.259740,0.439560,0.247544,0.166667,0.0,0.232365,0.218354,0.504419,0.456311,0.56,0.255814,0.293103,0.571429,6
3,0.285714,0.194805,0.351648,0.310413,0.201754,0.0,0.373444,0.500000,0.535985,0.388350,0.53,0.209302,0.258621,0.714286,6
4,0.285714,0.194805,0.351648,0.310413,0.201754,0.0,0.373444,0.500000,0.535985,0.388350,0.52,0.209302,0.258621,0.428571,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,0.183673,0.168831,0.318681,0.039293,0.118421,0.0,0.182573,0.202532,0.254419,0.466019,0.26,0.325581,0.482759,0.428571,6
4892,0.224490,0.311688,0.395604,0.290766,0.153509,0.0,0.456432,0.443038,0.491793,0.349515,0.85,0.279070,0.206897,0.714286,5
4893,0.214286,0.207792,0.208791,0.023576,0.127193,0.0,0.232365,0.262658,0.342803,0.194175,0.91,0.279070,0.172414,0.714286,6
4894,0.112245,0.272727,0.329670,0.019646,0.043860,0.0,0.149378,0.259494,0.099747,0.533981,0.24,0.186047,0.758621,0.142857,7


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness
0,0.28,0.246753,0.395604,0.789784,0.155844,0.0,0.356846,0.480597,0.876894,0.240741,0.46,0.267442,0.129032,0.857143
1,0.21,0.285714,0.373626,0.039293,0.173160,0.0,0.099585,0.367164,0.434975,0.518519,0.56,0.313953,0.241935,0.571429
2,0.39,0.259740,0.439560,0.247544,0.177489,0.0,0.232365,0.262687,0.504419,0.481481,0.56,0.255814,0.338710,0.571429
3,0.30,0.194805,0.351648,0.310413,0.212121,0.0,0.373444,0.528358,0.535985,0.416667,0.53,0.209302,0.306452,0.714286
4,0.30,0.194805,0.351648,0.310413,0.212121,0.0,0.373444,0.528358,0.535985,0.416667,0.52,0.209302,0.306452,0.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,0.20,0.168831,0.318681,0.039293,0.129870,0.0,0.182573,0.247761,0.254419,0.490741,0.26,0.325581,0.516129,0.428571
4892,0.24,0.311688,0.395604,0.290766,0.164502,0.0,0.456432,0.474627,0.491793,0.379630,0.85,0.279070,0.258065,0.714286
4893,0.23,0.207792,0.208791,0.023576,0.138528,0.0,0.232365,0.304478,0.342803,0.231481,0.91,0.279070,0.225806,0.714286
4894,0.13,0.272727,0.329670,0.019646,0.056277,0.0,0.149378,0.301493,0.099747,0.555556,0.24,0.186047,0.774194,0.142857


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,magnesium,sulphates,alcohol,lightness,quality
0,0.28,0.246753,0.395604,0.789784,0.155844,0.0,0.356846,0.480597,0.876894,0.240741,0.46,0.267442,0.129032,0.857143,6
1,0.21,0.285714,0.373626,0.039293,0.173160,0.0,0.099585,0.367164,0.434975,0.518519,0.56,0.313953,0.241935,0.571429,6
2,0.39,0.259740,0.439560,0.247544,0.177489,0.0,0.232365,0.262687,0.504419,0.481481,0.56,0.255814,0.338710,0.571429,6
3,0.30,0.194805,0.351648,0.310413,0.212121,0.0,0.373444,0.528358,0.535985,0.416667,0.53,0.209302,0.306452,0.714286,6
4,0.30,0.194805,0.351648,0.310413,0.212121,0.0,0.373444,0.528358,0.535985,0.416667,0.52,0.209302,0.306452,0.428571,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,0.20,0.168831,0.318681,0.039293,0.129870,0.0,0.182573,0.247761,0.254419,0.490741,0.26,0.325581,0.516129,0.428571,6
4892,0.24,0.311688,0.395604,0.290766,0.164502,0.0,0.456432,0.474627,0.491793,0.379630,0.85,0.279070,0.258065,0.714286,5
4893,0.23,0.207792,0.208791,0.023576,0.138528,0.0,0.232365,0.304478,0.342803,0.231481,0.91,0.279070,0.225806,0.714286,6
4894,0.13,0.272727,0.329670,0.019646,0.056277,0.0,0.149378,0.301493,0.099747,0.555556,0.24,0.186047,0.774194,0.142857,7


# Feature selection

calculate information gain and vif after handling missing values. Compare it to results from exploration phase

In [33]:
from datasets import get_kbest_sorted
from sklearn.feature_selection import mutual_info_regression

print('Information Gain without noise')
x = wine_white_without_noise.drop('quality', axis=1)
y = wine_white_without_noise['quality']
display(get_kbest_sorted(mutual_info_regression, x, y, 25))

print('Information Gain with calculated noise')
x = wine_white_with_calculated_noise.drop('quality', axis=1)
y = wine_white_with_calculated_noise['quality']
display(get_kbest_sorted(mutual_info_regression, x, y, 25))

Information Gain without noise


Unnamed: 0,score
alcohol,0.201305
density,0.196914
lightness,0.129563
total sulfur dioxide,0.094008
residual sugar,0.092005
chlorides,0.087542
citric acid,0.060735
volatile acidity,0.059102
free sulfur dioxide,0.041418
sulphates,0.032697


Information Gain with calculated noise


Unnamed: 0,score
density,0.192756
alcohol,0.159634
residual sugar,0.10002
lightness,0.089598
total sulfur dioxide,0.088743
chlorides,0.065587
free sulfur dioxide,0.057472
citric acid,0.056653
volatile acidity,0.047263
pH,0.031296


In [34]:
from datasets import get_vifs

x = wine_white_without_noise.drop('quality', axis=1)
vif_factors = get_vifs(x)
display('VIF Factors without noise', vif_factors)
x = wine_white_with_calculated_noise.drop('quality', axis=1)
vif_factors = get_vifs(x)
display('VIF Factors with calculated noise', vif_factors)

'VIF Factors without noise'

fixed acidity                 3.24
volatile acidity              1.12
citric acid                   1.14
residual sugar               16.25
chlorides                     1.27
flavanoids              4979530.50
free sulfur dioxide           1.78
total sulfur dioxide          2.23
density                      39.61
pH                            2.47
magnesium                     1.00
sulphates                     1.16
alcohol                      13.32
lightness                     3.58
dtype: float64

'VIF Factors with calculated noise'

fixed acidity                 2.28
volatile acidity              1.11
citric acid                   1.13
residual sugar                8.26
chlorides                     1.25
flavanoids              2526311.96
free sulfur dioxide           1.75
total sulfur dioxide          2.19
density                      20.51
pH                            1.76
magnesium                     1.00
sulphates                     1.11
alcohol                       8.89
lightness                     3.53
dtype: float64

missing values don't seem to have a big effect on information gain and vif

In [77]:
from datasets import get_logist_regression_kfold_score

X = wine_white.drop('quality', axis=1)
y = wine_white['quality']
score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)

selected_features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','magnesium','alcohol','lightness','quality']
selected_features = ['alcohol', 'density', 'total sulfur dioxide', 'chlorides', 'free sulfur dioxide', 'citric acid', 'volatile acidity', 'quality']
wine_white_without_noise_selected = wine_white_without_noise[selected_features]
wine_white_with_calculated_noise_selected = wine_white_with_calculated_noise[selected_features]

X = wine_white_without_noise_selected.drop('quality', axis=1)
y = wine_white_without_noise_selected['quality']

score_after = get_logist_regression_kfold_score(X,y)
print('Score after without noise: ', score_after)

X = wine_white_with_calculated_noise_selected.drop('quality', axis=1)
y = wine_white_with_calculated_noise_selected['quality']

score_after = get_logist_regression_kfold_score(X,y)
print('Score after with calculated noise: ', score_after)

# wine_white_without_noise = wine_white_without_noise[selected_features]
# wine_white_with_calculated_noise = wine_white_with_calculated_noise[selected_features]

Score before:  0.4567953915084276
Score after without noise:  0.5455596669750231
Score after with calculated noise:  0.5173885214422872


Compare result to PCA 

In [27]:
import numpy as np
import pandas as pd
from datasets import get_logist_regression_kfold_score
from sklearn import decomposition

#Perform PCA with 3 dimensions
num_components=3
pca = decomposition.PCA(n_components=2)
#Only use features
x=wine_white_without_noise.drop('quality', axis=1)
pca.fit(x)
x_tran = pca.transform(x)
y_tran = wine_white_without_noise['quality']

X = wine_white.drop('quality', axis=1)
y = wine_white['quality']
score_before = get_logist_regression_kfold_score(X,y)
print('Score before: ', score_before)
score_after = get_logist_regression_kfold_score(x_tran,y_tran)
print('Score after without noise: ', score_after)

pca = decomposition.PCA(n_components=2)
#Only use features
x=wine_white_with_calculated_noise.drop('quality', axis=1)
pca.fit(x)
x_tran = pca.transform(x)
y_tran = wine_white_with_calculated_noise['quality']

score_after = get_logist_regression_kfold_score(x_tran,y_tran)
print('Score after with calculated noise: ', score_after)

Score before:  0.4548366431774504
Score after without noise:  0.5166666666666667
Score after with calculated noise:  0.4847320093956865


## Forward Selection

In [88]:
from datasets import get_logist_regression_kfold_score

def perform_forward_selection(data, columns):
    score = 0
    columns_to_use = []
    for column in columns:
        columns_to_use.append(column)
        x = data[columns_to_use] 
        y = data['quality']
        score_temp = get_logist_regression_kfold_score(x,y)
        if score_temp - score < 0:
            columns_to_use.remove(column)
            print(f"skipping feature {column}")
            continue
        score = score_temp
        print(f"Score with features {columns_to_use}: {score}")
    return columns_to_use

columns = wine_white_without_noise.columns.tolist()
columns.remove('quality')
columns_to_use_without_noise = perform_forward_selection(wine_white_without_noise, columns)

columns = wine_white_with_calculated_noise.columns.tolist()
columns.remove('quality')
columns_to_use_with_calculated_noise = perform_forward_selection(wine_white_with_calculated_noise, columns)

Score with features ['fixed acidity']: 0.48694848694848697
Score with features ['fixed acidity', 'volatile acidity']: 0.49665049665049665
Score with features ['fixed acidity', 'volatile acidity', 'citric acid']: 0.4982674982674983
skipping feature residual sugar
Score with features ['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides']: 0.5003465003465003
Score with features ['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides', 'flavanoids']: 0.5038115038115039
skipping feature free sulfur dioxide
skipping feature total sulfur dioxide
skipping feature density
skipping feature pH
skipping feature magnesium
skipping feature sulphates
Score with features ['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides', 'flavanoids', 'alcohol']: 0.5470085470085471
Score with features ['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides', 'flavanoids', 'alcohol', 'lightness']: 0.5488565488565489
Score with features ['fixed acidity']: 0.4533248081841432
Scor

# Evaluation

In [89]:
from datasets import wine_white_dataset, get_logist_regression_kfold_score

wine_white_pure = wine_white_dataset().dropna()

x = wine_white_pure.drop('quality', axis=1).drop('ID', axis=1)
y = wine_white_pure['quality']
score_before = get_logist_regression_kfold_score(x,y)
print('Score before preprocessing', score_before)

x = wine_white_without_noise.drop('quality', axis=1)
y = wine_white_without_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing without feature selection without noise', score_after)

x = wine_white_with_calculated_noise.drop('quality', axis=1)
y = wine_white_with_calculated_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing without feature selection calculated noise', score_after)

x = wine_white_without_noise[columns_to_use_without_noise]
y = wine_white_without_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing forward selection without noise', score_after)

x = wine_white_without_noise[columns_to_use_with_calculated_noise]
y = wine_white_without_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing forward selection with calculated noise', score_after)

x = wine_white_without_noise[selected_features].drop('quality', axis=1)
y = wine_white_without_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing feature selection without noise', score_after)

x = wine_white_with_calculated_noise[selected_features].drop('quality', axis=1)
y = wine_white_with_calculated_noise['quality']
score_after = get_logist_regression_kfold_score(x,y)
print('Score after preprocessing feature selection with calculated noise', score_after)



Score before preprocessing 0.46253071253071254
Score after preprocessing without feature selection without noise 0.5416955416955417
Score after preprocessing without feature selection calculated noise 0.5066069906223359
Score after preprocessing forward selection without noise 0.5488565488565489
Score after preprocessing forward selection with calculated noise 0.5370755370755371
Score after preprocessing feature selection without noise 0.5403095403095403
Score after preprocessing feature selection with calculated noise 0.5166240409207161
