In [6]:
from datasets import wine_red_dataset, wine_white_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

wine_red = wine_red_dataset()
wine_red.dropna(how='all', inplace=True)
#display(wine_red)

wine_white = wine_white_dataset()

# Modeling-Missing Values

Information from the exploration: Missing values in the pH (15 values) and fixed acidity (17 values) data

Fixed acidity is a missing at random value: In our case, missing fixed acidity values can be calculated from the citric acid values.     PH seem to be missing completely at random: There is no major correlation with other features

Fixed acidity missing values are calculated by a multiple imputation because the estimation can provide
more realistic standard errors. The multiple Imputation creates multiple data
with different estimations. The results of different models are
averaged or combined.

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from datasets import show_mvs

columns = ['fixed acidity','citric acid']
w_red_imp=wine_red[columns]
mvs = show_mvs(w_red_imp)

display(w_red_imp)

imp = IterativeImputer(max_iter=20)
transformed_x= imp.fit_transform(w_red_imp)
df_missing_v=pd.DataFrame(transformed_x, columns=columns)
display("Regression imputed fixed acidity: \n", df_missing_v.loc[mvs.index])
wine_red['fixed acidity'] = df_missing_v['fixed acidity']
show_mvs(wine_red)
print()

Unnamed: 0,fixed acidity,citric acid
22,,0.21
61,,0.49
97,,0.25
187,,0.1
274,,0.18
409,,0.49
410,,0.34
411,,0.35
412,,0.16
753,,0.1


Unnamed: 0,fixed acidity,citric acid
0,7.4,0.00
1,7.8,0.00
2,7.8,0.04
3,11.2,0.56
4,7.4,0.00
...,...,...
1592,6.2,0.08
1593,5.9,0.10
1594,6.3,0.13
1595,5.9,0.12


'Regression imputed fixed acidity: \n'

Unnamed: 0,fixed acidity,citric acid
22,7.956331,0.21
61,9.638865,0.49
97,8.196693,0.25
187,7.295336,0.1
274,7.77606,0.18
409,9.638865,0.49
410,8.737508,0.34
411,8.797598,0.35
412,7.655879,0.16
753,7.295336,0.1


Unnamed: 0,ID,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
11,12.0,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,,0.8,0.42,10.5,0.101,5.0
606,607.0,9.4,0.41,0.48,4.6,0.07,0.53,10.0,20.0,1.0,,0.79,0.8,12.2,0.088,7.0
607,608.0,8.8,0.48,0.41,3.3,0.09,0.53,26.0,52.0,1.0,,0.53,0.56,10.5,0.096,6.0
608,609.0,10.1,0.65,0.37,5.1,0.11,0.53,11.0,65.0,1.0,,0.64,0.21,10.4,0.105,6.0
948,949.0,8.9,0.12,0.45,1.8,0.08,0.53,10.0,21.0,1.0,,0.76,0.69,11.9,0.086,7.0
1162,1163.0,8.5,0.32,0.42,2.3,0.08,0.53,12.0,19.0,0.99,,0.71,0.39,11.8,0.093,7.0
1163,1164.0,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.58,10.0,0.105,5.0
1164,1165.0,9.0,0.79,0.24,1.7,0.08,0.53,10.0,21.0,1.0,,0.67,0.79,10.0,0.102,5.0
1369,1370.0,6.6,0.61,0.0,1.6,0.07,0.53,4.0,8.0,0.99,,0.37,0.05,10.4,0.103,4.0
1370,1371.0,8.7,0.78,0.51,1.7,0.42,0.53,12.0,66.0,1.0,,1.17,0.91,9.2,0.111,5.0





PH seem to be missing completely at random: There is no major correlation with other features, average imputation is used because it handle missling completely at randoms, ML imputation is not used because the algorithm needs a missing at random, no regression imputation is used because the exploration showed that there is no correlation with other features. The median is used for the average impuation because it's not affected by outliers.

In [8]:
import numpy as np
from sklearn.impute import SimpleImputer
np.set_printoptions(threshold=np.inf)

x= wine_red[['pH']]
#Average Imputation using strategy='median'
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
imp_med.fit(x)
SimpleImputer()
ph_new = imp_med.transform(x)
#display(ph_new)
wine_red['pH']=ph_new
display(wine_red.head(10))

display(wine_red.isna().sum())


Unnamed: 0,ID,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,flavanoids,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,magnesium,alcohol,lightness,quality
0,1.0,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.86,9.4,0.109,5.0
1,2.0,7.8,0.88,0.0,2.6,0.1,0.53,25.0,67.0,1.0,3.2,0.68,0.56,9.8,0.107,5.0
2,3.0,7.8,0.76,0.04,2.3,0.09,0.53,15.0,54.0,1.0,3.26,0.65,0.47,9.8,0.106,5.0
3,4.0,11.2,0.28,0.56,1.9,0.08,0.53,17.0,60.0,1.0,3.16,0.58,0.33,9.8,0.111,6.0
4,5.0,7.4,0.7,0.0,1.9,0.08,0.53,11.0,34.0,1.0,3.51,0.56,0.91,9.4,0.107,5.0
5,6.0,7.4,0.66,0.0,1.8,0.08,0.53,13.0,40.0,1.0,3.51,0.56,0.52,9.4,0.109,5.0
6,7.0,7.9,0.6,0.06,1.6,0.07,0.53,15.0,59.0,1.0,3.3,0.46,0.92,9.4,0.108,5.0
7,8.0,7.3,0.65,0.0,1.2,0.07,0.53,15.0,21.0,0.99,3.39,0.47,0.89,10.0,0.104,7.0
8,9.0,7.8,0.58,0.02,2.0,0.07,0.53,9.0,18.0,1.0,3.36,0.57,0.1,9.5,0.108,7.0
9,10.0,7.5,0.5,0.36,6.1,0.07,0.53,17.0,102.0,1.0,3.35,0.8,0.57,10.5,0.1,5.0


ID                      0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
flavanoids              0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
magnesium               0
alcohol                 0
lightness               0
quality                 0
dtype: int64

# Noise Handling

In [49]:
#!!!!!!!!!!!dropna muss weg wenn die Missing Values behoben sind!!!!!!!


import pandas as pd
#This can be achieved using the cross_val_predict object
from sklearn.model_selection import cross_val_predict
from sklearn import svm

#Perform cross-fold prediction with k=3
clf = svm.SVC(kernel='linear', C=1)
y_pred = cross_val_predict(clf, wine_red.dropna().drop('ID', axis=1).drop('quality', axis=1), wine_red.dropna()[['quality']], cv=3)
#df = pd.DataFrame(wine_red.dropna(),columns=wine_red.drop('ID', axis=1).columns)
#display(y_pred)
df['Prediction']=y_pred
#display(df)

In [46]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

#Show differences between calculated quality of the model and quality values of the dataset
difference= df['Prediction'] - df['quality']
dfn=(difference == -2).sum()
dfp=(difference == 2).sum()
difference_threshold=dfn+dfp

#Print noises (threshold: difference bigger than 2)
print(difference_threshold)
#display(difference)

55


Our assumption: The found noise data are feature noises --> jedes feature nach diesem Vorgehen bearbeiten? Oder die 58 einfach Instance löschen oder???

In [39]:
#!!!!!!!!!!!dropna muss weg wenn die Missing Values behoben sind!!!!!!!



#Ensemble learning
#Our noise filter additionally requires ensemble learning with a majority voting approach
#This is a general example for this approach
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict

# Define 3 classifiers
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# define data
x = wine_red.dropna().drop('ID', axis=1).drop('quality', axis=1)
y = wine_red.dropna()['quality']

# merge 3 classifiers into one voting classifier
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# train voting classifier with k-fold method
y_pred = cross_val_predict(eclf1, x, y, cv=3)

# save predictions, original quality and correct prediction boolean in data frame
result = pd.DataFrame(y_pred, columns=['Prediction'])
result['quality'] = y
delta_treshold = 2.0
result['Correct Prediction']= abs(result['Prediction'] - result['quality']) < delta_treshold

display(result.shape)
display(result)

# select all incorrect predicted data
print('False predictions')
delta_result = result[result['Correct Prediction'] == False]
display(delta_result.shape)
display(delta_result)

(1580, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
0,5.0,5.0,True
1,5.0,5.0,True
2,5.0,5.0,True
3,5.0,6.0,True
4,5.0,5.0,True
5,5.0,5.0,True
6,5.0,5.0,True
7,5.0,7.0,False
8,5.0,7.0,False
9,5.0,5.0,True


False predictions


(182, 3)

Unnamed: 0,Prediction,quality,Correct Prediction
7,5.0,7.0,False
8,5.0,7.0,False
16,5.0,7.0,False
22,5.0,,False
35,4.0,6.0,False
37,5.0,7.0,False
41,6.0,4.0,False
61,5.0,,False
62,5.0,7.0,False
94,6.0,4.0,False


## Outliers

Outlier Detection

In [None]:
from sklearn.ensemble import IsolationForest

# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
y_out = iso.fit_predict(X_train)

# select all rows that are not outliers (inlier=1, outlier=-1)
mask = y_out != -1
X_train_red, y_train_red = X_train[mask, :], y_train[mask]

# Inliers vs. Outliers
print("Inliers: ",X_train_red.shape[0],"Outliers",X_train.shape[0]-X_train_red.shape[0])

# fit the model
model = LinearRegression()
model.fit(X_train_red, y_train_red)

# evaluate the model
y_pred = model.predict(X_test)

# evaluate predictions
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %.3f' % mae)