In [1]:
from datasets import wine_red_dataset, wine_white_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the red and the white wine data set through the project-local loaders.
# Both results are handled as pandas DataFrames by the cells below.
wine_red, wine_white = wine_red_dataset(), wine_white_dataset()

In [2]:
# Temporarily remove rows that contain missing values.
# The names are rebound to the cleaned result instead of using inplace=True,
# so the objects handed out by the loaders are not mutated and re-running
# this cell always yields the same frames.
wine_red = wine_red.dropna()
wine_white = wine_white.dropna()

In [3]:


# Split data into features and label.
# ID is removed because it is not an independent variable; quality is the label.
# 'Prediction' only exists if the cross-validation cell further down has
# already been executed in this kernel. It is excluded here (when present) so
# that re-running this cell never feeds earlier predictions of the label back
# into the feature matrix. A missing ID or quality column still raises.
wine_red_features = (
    wine_red
    .drop(columns=['ID', 'quality'])
    .drop(columns=['Prediction'], errors='ignore')
)
wine_red_labels = wine_red['quality']

print(wine_red_features.shape)
print(wine_red_labels.shape)

# Split data into 70% training and 30% test; fixed seed for reproducibility
(
    wine_red_features_train,
    wine_red_features_test,
    wine_red_labels_train,
    wine_red_labels_test,
) = train_test_split(
    wine_red_features,
    wine_red_labels,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

# Fit the model on the training part only
logistic_regression = LogisticRegression().fit(wine_red_features_train, wine_red_labels_train)

# Put coefficients in a new data frame and display it.
# coef_[0] is the first row of the fitted coefficient matrix.
# NOTE(review): if quality has more than two distinct values, coef_ holds one
# row per class and this table only shows the row belonging to
# logistic_regression.classes_[0] -- confirm that this is intended.
regression_coefficients = pd.DataFrame({
    "Feature": wine_red_features_train.columns.tolist(),
    "Coefficients": logistic_regression.coef_[0],
})
display(regression_coefficients)

# Score the model on the held-out test data. LogisticRegression.score returns
# the mean accuracy (share of correctly classified rows, between 0 and 1),
# not R2.
print('Wine red model score:\n', logistic_regression.score(wine_red_features_test, wine_red_labels_test))



(1565, 14)
(1565,)


Unnamed: 0,Feature,Coefficients
0,fixed acidity,0.090454
1,volatile acidity,0.118453
2,citric acid,-0.019445
3,residual sugar,0.094667
4,chlorides,0.004804
5,flavanoids,0.00263
6,free sulfur dioxide,-0.089241
7,total sulfur dioxide,-0.023063
8,density,0.005025
9,pH,0.031035


Wine red model score:
 0.5468085106382978


Cross Validation - Red Wine

In [5]:
# Imports for the cross validation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

# Cross-validated evaluation of a fresh, unfitted logistic model
lr = LogisticRegression()

# 5-fold cross validation: one accuracy value per fold
scores = cross_val_score(
    estimator=lr,
    X=wine_red_features,
    y=wine_red_labels,
    cv=5,
)
print(" Cross validation accuracy scores:", scores)

# Cross-validated prediction with k=5: every row is predicted by a model
# that was fitted on the folds not containing that row
y_pred = cross_val_predict(
    estimator=lr,
    X=wine_red_features,
    y=wine_red_labels,
    cv=5,
)

# Attach the predictions to the data frame and print the result
wine_red['Prediction'] = y_pred
print(wine_red)

Cross validation accuracy scores: [0.51757188 0.54632588 0.61661342 0.53035144 0.55271565]
          ID  fixed acidity  volatile acidity  citric acid  residual sugar  \
0        1.0            7.4              0.70         0.00             1.9   
1        2.0            7.8              0.88         0.00             2.6   
2        3.0            7.8              0.76         0.04             2.3   
3        4.0           11.2              0.28         0.56             1.9   
4        5.0            7.4              0.70         0.00             1.9   
...      ...            ...               ...          ...             ...   
1591  1594.0            6.8              0.62         0.08             1.9   
1593  1596.0            5.9              0.55         0.10             2.2   
1594  1597.0            6.3              0.51         0.13             2.3   
1595  1598.0            5.9              0.65         0.12             2.0   
1596  1599.0            6.0              0.31      

In [8]:
# Split data into features and label.
# ID is removed because it is not an independent variable; quality is the label.
# 'Prediction' only exists if the white-wine cross-validation cell further
# down has already been executed in this kernel. It is excluded here (when
# present) so that re-running this cell never feeds earlier predictions of the
# label back into the feature matrix. A missing ID or quality column still
# raises.
wine_white_features = (
    wine_white
    .drop(columns=['ID', 'quality'])
    .drop(columns=['Prediction'], errors='ignore')
)
wine_white_labels = wine_white['quality']

print(wine_white_features.shape)
print(wine_white_labels.shape)

# Split data into 70% training and 30% test; fixed seed for reproducibility
(
    wine_white_features_train,
    wine_white_features_test,
    wine_white_labels_train,
    wine_white_labels_test,
) = train_test_split(
    wine_white_features,
    wine_white_labels,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

# Fit the model on the training part only.
# Note: this rebinds logistic_regression (and regression_coefficients below),
# which the red-wine cell also assigns; from here on they refer to white wine.
logistic_regression = LogisticRegression().fit(wine_white_features_train, wine_white_labels_train)

# Put coefficients in a new data frame and display it.
# coef_[0] is the first row of the fitted coefficient matrix.
# NOTE(review): if quality has more than two distinct values, coef_ holds one
# row per class and this table only shows the row belonging to
# logistic_regression.classes_[0] -- confirm that this is intended.
regression_coefficients = pd.DataFrame({
    "Feature": wine_white_features_train.columns.tolist(),
    "Coefficients": logistic_regression.coef_[0],
})
display(regression_coefficients)

# Score the model on the held-out test data. LogisticRegression.score returns
# the mean accuracy (share of correctly classified rows, between 0 and 1),
# not R2.
print('Wine white model score:\n', logistic_regression.score(wine_white_features_test, wine_white_labels_test))

(4884, 14)
(4884,)


Unnamed: 0,Feature,Coefficients
0,fixed acidity,-0.068953
1,volatile acidity,-0.002246
2,citric acid,-0.003756
3,residual sugar,-0.028306
4,chlorides,-0.000274
5,flavanoids,-0.00509
6,free sulfur dioxide,0.051101
7,total sulfur dioxide,-0.002172
8,density,-0.013249
9,pH,-0.043324


Wine white model score:
 0.4740791268758527


In [None]:
# Cross Validation - White Wine

In [10]:
# Imports for the cross validation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

# Cross-validated evaluation of a fresh, unfitted logistic model
lr_white = LogisticRegression()

# 5-fold cross validation: one accuracy value per fold
scores_white = cross_val_score(
    estimator=lr_white,
    X=wine_white_features,
    y=wine_white_labels,
    cv=5,
)
print(" Cross validation accuracy scores:", scores_white)

# Cross-validated prediction with k=5: every row is predicted by a model
# that was fitted on the folds not containing that row
y_pred_white = cross_val_predict(
    estimator=lr_white,
    X=wine_white_features,
    y=wine_white_labels,
    cv=5,
)

# Attach the predictions to the data frame and print the result
wine_white['Prediction'] = y_pred_white
print(wine_white)

Cross validation accuracy scores: [0.41453429 0.45547595 0.5087001  0.45752303 0.47336066]
        ID  fixed acidity  volatile acidity  citric acid  residual sugar  \
0        1            7.0              0.27         0.36            20.7   
1        2            6.3              0.30         0.34             1.6   
2        3            8.1              0.28         0.40             6.9   
3        4            7.2              0.23         0.32             8.5   
4        5            7.2              0.23         0.32             8.5   
...    ...            ...               ...          ...             ...   
4891  4894            6.2              0.21         0.29             1.6   
4892  4895            6.6              0.32         0.36             8.0   
4893  4896            6.5              0.24         0.19             1.2   
4894  4897            5.5              0.29         0.30             1.1   
4895  4898            6.0              0.21         0.38             0.8 