In [34]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
'DATA LOADING'
# Read the red-wine measurements from the CSV that sits next to this notebook.
red_csv_path = Path('./winequality-red.csv')
df_raw_red = pd.read_csv(red_csv_path)
# Preview the first three rows to confirm the file parsed as expected.
df_raw_red.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [3]:
# Read the white-wine measurements from the CSV that sits next to this notebook.
white_csv_path = Path('./winequality-white.csv')
df_raw_white = pd.read_csv(white_csv_path)
# Preview the first three rows to confirm the file parsed as expected.
df_raw_white.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


In [4]:
'DATA CLEANING'
# Number of rows in the raw red-wine frame.
len(df_raw_red.index)

1599

In [5]:
# Number of rows in the raw white-wine frame.
len(df_raw_white.index)

4898

In [105]:
# Tag every red-wine row with a 'type' column set to 0 so the wine colour
# is still identifiable after the red and white frames are combined.
df_raw_red['type'] = 0
# Show the first two rows to confirm the new column is present.
df_raw_red.head(n=2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0


In [106]:
# Tag every white-wine row with a 'type' column set to 1 so the wine colour
# is still identifiable after the red and white frames are combined.
df_raw_white['type'] = 1
# Show the first two rows to confirm the new column is present.
df_raw_white.head(n=2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1


In [107]:
# Stack the red and white frames into a single dataset (red rows first).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat is used instead. ignore_index=True gives the combined frame a
# fresh 0..n-1 index rather than repeating each source frame's row labels.
df_combo_raw = pd.concat([df_raw_red, df_raw_white], ignore_index=True)

In [108]:
# Eyeball the first five rows of the combined frame.
df_combo_raw.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [109]:
# Inspect each column's dtype to confirm that every column is numeric
# (float64 / int64) before any scaling or modelling is attempted.
df_combo_raw.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
type                      int64
dtype: object

In [110]:
# Count the missing entries in each column; all zeros means nothing needs imputing.
count_nan = df_combo_raw.isna().sum()
print(count_nan)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
type                    0
dtype: int64


In [111]:
# Distribution of the target: how many wines received each quality score.
# Every score lies between 3 and 9.
rows_per_quality = df_combo_raw.groupby('quality').count()
rows_per_quality.head(20)

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193
9,5,5,5,5,5,5,5,5,5,5,5,5


In [112]:
# The raw score takes seven distinct values (3-9), several of them very rare,
# so collapse it into a binary target:
#   1 = high quality (score 7, 8 or 9)
#   0 = low quality  (score 3, 4, 5 or 6)
HIGH_QUALITY_MIN_SCORE = 7

# Work on a copy so the raw combined frame stays untouched.
df_copy = df_combo_raw.copy()

# Derive the label with a single comparison against the *raw* scores instead
# of two sequential in-place overwrites, whose correctness depended on the new
# labels (0/1) never matching the second mask. to_numpy() assigns by position,
# so the result does not depend on the frame's row labels being unique.
df_copy['quality'] = (
    df_combo_raw['quality'].to_numpy() >= HIGH_QUALITY_MIN_SCORE
).astype('int64')

df_copy.groupby(['quality']).count().head(20)


Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,5220,5220,5220,5220,5220,5220,5220,5220,5220,5220,5220,5220
1,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277


In [113]:
# Give the cleaned frame a more descriptive name for the modelling steps.
# This binds a second name to the same object; it does not make a copy.
df_combo_cleaned = df_copy

In [114]:
# Standardise every feature column (all columns except the 'quality' target)
# to zero mean and unit variance. The result is a plain NumPy array.
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split, so statistics from rows that are later held out influence these values.
feature_frame = df_combo_cleaned.drop(columns="quality")
data_scaler = StandardScaler()
df_combo_scaled = data_scaler.fit(feature_frame).transform(feature_frame)
df_combo_scaled

array([[ 0.14247327,  2.18883292, -2.19283252, ...,  0.19309677,
        -0.91546416, -1.75018984],
       [ 0.45103572,  3.28223494, -2.19283252, ...,  0.99957862,
        -0.58006813, -1.75018984],
       [ 0.45103572,  2.55330026, -1.91755268, ...,  0.79795816,
        -0.58006813, -1.75018984],
       ...,
       [-0.55179227, -0.6054167 , -0.88525328, ..., -0.47897144,
        -0.91546416,  0.57136659],
       [-1.32319841, -0.30169391, -0.12823371, ..., -1.016626  ,
         1.9354021 ,  0.57136659],
       [-0.93749534, -0.78765037,  0.42232597, ..., -1.41986693,
         1.09691202,  0.57136659]])

In [115]:
# Check that the data is scaled. numpy (np) is imported in the notebook's
# top import cell rather than here.
# Print the first column's mean and standard deviation as a quick visual check.
print(np.mean(df_combo_scaled[:,0]))
print(np.std(df_combo_scaled[:,0]))

# Verify every column, not just the first one: after standardisation each
# column should have mean ~0 and standard deviation ~1.
assert np.allclose(df_combo_scaled.mean(axis=0), 0.0), "some column means are not ~0"
assert np.allclose(df_combo_scaled.std(axis=0), 1.0), "some column stds are not ~1"

-3.8496389562498884e-16
1.0


In [116]:
# Section banner. As the cell's only expression, the string is echoed as the cell output.
'MODEL CREATION'

'MODEL CREATION'

In [117]:
# Separate the modelling inputs:
#   X - the standardised feature array
#   y - the binary quality label (the prediction target)
X = df_combo_scaled
y = df_combo_cleaned.loc[:, "quality"]

In [118]:
# Split into training and test sets (this cell splits; it does not train).
# stratify=y keeps the high/low quality ratio the same in both splits and
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

# X was standardised with statistics computed over every row, test rows
# included, which leaks test-set information into training. Re-standardise
# with a scaler fitted on the training rows only and apply that same transform
# to the test rows. Standardisation is a per-column affine transform, so this
# gives the same values as scaling the raw features with train-only statistics.
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

In [119]:
# Count the labels in the training split to see how imbalanced the two
# classes are. Counter is imported in the notebook's top import cell.
Counter(y_train)

Counter({0: 3914, 1: 958})

In [120]:
# (rows, feature columns) of the training matrix.
X_train.shape

(4872, 12)

In [121]:
# Logistic-regression classifier using the L-BFGS solver, with the iteration
# cap raised to 1000 and a fixed random_state.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=1,
)

In [122]:
# Fit the classifier on the training split only.
classifier.fit(X=X_train, y=y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [123]:
# Predict the quality label for every row of the held-out test split.
y_pred = classifier.predict(X=X_test)


In [124]:
# Overall accuracy on the test split. accuracy_score is imported in the
# notebook's top import cell.
# Accuracy alone flatters this model because the classes are imbalanced:
# always predicting the majority class (0) would already be right for most
# rows, so read it alongside the per-class precision and recall.
print(accuracy_score(y_test, y_pred))

0.8141538461538461


In [125]:
# Confusion matrix for the test split. confusion_matrix (and
# classification_report, used further down) are imported in the notebook's
# top import cell.
# Rows are the true classes and columns the predicted classes, in label order 0, 1.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1239   67]
 [ 235   84]]


In [126]:
# Per-class precision, recall and F1 on the test split, plus averages.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1306
           1       0.56      0.26      0.36       319

    accuracy                           0.81      1625
   macro avg       0.70      0.61      0.62      1625
weighted avg       0.78      0.81      0.79      1625

