In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [3]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df1 = pd.read_csv("winequality-white.csv", sep=';')
df2 = pd.read_csv("winequality-red.csv", sep=';')
combinedf = pd.concat([df1,df2])
combinedf = combinedf.dropna(axis='columns', how='all')
# Drop the null rows
combinedf = combinedf.dropna()
combinedf.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
combinedf = combinedf.loc[\
(combinedf["quality"] < 8 ) & \
(combinedf["quality"] > 4 )  \
] 

In [6]:
combinedf.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079


# Select your features (columns)

In [7]:
# Set features. Drop koi disposition, y value. 
X = combinedf.drop(columns='quality')
#X = df.drop(columns='quality')
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


# Create a Train Test Split

Use `quality` for the y values

In [8]:
from sklearn.model_selection import train_test_split
#80% train, 20% test. y is this one column
#y = df['quality']
y = combinedf['quality']
#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.1)

In [9]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1530,6.6,0.44,0.09,2.2,0.063,9.0,18.0,0.99444,3.42,0.69,11.3
1064,8.2,0.74,0.09,2.0,0.067,5.0,10.0,0.99418,3.28,0.57,11.8
3196,6.4,0.22,0.38,9.1,0.044,35.0,127.0,0.99326,2.97,0.3,11.0
4736,6.0,0.16,0.36,1.6,0.042,13.0,61.0,0.99143,3.22,0.54,10.8
860,7.2,0.62,0.06,2.7,0.077,15.0,85.0,0.99746,3.51,0.54,9.5


# Pre-processing

Scale the data using LabelEncoder and MinMaxScaler

In [10]:
# Scale your data - see Activity 03/05
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test) 

In [11]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [12]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [13]:
y_train_categorical.shape

(5447, 3)

# Train the Model

In [14]:
# first, create a normal neural network with 11 inputs, 100 hidden nodes, and 3 outputs
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=11))
model.add(Dense(units=100, activation='relu'))
# model.add(Dense(units=1000, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [15]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               1200      
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 303       
Total params: 11,603
Trainable params: 11,603
Non-trainable params: 0
_________________________________________________________________


In [17]:
# set early stopping as callback
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    callbacks=callbacks,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
 - 0s - loss: 0.9408 - accuracy: 0.5238
Epoch 2/60




 - 0s - loss: 0.8733 - accuracy: 0.5669
Epoch 3/60
 - 0s - loss: 0.8617 - accuracy: 0.5750
Epoch 4/60
 - 0s - loss: 0.8555 - accuracy: 0.5761
Epoch 5/60
 - 0s - loss: 0.8480 - accuracy: 0.5869
Epoch 6/60
 - 0s - loss: 0.8478 - accuracy: 0.5853
Epoch 7/60
 - 0s - loss: 0.8387 - accuracy: 0.5908
Epoch 8/60
 - 0s - loss: 0.8348 - accuracy: 0.5954
Epoch 9/60
 - 0s - loss: 0.8342 - accuracy: 0.5934
Epoch 10/60
 - 0s - loss: 0.8291 - accuracy: 0.5968
Epoch 11/60
 - 0s - loss: 0.8244 - accuracy: 0.5987
Epoch 12/60
 - 0s - loss: 0.8229 - accuracy: 0.6003
Epoch 13/60
 - 0s - loss: 0.8200 - accuracy: 0.5998
Epoch 14/60
 - 0s - loss: 0.8191 - accuracy: 0.6009
Epoch 15/60
 - 0s - loss: 0.8140 - accuracy: 0.6042
Epoch 16/60
 - 0s - loss: 0.8141 - accuracy: 0.6102
Epoch 17/60
 - 0s - loss: 0.8102 - accuracy: 0.6117
Epoch 18/60
 - 0s - loss: 0.8056 - accuracy: 0.6180
Epoch 19/60
 - 0s - loss: 0.8060 - accuracy: 0.6170
Epoch 20/60
 - 0s - loss: 0.8070 - accuracy: 0.6112
Epoch 21/60
 - 0s - loss: 0.801

<keras.callbacks.callbacks.History at 0x2a846a2b148>

In [18]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.8394173674850968, Accuracy: 0.6006600856781006


In [19]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [20]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: [6 5 5 6 5]
Actual Labels: [7, 6, 5, 7, 6]


# RESULTS

quality = 5,6,7

test size = .1

model.add(Dense(units=100, activation='relu', input_dim=11))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

Normal Neural Network - Loss: 0.816428262801847, Accuracy: 0.6056105494499207

In [21]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
# import joblib
# filename = 'redwinedeeplearning.sav'
# joblib.dump(model, filename)