In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow.python import keras
layers = keras.layers

# This code was tested with TensorFlow v1.7
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.7.0


In [43]:
# TODO: download the data here and copy it into your local directory: https://www.kaggle.com/zynicide/wine-reviews/data
data = pd.read_csv("winemag-data_first150k.csv")

In [44]:
# Shuffle the data
data = data.sample(frac=1)

# Print the first 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
49599,49599,Germany,"This is a relatively tame Scheurebe, but one t...",Trocken,87,25.0,Pfalz,,,Scheurebe,Weegmüller
47016,47016,Argentina,"Tough cherry and plum aromas lead it off, foll...",Los Cardos,84,10.0,Mendoza Province,Mendoza,,Syrah,Doña Paula
140691,140691,US,Give this Mendocino-grown Riesling a bit more ...,McFadden Farms Dry,86,20.0,California,Potter Valley,Mendocino/Lake Counties,Riesling,Dashe Cellars
25123,25123,Italy,Created to celebrate a 50th wedding anniversar...,Nozze d'Oro,88,30.0,Sicily & Sardinia,Contea di Sclafani,,White Blend,Tasca d'Almerita
94707,94707,US,The blend is classic Bordeaux—85% Cabernet Sau...,,92,50.0,Washington,Columbia Valley (WA),Columbia Valley,Cabernet Sauvignon,Pepper Bridge


In [45]:
# Do some preprocessing to limit the # of wine varities in the dataset
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1) 

variety_threshold = 500 # Anything that occurs less than this will be removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [46]:
# Split data into train and test
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 95646
Test size: 23912


In [47]:
# Train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# Train labels
labels_train = data['price'][:train_size]

# Test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# Test labels
labels_test = data['price'][train_size:]

In [48]:
# Create a tokenizer to preprocess our text descriptions
vocab_size = 100000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(description_train) # only fit on train

In [49]:
# Wide feature 1: sparse bag of words (bow) vocab_size vector 
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [50]:
# Wide feature 2: one-hot vector of variety categories

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

# Convert labels to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [51]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [52]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100000)       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 100040)       0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          25610496    concatenate_1[0][0]              
__________

In [53]:
# Deep model feature: word embeddings of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding="post")

In [54]:
# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            800000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1361      
Total params: 801,361
Trainable params: 801,361
Non-trainable params: 0
_________________________________________________________________
None


In [55]:
deep_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [56]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100000)       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 100040)       0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [58]:
# Run training (need to run this for at least 5 epochs to get good accuracy, should probably do this on the cloud)
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=256)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x13e197c18>

In [59]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size=256)



[710.8771138196009, 0.059677149558314595]

In [60]:
combined_model.save('wine_model_2_features_10_epochs.h5')

In [61]:
# Generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [63]:
# Compare predictions with actual values for the first few items in our test dataset
for i in range(15):
    val = predictions[i]
    print(description_test.iloc[i])
    print(val[0], 'Actual: ', labels_test.iloc[i], '\n')

Fragrant apple blossoms and honey perfume this deliciously fresh and fruity Sauvignon Blanc. The dry palate is full bodied with a shower of white peach flavors marked by a quirky lemongrass note and a bright lemon-lime acidity.
19.881979 Actual:  15.0 

The wine is built around flavors of crisp citrus, white peach and green apple fruits, wrapped in a pleasing minerality. It's all quite fresh and nicely textured. Just a bit of citrus rind enlivens the lengthy finish.
24.543623 Actual:  25.0 

A rich nose of clove, tea, mint and cedar start this red from Greece. The palate offers clean bursts of cedar, spice and red currant. Fresh and light, the wine is a good house red.
19.474958 Actual:  21.0 

Ultraripe tropical fruits—even including hints of bananas—dominate this wine's heady aromas. Acids are soft, and alcohol fairly elevated—12.5%—for a wine of this prädikat. There's ample concentration, and the peach and melon flavors linger on the finish. Drink now–2016.
53.24802 Actual:  50.0 

