In [4]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras


In [None]:
layers = keras.layers

path = "./train.csv"
data = pd.read_csv(path)

# data = data.sample(frac=1)


In [None]:
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['points'])]
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500 # drop anything that occurs less than this
value_counts = data['variety'].value_counts()
items_to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(items_to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [8]:
train_size = int(len(data) * .8)


64856


In [11]:
# Train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

In [12]:
# Train labels
labels_train = data['points'][:train_size]

In [13]:
# Test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

In [14]:
# Test labels
labels_test = data['points'][train_size:]

In [15]:
# Create a tokenizer to preprocess our text descriptions
vocab_size = 12000 # This is a hyperparameter, experiment with different values for your dataset
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(description_train) # only fit on train

In [16]:
# Wide feature 1: sparse bag of words (bow) vocab_size vector 
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [17]:
# Wide feature 2: one-hot vector of variety categories

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

# Convert labels to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [18]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [20]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [26]:
# Deep model feature: word embeddings of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding="post")


In [27]:
deep_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [3]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)

combined_model.compile(loss='mse',
                       optimizer='sgd',
                       metrics=['accuracy'])

NameError: name 'layers' is not defined

In [30]:
# Run training
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a27c50208>

In [31]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size=128)



[3.0568897912369066, 0.2347970889612207]

In [32]:
# Generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [33]:
# Compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

Almost floral glints appear between citrus zest and green pear. This aromatic charm continues on the palate, which is slender and compact but buffered with yeasty generosity. This is immensely enjoyable and extremely balanced.
Predicted:  91.8679 Actual:  94 

Mostly a blend of Merlot (30%), Syrah (25%) and Cabernet Sauvignon (24%), this wine offers appealing, downright fruity aromas of red currant and raspberry jam. The lightly sweet cranberry and cherry flavors provide a lot of easy-drinking enjoyment.
Predicted:  87.147 Actual:  88 

Attractively fresh and herbaceous, this is a bright and fruity wine. It has flavors of red currant and grapefruit, plus a hint of gooseberry. Full and crisp, it's a wine to drink soon.
Predicted:  85.3879 Actual:  88 

An apple pie type of wine, meaning it's got baked apple, cinnamon and pastry aromas and flavors. The palate is kind of soft and sticky, and with little vital acidity it struggles to keep up. Standard warm-climate Chardonnay.
Predicted:  8

In [34]:
# Compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  1.36592903137
