In [1]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras


In [2]:
layers = keras.layers

path = "./train.csv"
data = pd.read_csv(path)


In [3]:
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['points'])]
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500 # drop anything that occurs less than this
value_counts = data['variety'].value_counts()
items_to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(items_to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [4]:
train_size = int(len(data) * .8)


In [5]:
# Train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

In [6]:
# Train labels
labels_train = data['points'][:train_size]

In [7]:
# Test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

In [8]:
# Test labels
labels_test = data['points'][train_size:]

In [9]:
# Create a tokenizer to preprocess our text descriptions
vocab_size = 12000 # This is a hyperparameter, experiment with different values for your dataset
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(description_train) # only fit on train

In [10]:
# Wide feature 1: sparse bag of words (bow) vocab_size vector 
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [11]:
# Wide feature 2: one-hot vector of variety categories

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

# Convert labels to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [12]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [13]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [14]:
# Deep model feature: word embeddings of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding="post")


In [15]:
# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)

In [16]:
deep_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [17]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)

combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [18]:
# Run training
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a1ccfccf8>

In [19]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size=128)



[3.0197736104407853, 0.23461206362480386]

In [20]:
# Generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [21]:
# Compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

With 50% of Arinto in the blend, this is rounder and richer in taste than many Vinhos Verdes. It does also have a crisp edge that comes from the citrus tang. The general effect is bright while full in the mouth from the delicious fruit. Drink now.
Predicted:  88.2601 Actual:  86 

Aromas of green wood, herb and cherry lead to ripe but chalky feeling fruit flavors. There's some interesting things going on but a sense of astringency and bitterness proves distracting.
Predicted:  84.7411 Actual:  86 

From vines that are at least 25 years old, this wine is rounded and soft. It has red fruit concentration, a lively wine that is also full in the mouth. It is a wine that could age a few more months and will be better from late 2016.
Predicted:  87.2306 Actual:  88 

Aromas of butterscotch, spice and red fruit are followed by fleshy fruit flavors. The palate shows a pleasing sense of balance, though the tannins bring a slight grit.
Predicted:  86.63 Actual:  89 

This is a smooth and soft win

In [22]:
# Compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  1.21240730286
