# Setup

### import libraries
keras/tensorflow for machine learning, spaCy for POS tagging,
as well as a helper function to make things a little easier down the road

In [None]:
import keras
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import spacy
# spaCy's small English pipeline; used further down for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")
# Universal Sentence Encoder (v4) loaded from TF Hub; called with a list of
# strings to obtain their embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
def encode(strng):
    """Embed a single string and return its embedding as a NumPy array.

    The string is wrapped in a one-element batch for the sentence encoder
    and the first (only) row of the result is returned.
    """
    batch = embed([strng])
    return np.array(batch[0])

### load data
you can find the csv here: https://www.kaggle.com/zynicide/wine-reviews?select=winemag-data_first150k.csv

generate dense representations of the wine data and reviews

In [None]:
import pandas as pd
# Parse the CSV once and pull every needed column from the same frame
# (previously the file was re-read from disk for each column).
wine_data = pd.read_csv("winemag-data_first150k.csv")
reviews = wine_data['description']
revcopy = reviews  # alias used later when printing the matched review
c = wine_data['country']
d = wine_data['designation']
b = wine_data['province']
r = wine_data['region_1']
v = wine_data['variety']
w = wine_data['winery']
p = wine_data['price']

In [None]:
# X holds the embedding of each wine's metadata text, y the embedding of the
# matching review; both lists are filled row by row in file order.
X, y = [], []
for cty, des, reg, var, win, prc, prov, review in zip(c, d, r, v, w, p, b, reviews):
    # Field order: country, designation, region, variety, winery, price, province.
    wine_text = " ".join(
        [str(cty), str(des), str(reg), str(var), str(win) + " Price: $" + str(prc), str(prov)]
    )
    X.append(encode(wine_text))
    y.append(encode(review))

### build and train MLP
A multi-layer perceptron (MLP) predicts the dense representation of a review from the data about the wine.

In [None]:
# Regression MLP: maps the embedding of the wine metadata to a predicted
# review embedding.
#
# Hidden layers use ReLU. Dense layers default to a linear activation, and a
# stack of purely linear layers collapses into a single linear map, so
# without a non-linearity the extra depth adds no modelling capacity.
model = keras.models.Sequential()
for units in (512, 1024, 2048, 1024):
    model.add(keras.layers.Dense(units, activation="relu"))
    model.add(keras.layers.Dropout(0.2))
# The output layer stays linear because the target embeddings are
# real-valued (they may be negative).
model.add(keras.layers.Dense(512))
# "accuracy" is a classification metric and carries no meaning for MSE
# regression on continuous vectors; report mean absolute error instead.
model.compile("adam", "mean_squared_error", metrics=["mae"])

In [None]:
# Fit the MLP on the metadata embeddings (inputs) against the review
# embeddings (targets), holding out 20% of the rows for validation.
model.fit(
    x=np.array(X),
    y=np.array(y),
    batch_size=64,
    epochs=2,
    validation_split=0.2,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f0a022295c0>

In [None]:
def compute_distance(target, non_target):
    """Return the Euclidean (L2) distance between two vectors."""
    return np.linalg.norm(target - non_target)


def best_indexes(string, num=10, embedding=None):
    """Return indexes of the embeddings closest to the text ``string``.

    The text is embedded once with ``encode`` and the lookup is delegated to
    ``best_raw``; see that function for the meaning of ``num``, ``embedding``
    and the return value.
    """
    return best_raw(encode(string), num=num, embedding=embedding)


def best_raw(string, num=10, embedding=None):
    """Return indexes of the embeddings closest to an already-embedded query.

    Args:
        string: The query vector (despite the name, this is an embedding,
            not text).
        num: Maximum number of indexes to return.
        embedding: Sequence of candidate vectors. When omitted, the
            module-level ``y`` is looked up at call time, so a rebuilt ``y``
            is picked up instead of a stale definition-time binding.

    Returns:
        A list of at most ``num`` integer indexes into ``embedding``, ordered
        from the smallest distance upward. Candidates at exactly the same
        distance are reported once, by their first index. An empty
        ``embedding`` yields an empty list.
    """
    if embedding is None:
        embedding = y
    if len(embedding) == 0:
        return []
    query = np.asarray(string)
    distances = np.array([compute_distance(query, emb) for emb in embedding])
    # np.unique returns the distinct distances in ascending order together
    # with the index of each one's first occurrence. Round-tripping through
    # set() would discard the ordering.
    _, first_seen = np.unique(distances, return_index=True)
    return first_seen[:num].tolist()

# tagging a wine
country - The country that the wine is from.

province - The province or state that the wine is from.

region - The wine growing area in a province or state (e.g., Napa, Willamette Valley).

winery - The winery that made the wine.

designation - The vineyard within the winery where the grapes that made the wine are from.

variety - The type of grapes used to make the wine (e.g., Pinot Noir).

price - The cost for a bottle of the wine.





In [None]:
#@title Create Tags
country = "France" #@param {type:"string"}
province = "Provence" #@param {type:"string"}
region = "Bandol" #@param {type:"string"}
winery = "Domaine de la B\xE9gude" #@param {type:"string"}
designation = "La Br\xFBlade" #@param {type:"string"}
variety = "Provence red blend" #@param {type:"string"}
price = "66.0" #@param {type:"string"}

# Interactive alternative to the form fields above (uncomment to be prompted):
# country = input("The country that the wine is from (Blank for Unsure): ")
# designation = input("The vineyard within the winery where the grapes that made the wine are from (Blank for Unsure): ")
# province = input("The province or state that the wine is from (Blank for Unsure): ")
# region = input("The wine growing area in a province or state (ie Napa, Willamette Valley)(Blank for Unsure): ")
# variety = input("The type of grapes used to make the wine (ie Pinot Noir)(Blank for Unsure): ")
# winery = input("The winery that made the wine (Blank for Unsure): ")
# price = input("The cost for a bottle of the wine (Blank for Unsure): ")

# The fields are joined into a single string in the same order as the
# training inputs (country, designation, region, variety, winery, price,
# province), embedded as one 512-dimensional vector and passed through the
# MLP. The predicted vector is then compared with the stored review
# embeddings; `matches` holds the indexes of the closest real reviews, from
# which the adjectives are later taken as tags.
input_data = " ".join(
    [country, designation, region, variety, winery + " Price: $" + price, province]
)
input_vect = encode(input_data)
matches = best_raw(model.predict(np.array([input_vect]))[0])

print("\n", "#"*30, "\n")
print("Best Match: ")
# print(revcopy[matches[0]]) # this will just print out the whole review referenced

# Collect the adjectives of the closest matching review, lower-cased and in
# order of first appearance, to use as tags.
tags = []
doc = nlp(revcopy[matches[0]])
for token in doc:
    if token.pos_ != "ADJ":
        continue
    # Lower-case before the membership test so that case variants of the same
    # word ("Red" / "red") are treated as one tag; checking the raw text while
    # storing the lower-cased form let duplicates through.
    tag = str(token.text).lower()
    if tag not in tags:
        tags.append(tag)
print(', '.join(tags))
print("\n", "#"*30, "\n")

# the real review for this wine was:
# """This is the top wine from La Bégude, named after the highest point in the vineyard at 1200 feet. It has structure, density and considerable acidity that is still calming down. With 18 months in wood, the wine has developing an extra richness and concentration. Produced by the Tari family, formerly of Château Giscours in Margaux, it is a wine made for aging. Drink from 2020."""


 ############################## 

Best Match: 
second, ripe, broad, full, bright, red, fresh, gentle

 ############################## 

