In [209]:
import pandas as pd
import numpy as np
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [210]:
#train.shape - 82657,12
#test.shape   - 20665,11

In [211]:
#train.dropna(axis = 0)
#test.dropna(axis = 0)

# We will be using the review description further as it is the main feature on which variety of grape depends as it gives information about it.

In [212]:
for i in range(1,5):
    print(train['variety'].iloc[i])
    print(train['review_description'].iloc[i])
    print()

Red Blend
This wine is near equal parts Syrah and Merlot with the balance Cabernet Sauvignon. Aromas of blue fruit, vanilla, cherry and herb lead to full-bodied pit-fruit flavors that bring a sense of deliciousness that is hard to resist.

Nebbiolo
Barolo Conca opens with inky dark concentration and soothing aromas of black fruit, spice, cola, plum, prune and dried lavender buds. The nose presents those ethereal and delicate aromas, but in the mouth, it delivers thicker flavors of chocolate and mocha. Drink after 2018.

Bordeaux-style White Blend
It's impressive what a small addition of Sauvignon Gris and Muscadelle can do to a Sauvignon-Sémillon blend—it turns this into an exotic wine, with spice and lychee flavors.

Malbec
This ripe, sweet wine is rich and full of dried and fresh fruit flavors. It is spicy, with a touch of spirit on the palate as well as bold black-plum fruit, dense tannins and a sweet aftertaste. Drink from 2017.



In [213]:
import tensorflow as tf

In [214]:
#len(train['review_description'].min())-198
#len(train['review_description'].max())-338
#Keeping review description above 256

In [215]:
# length of dictionary
NUM_WORDS = 4000

# Length of each review
SEQ_LEN = 256

#create tokenizer for our data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS, oov_token='<UNK>')
tokenizer.fit_on_texts(train['review_description'])

#convert text data to numerical indexes
wine_seqs=tokenizer.texts_to_sequences(train['review_description'])

#pad data up to SEQ_LEN (note that we truncate if there are more than SEQ_LEN tokens)
wine_seqs=tf.keras.preprocessing.sequence.pad_sequences(wine_seqs, maxlen=SEQ_LEN, padding="post")

print(wine_seqs)
wine_seqs.shape

[[ 395  132   18 ...    0    0    0]
 [   7    9    8 ...    0    0    0]
 [1005    1  161 ...    0    0    0]
 ...
 [   7    8    4 ...    0    0    0]
 [  41  389    9 ...    0    0    0]
 [   4  179   40 ...    0    0    0]]


(82657, 256)

In [216]:
wine_labels=pd.DataFrame({'variety': train['variety']})
wine_labels=wine_labels.replace(' ', '_', regex=True)

wine_labels_list = []
for item in wine_labels['variety']:
    wine_labels_list.append(str(item))

label_tokenizer = tf.keras.preprocessing.text.Tokenizer(split=' ', filters='!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n')
label_tokenizer.fit_on_texts(wine_labels_list)

print(label_tokenizer.word_index)

wine_label_seq = np.array(label_tokenizer.texts_to_sequences(wine_labels_list))
wine_label_seq.shape

{'pinot_noir': 1, 'chardonnay': 2, 'cabernet_sauvignon': 3, 'red_blend': 4, 'bordeaux-style_red_blend': 5, 'riesling': 6, 'sauvignon_blanc': 7, 'syrah': 8, 'rosé': 9, 'merlot': 10, 'nebbiolo': 11, 'zinfandel': 12, 'sangiovese': 13, 'malbec': 14, 'portuguese_red': 15, 'white_blend': 16, 'sparkling_blend': 17, 'tempranillo': 18, 'rhône-style_red_blend': 19, 'pinot_gris': 20, 'champagne_blend': 21, 'cabernet_franc': 22, 'grüner_veltliner': 23, 'portuguese_white': 24, 'pinot_grigio': 25, 'bordeaux-style_white_blend': 26, 'gewürztraminer': 27, 'gamay': 28}


(82657, 1)

In [217]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(wine_seqs,
                                                    wine_label_seq,
                                                    test_size=0.20,
                                                    random_state=42)
print('X_train:{} y_train:{}'.format(X_train.shape,y_train.shape))

X_train:(66125, 256) y_train:(66125, 1)


# Now we will be using Keras Sequential Model which is a stacked layer of neural networks

In [218]:
EMBEDDING_SIZE = 256
EMBEDDING_SIZE_2 = 64
EMBEDDING_SIZE_3 = 29 #28+1

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
    tf.keras.layers.Conv1D(128,5,activation='relu'), 
    tf.keras.layers.GlobalMaxPooling1D(), 
    tf.keras.layers.Dense(EMBEDDING_SIZE_2, activation='relu'),
    # Add a Dense layer with additional units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(EMBEDDING_SIZE_3, activation='softmax')
])

model.summary()

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 256)         1024000   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 128)         163968    
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 128)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_21 (Dense)             (None, 29)                1885      
Total params: 1,198,109
Trainable params: 1,198,109
Non-trainable params: 0
_________________________________________________________________


In [219]:
num_epochs = 3
history = model.fit(X_train,y_train, epochs=num_epochs, validation_data=(X_val,y_val), verbose=2)
loss, accuracy = model.evaluate(X_val, y_val)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Epoch 1/3
2067/2067 - 441s - loss: 1.3897 - accuracy: 0.5837 - val_loss: 1.0490 - val_accuracy: 0.6786
Epoch 2/3
2067/2067 - 395s - loss: 0.8682 - accuracy: 0.7293 - val_loss: 0.9977 - val_accuracy: 0.6963
Epoch 3/3
2067/2067 - 340s - loss: 0.6392 - accuracy: 0.7993 - val_loss: 1.0566 - val_accuracy: 0.6939
Loss:  1.0566279888153076
Accuracy:  0.6938664317131042


In [220]:
new_review = [test['review_description'][12]]
encoded_sample_pred_text = tokenizer.texts_to_sequences(new_review)
encoded_sample_pred_text = tf.keras.preprocessing.sequence.pad_sequences(encoded_sample_pred_text, maxlen=SEQ_LEN, padding="post")
predictions = model.predict(encoded_sample_pred_text)

In [221]:
reverse_label_index = dict([(value, key) for (key, value) in label_tokenizer.word_index.items()])

def decode_label(text):
    return ' '.join([reverse_label_index.get(i, 'Unknown') for i in text])
for n in reversed((np.argsort(predictions))[0]):
    predicted_id = [n]
    print("Guess: %s \n Probability: %f" %(decode_label(predicted_id).replace('_', ' '), 100*predictions[0][predicted_id][0]) + '%')
    break

Guess: rosé 
 Probability: 89.178485%


In [222]:
X_test = test['review_description']
encoded_sample_pred_text = tokenizer.texts_to_sequences(X_test)
encoded_sample_pred_text = tf.keras.preprocessing.sequence.pad_sequences(encoded_sample_pred_text, maxlen=SEQ_LEN, padding="post")
y_pred = model.predict(encoded_sample_pred_text)

In [223]:
y = []
for n in reversed((np.argsort(y_pred))[0]):
    predicted_id = [n]
    y.append(decode_label(predicted_id).replace('_',' '))
y_pred = np.array(y)

In [224]:
y_pred

array(['pinot noir', 'sangiovese', 'gamay', 'cabernet franc', 'red blend',
       'nebbiolo', 'pinot gris', 'zinfandel', 'bordeaux-style red blend',
       'merlot', 'rosé', 'tempranillo', 'sparkling blend',
       'rhône-style red blend', 'champagne blend', 'malbec',
       'portuguese red', 'cabernet sauvignon', 'syrah', 'pinot grigio',
       'gewürztraminer', 'white blend', 'chardonnay', 'riesling',
       'sauvignon blanc', 'grüner veltliner', 'portuguese white',
       'bordeaux-style white blend', 'Unknown'], dtype='<U26')

In [225]:
df = pd.DataFrame(y_pred)
df.columns=['variety']

pred = pd.concat([test, df], axis = 1)

pred.index=pred.user_name
pred=pred[['variety']]

pred.to_csv('prediction.csv')
