In [1]:
import numpy as np
import tensorflow as tf
import random as python_random

In [2]:
np.random.seed(1)
python_random.seed(12)
tf.random.set_seed(123)

In [3]:
# importing useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import gzip
import dill

import time 

import warnings
warnings.filterwarnings('ignore')

# importing ml libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# import nlp libraries
import spacy
from nltk.tokenize import sent_tokenize 
from gensim.models import Word2Vec 

# importing deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, GlobalMaxPool1D, Conv1D, Dropout, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

### Downloading the data from our github repository

In [4]:
url = 'https://raw.github.com/HamoyeHQ/stage-f-06-wine-tasting/master/data/top_20_varieties_df.zip'

In [5]:
df = pd.read_csv(url)

In [6]:
corpus = list(df['corpus'])

In [7]:
labels = list(df['variety'])

## Here is our workflow. It tells the whole story. (source: our github repo)

![workflow_image](https://raw.github.com/HamoyeHQ/stage-f-06-wine-tasting/master/images/final_model_workflow.png)

# 1.0 Data Preprocessing

In [8]:
# creating a spacy pipeline and disabling tagger, parser and ner to speed up tokenizer
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner']) 

In [9]:
spacy_stop_words = spacy.lang.en.STOP_WORDS # getting spacy's stop-words

In [10]:
# downloading yoast_stop_words to be included to spacy's stop words to improve performance
response = requests.get('https://raw.github.com/Yoast/YoastSEO.js/develop/src/config/stopwords.js')
yoast_stop_words = response.content.decode()

In [11]:
yoast_stop_words

'/** @module config/stopwords */\n\n/**\n * Returns an array with stopwords to be used by the analyzer.\n *\n * @returns {Array} stopwords The array filled with stopwords.\n */\nexport default function() {\n\treturn [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he\'d", "he\'ll", "he\'s", "her", "here", "here\'s", "hers", "herself", "him", "himself", "his", "how", "how\'s", "i", "i\'d", "i\'ll", "i\'m", "i\'ve", "if", "in", "into", "is", "it", "it\'s", "its", "itself", "let\'s", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she\'d", "she\'ll", "she\'s", "should", "so", "some", "such", "than", "t

In [12]:
pattern = r'\[.+\]'
match = re.search(pattern, yoast_stop_words)
yoast_stop_words = set(match.group()[1:-1].replace('"', '').replace(',', '').split())
print(f'length of yoast_stop_words is {len(yoast_stop_words)}\n')
print(yoast_stop_words)

length of yoast_stop_words is 153

{'am', 'a', 'where', 'me', 'with', 'are', 'it', "it's", "you've", 'as', "she'll", "they'd", 'our', 'yourselves', 'because', "we've", "they've", "she'd", 'or', 'he', 'who', "you'll", "let's", "how's", 'both', "we'll", 'had', "i'll", 'nor', 'on', 'is', 'that', 'after', 'then', 'were', 'further', 'before', 'its', 'hers', 'have', 'of', 'my', 'how', "why's", 'being', 'out', 'all', 'whom', 'for', 'did', "i'd", 'herself', 'this', 'should', 'yourself', 'most', 'his', 'their', "you're", 'an', "he'll", 'few', 'if', 'during', 'why', 'these', "they'll", 'some', "when's", 'very', 'be', 'your', "there's", 'would', "he'd", 'theirs', 'down', 'too', "that's", 'but', 'until', 'themselves', 'from', "he's", 'them', "she's", 'same', 'which', 'than', 'to', 'doing', 'against', "what's", 'her', 'once', 'i', 'other', 'in', "we're", 'by', 'again', 'and', 'own', 'was', "where's", "you'd", 'does', 'myself', 'each', 'here', 'only', 'over', 'the', 'when', 'into', 'about', 'do', 'o

In [13]:
stop_words_lemma = {word.lemma_.lower() for word in nlp(' '.join(spacy_stop_words | yoast_stop_words))} | \
{'-pron-', '10', '12', 'aah', 'aa', 'ab', 'aaa', 'aand', '16', '2', '20', '30', '4', '40', '5', '6', '7', \
 '8', '9'}

## Creating custom transformers to encapsulate our data preprocessing

In [14]:
class GetTokens(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words=stop_words_lemma):
        self.stop_words = stop_words
    
    # defining tokenzer function to tokenize the lower case lemma of documents in a corpus and 
    # filter out stop-words  
    def tokenize(self, text):
        return [word.lemma_.lower() for word in nlp(text) if word.is_alpha and word.lemma_.lower() \
                not in self.stop_words]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.tokens = [self.tokenize(doc) for doc in X]
            
        return self.tokens

In [15]:
tokens = GetTokens()

In [16]:
class Text2Sequence(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.sequence_tokenizer = Tokenizer(oov_token='<oov>')

    def fit(self, X, y=None):
        self.sequence_tokenizer.fit_on_texts(X)
        self.words_indices = self.sequence_tokenizer.word_index
        return self
    
    def transform(self, X):
        self.get_sequences = self.sequence_tokenizer.texts_to_sequences(X)
        return self.get_sequences

In [17]:
text_2_seq = Text2Sequence()

In [18]:
class Padding(BaseEstimator, TransformerMixin):
    def __init__(self, pad='post'):
        self.pad = pad
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.get_paddings = pad_sequences(X, padding=self.pad)
        return self.get_paddings

In [19]:
pad = Padding()

In [20]:
data_prep_pipe = Pipeline([('get_tokens', tokens), ('text_2_sequence', text_2_seq), ('padding', pad)], verbose=1)

In [21]:
le = LabelEncoder()
one_hot = OneHotEncoder(sparse=False) 
        
encoded_labels = le.fit_transform(labels)
one_hot_labels = one_hot.fit_transform(encoded_labels.reshape(-1, 1))

In [22]:
X_prep = data_prep_pipe.fit_transform(corpus) # getting processed corpus

[Pipeline] ........ (step 1 of 3) Processing get_tokens, total= 1.1min
[Pipeline] ... (step 2 of 3) Processing text_2_sequence, total=   6.3s
[Pipeline] ........... (step 3 of 3) Processing padding, total=   1.7s


In [23]:
# saving data preparation pipeline
with gzip.open("wine_tasting_data_prep.dill.gz", "wb") as f:
    dill.dump(data_prep_pipe, f, recurse=True)

# 2.0 Training

In [24]:
# defining a function to return the emdedding matrix of our word2vec
def get_embedding_matrix(model, word_index):
    vocab_size = len(word_index) + 1
    embedding_dim = model.wv.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word in model.wv.vocab:
        ind = word_index[word]
        embedding_matrix[ind] = model[word]
        
    return embedding_matrix

In [25]:
def build_cnn_model(embedding_matrix, input_length):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], 
                           weights=[embedding_matrix], 
                           input_length=input_length,
                           mask_zero=True,
                           trainable=False))
    
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(Conv1D(128, 3, activation='relu'))
    
    model.add(GlobalMaxPool1D())
    
    model.add(Dropout(0.2))
    
    model.add(Dense(20, activation='softmax'))

    model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy')

    return model

In [26]:
def build_lstm_model(embedding_matrix, input_length):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], 
                           weights=[embedding_matrix], 
                           input_length=input_length,
                           mask_zero=True,
                           trainable=False))
    
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(300)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    return model

In [27]:
# setting class weights due to class imbalance
class_weights = class_weight.compute_class_weight('balanced', np.arange(20), encoded_labels)
class_weights = dict(enumerate(class_weights))

### Disguising CNN and LSTM models as transformers

In [28]:
class NLPModel(BaseEstimator, TransformerMixin):
    def __init__(self, build_fn, name, epochs=7, batch_size=128, verbose=0):
        self.build_fn = build_fn
        self.name = name
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        
    def fit(self, X, y):
        self.corpus = data_prep_pipe.named_steps['get_tokens'].tokens
        
        t1 = time.time()
        self.w2v_model = Word2Vec(self.corpus, size=300, min_count=1, iter=10)
                
        print('Done training Word2Vec for {}                    total: {}mins'.format(self.name, \
                                                                          round((time.time()-t1)/60, 1)))
        
        self.embedding_matrix = get_embedding_matrix(self.w2v_model, \
                                                     data_prep_pipe.named_steps['text_2_sequence'].words_indices)
        
        self.model = self.build_fn(self.embedding_matrix, X.shape[1])
        
        t2 = time.time()
        
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, \
                                      class_weight=class_weights, verbose=self.verbose)
        
        print('Done training {} model                           total: {}mins'.format(self.name, \
                                                                          round((time.time()-t2)/60, 1)))
        
        return self
    
    def transform(self, X):
        self.pred = self.model.predict(X)
        return self.pred   
        

In [29]:
cnn_model = NLPModel(build_cnn_model, 'cnn_model', epochs=7) # instatiating cnn model object

In [30]:
lstm_model = NLPModel(build_lstm_model, 'lstm_model', epochs=10) # instatiating cnn model object

In [31]:
union = FeatureUnion([('cnn_model', cnn_model), ('lstm_model', lstm_model)]) # union of cnn and lstm

In [32]:
# building a MLP model as our blender
def build_blender():
    model = Sequential()

    model.add(Dense(30, activation='relu')) # this layer will find the best combination of CNN and LSTM
    model.add(Dense(20, activation='softmax')) 
    
    model.compile(loss='sparse_categorical_crossentropy',optimizer=Adam(1e-2))

    return model

In [33]:
# making the blender a scikit-learn classifier
class BlenderModel(BaseEstimator, ClassifierMixin):
    def __init__(self, build_fn, epochs=9, batch_size=128, verbose=0):
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        
    def fit(self, X, y):
        
        self.model = self.build_fn()          
        self.model.fit(X, y, epochs=self.epochs, \
                                         batch_size=self.batch_size, verbose=self.verbose)
        
        return self
        
    def predict(self, X):
        self.ypred = self.model.predict_proba(X)
        
        return self.ypred

In [34]:
blender = BlenderModel(build_blender)

In [35]:
model_pipe = Pipeline([('union', union), ('blender', blender)], verbose=1)

In [36]:
model_pipe.fit(X_prep, encoded_labels);

Done training Word2Vec for cnn_model                    total: 1.4mins
Done training cnn_model model                           total: 0.9mins
Done training Word2Vec for lstm_model                    total: 1.4mins
Done training lstm_model model                           total: 5.5mins
[Pipeline] ............. (step 1 of 2) Processing union, total= 9.9min
[Pipeline] ........... (step 2 of 2) Processing blender, total=  13.5s


In [37]:
# saving embedding matrix
with gzip.open('embedding_matrix.dill.gz', 'wb') as emb:
    dill.dump(cnn_model.embedding_matrix, emb) 

In [38]:
cnn_model.model.save_weights('cnn_model.hdf5') # saving cnn_model's weights

In [39]:
lstm_model.model.save_weights('lstm_model.hdf5') # saving lstm_model's weights

In [40]:
blender.model.save('blender.hdf5') # saving blender model

# 3.0 Prediction

In [41]:
def get_prediction(inp):
    input_length = 87
    with gzip.open("wine_tasting_data_prep.dill.gz", "rb") as prep:
        d_prep = dill.load(prep)
        
    with gzip.open("embedding_matrix.dill.gz", "rb") as emb:
        emb_m = dill.load(emb)
    
    cnn = build_cnn_model(emb_m, input_length)
    cnn.load_weights('cnn_model.hdf5')
    
    lstm = build_lstm_model(emb_m, input_length)
    lstm.load_weights('lstm_model.hdf5')
    
    blender = tf.keras.models.load_model('blender.hdf5')
    
    X_prep = d_prep.transform([inp])
    pred1 = cnn.predict(X_prep)
    pred2 = lstm.predict(X_prep)
    
    final_pred = blender.predict(np.concatenate((pred1, pred2), axis=1))
    
    top_5_pred = np.argsort(final_pred[0])[-1:-6:-1]
    
    print("The wine's variety is most likely {} with a probability of {}%".format(\
                                            le.inverse_transform(\
                                            [top_5_pred[0]])[0], round(final_pred[0][top_5_pred[0]]*100, 3)))
        
    print('\nOther possible varieties are:\n')

    for i in range(1, len(top_5_pred)):
        print('{} ==> {}%'.format(le.inverse_transform(\
                                [top_5_pred[i]])[0], round(final_pred[0][top_5_pred[i]]*100, 3)))

### Let's do a demo prediction of a majority class'Pinot Noir' wine variety from a recent review (published 12/1/2020) not in our training set from [wine ethusiast](https://www.winemag.com/buying-guide/albert-bichot-2018-domaine-du-pavilion-clos-des-marechaudes-grand-cru-corton/). As we know, adding info like country, designation, province, region1, region2, and winery as the last sentence of our wine description boosted our model's performance during training. We therefore advise users to input them also for best performance.

In [42]:
user_input = "A hint of woodsmoke combines with this wine's shy aromas of black cherry. The palate comes in \
with surprising grace and svelteness, presenting a filigreed but firm structure. Cherry aromas shimmering with \
Morello and Amarena are expressed against that frame, crunching slightly and heightened by vivid freshness. \
This wine has compact, elegant power and inherent conviction. Drink through 2050. France Domaine du Pavilion \
Clos des Maréchaudes Grand Cru Burgundy Corton Albert Bichot"

In [43]:
get_prediction(user_input)

The wine's variety is most likely Pinot Noir with a probability of 99.955%

Other possible varieties are:

Syrah ==> 0.027%
Red Blend ==> 0.007%
Malbec ==> 0.005%
Chardonnay ==> 0.005%


### Let's consider predicting a miniority class 'Pinot Gris' wine variety from a recent review (published 12/1/2020) not in our training set from [wine ethusiast](https://www.winemag.com/buying-guide/antiquum-farm-2019-daisy-pinot-gris-willamette-valley/)

In [44]:
user_input2 = "This breaks new ground for Oregon's signature white grape. It's rich and leesy, fragrant and \
full, a bonanza of apple, pear and melon fruits. There's a lingering trail of honeysuckle, and a lush \
minerality that underscores the freshness. A stunning wine. United States of America Diasy Oregon Willamette \
Valley Willamette Valley Antiquum Farm"

In [45]:
user_input2

"This breaks new ground for Oregon's signature white grape. It's rich and leesy, fragrant and full, a bonanza of apple, pear and melon fruits. There's a lingering trail of honeysuckle, and a lush minerality that underscores the freshness. A stunning wine. United States of America Diasy Oregon Willamette Valley Willamette Valley Antiquum Farm"

In [46]:
get_prediction(user_input2)

The wine's variety is most likely Pinot Gris with a probability of 98.634%

Other possible varieties are:

Sauvignon Blanc ==> 0.646%
Syrah ==> 0.232%
Merlot ==> 0.212%
Chardonnay ==> 0.156%


### ...and voila, our model's predictions were as good as right!