# Predict the quality of wine based on the description with pre-trained word embeddings


## Load data

In [19]:
import pandas as pd

# Read the wine review CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/winemag-data-130k-v2.csv')

In [20]:
# Derive a binary quality label from the score:
# more than 89 points -> 'good', everything else -> 'bad'.
df['label'] = df['points'].gt(89).map({True: 'good', False: 'bad'})

## Modelling

In [21]:
from util import cleanse_data

# Run the project's cleansing helper over the frame and keep its result both as a
# standalone variable and as the new 'clean_desc' column.
# NOTE(review): presumably cleanse_data returns one cleaned description string per
# row of df - verify against util.cleanse_data.
df['clean_desc'] = clean_txt = cleanse_data(df)

In [22]:
# Tokenise every cleaned description on single spaces, giving one word list per review.
corpus = [description.split(' ') for description in df.clean_desc]

In [23]:
from sklearn.model_selection import train_test_split

# Features are the cleaned descriptions, the target is the binary quality label.
X, y = df['clean_desc'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Note: The Google News model is a download of around 1.7GB, so it is only fetched
# when no local copy exists yet. Afterwards it stays on disk and this cell becomes a
# no-op, which keeps "Restart Kernel -> Run All" working on a fresh checkout.
import os

import gensim.downloader as api

# A forward slash is used as the separator: it works on every OS, whereas the former
# 'embeddings\google-news-300.model' relied on the invalid escape sequence '\g' and
# only denoted a sub-directory on Windows.
if not os.path.exists('embeddings/google-news-300.model'):
    os.makedirs('embeddings', exist_ok=True)
    model = api.load('word2vec-google-news-300')
    model.save('embeddings/google-news-300.model')

In [25]:
from gensim.models import KeyedVectors

# Load the locally stored Google News vectors (must be the same file the download
# cell saves). The path uses a forward slash because the previous
# 'embeddings\google-news-300.model' contained the invalid escape sequence '\g'
# (a SyntaxWarning on recent Python versions) and only resolved to a
# sub-directory on Windows.
google_news_model = KeyedVectors.load('embeddings/google-news-300.model')

In [26]:
import numpy as np

def sentence_to_vector(sentence: str, word_embedding) -> np.ndarray:
    """Embed a sentence as the mean of the vectors of its known words.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into tokens.
    word_embedding
        Lookup supporting ``token in word_embedding`` and ``word_embedding[token]``
        and exposing ``vector_size`` (e.g. a gensim ``KeyedVectors`` instance).

    Returns
    -------
    np.ndarray
        Element-wise mean of the vectors of all tokens found in the embedding.
        If no token is found (empty sentence or only out-of-vocabulary words),
        a zero vector of length ``word_embedding.vector_size`` is returned.
    """
    known_vectors = [
        word_embedding[word] for word in sentence.split() if word in word_embedding
    ]
    if not known_vectors:
        # np.mean([]) would yield a scalar NaN (plus a RuntimeWarning), which cannot be
        # stacked with the other sentence vectors into a feature matrix.
        return np.zeros(word_embedding.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)


# Turn every description of the train and the test split into one sentence vector.
train_vectors = [
    sentence_to_vector(description, google_news_model)
    for description in X_train
]
test_vectors = [
    sentence_to_vector(description, google_news_model)
    for description in X_test
]

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes classifier on the sentence vectors of the training split.
# NOTE(review): BernoulliNB thresholds its inputs via the `binarize` parameter, so
# presumably only the sign of each embedding dimension is used - confirm this is intended.
nb_model = BernoulliNB().fit(train_vectors, y_train)

# Predict the held-out split and report the share of correctly classified reviews.
y_pred = nb_model.predict(test_vectors)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6922485093287171


We can see that the Google News word embedding predicts the quality slightly better than the custom word embedding. However, the TF-IDF approach still seems to work better for this use case. We could try other word embeddings or other models to possibly find a better solution, but for this assignment we're fine with the insights we have gained so far. In the next step we try to predict the reviewers based on the review description. -> See Notebook 05.