# Predict wine quality from the description using pre-trained word embeddings


## Load and explore data

In [None]:
import gensim.models
import gensim
import numpy as np
import pandas as pd
import re

# Read the review CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/winemag-data-130k-v2.csv')
df.head()

In [None]:
# Column overview (dtypes, non-null counts), followed by how often each
# score occurs in the `points` column.
df.info()
df['points'].value_counts()

In [None]:
def _quality_label(points):
    """Return 'good' for a score above 89, otherwise 'bad'."""
    if points > 89:
        return 'good'
    return 'bad'


# Turn the numeric score into a binary label (> 89 = good, everything else = bad).
df['label'] = df['points'].map(_quality_label)

df.head()

## Modelling

In [None]:
# Compiled once, outside the per-row work.
_TAG_PATTERN = re.compile(r'&lt;/?.*?&gt;')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]+')


def _clean_description(text):
    """Normalise one raw description to lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case the text,
      2. drop (HTML-escaped) tags including their content,
      3. replace every run of characters other than a-z (punctuation, digits,
         whitespace, accented letters, ...) with a single space.

    Args:
        text: The raw description. Anything that is not a string (for example
            a missing value) is treated as an empty description.

    Returns:
        The cleaned string; it may start or end with a single space.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Tags must be removed *before* the non-letter pass: afterwards the
    # delimiters are gone and the tag pattern can never match. A plain space is
    # used as replacement so that no letters are injected into the text.
    text = _TAG_PATTERN.sub(' ', text)

    return _NON_LETTER_PATTERN.sub(' ', text)


# TODO: find a better cleaning approach!
# Iterate over the values directly instead of looking rows up by label, so the
# cleaning also works when the DataFrame index is not 0..n-1.
clean_txt = [_clean_description(text) for text in df['description']]

df['clean_desc'] = clean_txt
df.head()

In [None]:
# Tokenise every cleaned description into a list of words.
# `split()` without an argument is used (as in `sentence_to_vector`) so that
# leading/trailing or repeated spaces do not produce empty-string tokens.
corpus = [description.split() for description in df.clean_desc]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# Features are the cleaned description strings, targets the good/bad labels.
X, y = df['clean_desc'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def sentence_to_vector(sentence, word_embedding):
    """Return the mean embedding of the in-vocabulary words of *sentence*.

    Args:
        sentence: Whitespace-separated text.
        word_embedding: Either an object that exposes its vectors through a
            ``wv`` attribute, or the vectors object itself. The vectors object
            must support ``word in vectors``, ``vectors[word]`` and, for
            sentences without any known word, ``vectors.vector_size``.

    Returns:
        A 1-D array holding the element-wise mean of the vectors of all words
        found in the embedding. If no word is found (including an empty
        sentence) a zero vector of length ``vector_size`` is returned, so every
        sentence maps to a vector of the same shape.
    """
    # Accept both a full model (vectors live under `.wv`) and a bare vectors
    # object that has no `.wv` attribute.
    vectors = getattr(word_embedding, 'wv', word_embedding)

    known = [vectors[word] for word in sentence.split() if word in vectors]
    if not known:
        # np.mean([]) would yield a scalar NaN and make the feature matrix ragged.
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)

def train_bernoulli(word_embedding_path):
    """Fit and evaluate a Bernoulli naive Bayes classifier on sentence embeddings.

    The embedding stored at *word_embedding_path* is loaded, every description
    in the module-level ``X_train`` / ``X_test`` is converted to a vector with
    ``sentence_to_vector``, and a ``BernoulliNB`` model is fitted on the
    training vectors. The accuracy on the test split is printed.

    Args:
        word_embedding_path: Path of an embedding file readable by
            ``gensim.models.KeyedVectors.load``.

    Returns:
        The fitted ``BernoulliNB`` model.
    """
    embedding = gensim.models.KeyedVectors.load(word_embedding_path)

    def embed(sentences):
        # One vector per sentence, in the order of the input.
        return [sentence_to_vector(text, embedding) for text in sentences]

    train_vectors = embed(X_train)
    test_vectors = embed(X_test)

    nb_model = BernoulliNB()
    nb_model.fit(train_vectors, y_train)

    accuracy = accuracy_score(y_test, nb_model.predict(test_vectors))
    print(f'Accuracy: {accuracy}')
    return nb_model

In [None]:
# TODO: Continue that later https://machinelearningmastery.com/develop-word-embeddings-python-gensim
# Other pre-trained embeddings: https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

# One-time setup: the commented-out lines below download the Google News model
# (around 1.7GB) and save it to disk, so later runs can load the local copy.
#import gensim.downloader
#google_news_model = gensim.downloader.load('word2vec-google-news-300')
#google_news_model.save('embeddings/word2vec-google-news-300.bin')

# Forward slashes are used on purpose: a backslash before 'w' is an invalid
# escape sequence in a normal string literal, and '/' is accepted as a path
# separator on Windows as well as on POSIX systems.
model_google_news = train_bernoulli('embeddings/word2vec-google-news-300.bin')