In [1]:
import numpy as np
import pandas as pd
import string

from collections import Counter

In [2]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [3]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'red blend', 'bordeaux-style red blend', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])

In [4]:
# Extract rows with just the valid varieties

def process_description(des):
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
        if word not in excluded_words:
            processed_description.append(word)
            
    return " ".join(processed_description)

data, labels = [], []
for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety.lower() in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
                labels.append(variety)

print(len(data), len(labels))

105154 105154


In [5]:
# Print a sample of the data

print(data[:5])

['aromas include tropical fruit broom brimstone and dried herb the palate isnt overly expressive offering unripened apple citrus and dried sage alongside brisk acidity', 'this is ripe and fruity a wine that is smooth while still structured firm tannins are filled out with juicy red berry fruits and freshened with acidity its already drinkable although it will certainly be better from 2016', 'tart and snappy the flavors of lime flesh and rind dominate some green pineapple pokes through with crisp acidity underscoring the flavors the wine was all stainlesssteel fermented', 'pineapple rind lemon pith and orange blossom start off the aromas the palate is a bit more opulent with notes of honeydrizzled guava and mango giving way to a slightly astringent semidry finish', 'much like the regular bottling from 2012 this comes across as rather rough and tannic with rustic earthy herbal characteristics nonetheless if you think of it as a pleasantly unfussy country wine its a good companion to a he

In [6]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = stacked[:train_split, 1:].reshape(-1,)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = stacked[train_split:, 1:].reshape(-1,)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(84123,) (84123,)
(21031,) (21031,)


In [14]:
# Compile vocabulary

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

tf_idf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))

# tokenize and build vocab
tf_idf_vectorizer.fit(train_data)

# summarize
print(len(tf_idf_vectorizer.vocabulary_))

count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
count_vectorizer.fit(train_data)

print(len(count_vectorizer.vocabulary_))

33486
613277


In [8]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

def multinomial_nb(vectorizer, train_data, train_labels, test_data, test_labels):
    clf = MultinomialNB()
    clf.fit(vectorizer.transform(train_data), train_labels)
    return clf.score(vectorizer.transform(test_data), test_labels)

In [9]:
# Dummy Classifier

from sklearn.dummy import DummyClassifier

def dummy_classifier(vectorizer, train_data, train_labels, test_data, test_labels):
    dummy_clf = DummyClassifier()
    dummy_clf.fit(vectorizer.transform(train_data), train_labels)
    return dummy_clf.score(vectorizer.transform(test_data), test_labels)

In [10]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

def logistic_regression(vectorizer, train_data, train_labels, test_data, test_labels):
    lr = LogisticRegression()
    lr.fit(vectorizer.transform(train_data), train_labels)
    return lr.score(vectorizer.transform(test_data), test_labels)

In [15]:
logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels)

0.6545100090342827

In [20]:
def run_experiments():
    # Bag of words
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))

    # Bigrams
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))

In [21]:
run_experiments()

0.5386334458656269
0.6295944082544814
0.4781513004612239
0.6545100090342827
