In [1]:
import numpy as np
import pandas as pd
import string
import nltk
nltk.download('stopwords')

from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /home/nyc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [3]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'red blend', 'bordeaux-style red blend', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz', 'flavor', 'wine'])

In [4]:
# Extract rows with just the valid varieties

def process_description(des):
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
        word = stemmer.stem(word)
        if word not in excluded_words:
            processed_description.append(word)
            
    return " ".join(processed_description)

data, labels = [], []
for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety.lower() in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
                labels.append(variety)

print(len(data), len(labels))

105154 105154


In [5]:
# Print a sample of the data

print(data[:5])

['aroma includ tropic fruit broom brimston and dri herb the palat isnt over express offer unripen appl citrus and dri sage alongsid brisk acid', 'this is ripe and fruiti a that is smooth while still structur firm tannin are fill out with juici red berri fruit and freshen with acid its alreadi drinkabl although it will certain be better from 2016', 'tart and snappi the of lime flesh and rind domin some green pineappl poke through with crisp acid underscor the the was all stainlesssteel ferment', 'pineappl rind lemon pith and orang blossom start off the aroma the palat is a bit more opul with note of honeydrizzl guava and mango give way to a slight astring semidri finish', 'much like the regular bottl from 2012 this come across as rather rough and tannic with rustic earthi herbal characterist nonetheless if you think of it as a pleasant unfussi countri its a good companion to a hearti winter stew']


In [6]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = stacked[:train_split, 1:].reshape(-1,)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = stacked[train_split:, 1:].reshape(-1,)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(84123,) (84123,)
(21031,) (21031,)


In [7]:
# Compile vocabulary

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

tf_idf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))

# tokenize and build vocab
tf_idf_vectorizer.fit(train_data)

# summarize
print(len(tf_idf_vectorizer.vocabulary_))

count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
count_vectorizer.fit(train_data)

print(len(count_vectorizer.vocabulary_))

25665
513635


In [8]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

def multinomial_nb(vectorizer, train_data, train_labels, test_data, test_labels):
    clf = MultinomialNB()
    clf.fit(vectorizer.transform(train_data), train_labels)
    return clf.score(vectorizer.transform(test_data), test_labels)

In [9]:
# Dummy Classifier

from sklearn.dummy import DummyClassifier

def dummy_classifier(vectorizer, train_data, train_labels, test_data, test_labels):
    dummy_clf = DummyClassifier()
    dummy_clf.fit(vectorizer.transform(train_data), train_labels)
    return dummy_clf.score(vectorizer.transform(test_data), test_labels)

In [41]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

def logistic_regression(vectorizer, train_data, train_labels, test_data, test_labels):
    lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    lr.fit(vectorizer.transform(train_data), train_labels)
    return lr.score(vectorizer.transform(test_data), test_labels)

In [42]:
def run_experiments():
    # Bag of words
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))

    # Bigrams
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))

In [43]:
run_experiments()

0.5490941942846275
0.6258380485949313
0.48666254576577433
0.6517046265037326
