In [46]:
import numpy as np
import pandas as pd
import string
import nltk
nltk.download('stopwords')

from imblearn.over_sampling import SMOTE

from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [48]:
len(set(raw_descriptions))

119955

In [49]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec']) #, 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz', 'flavor', 'wine'])

varieties = ['pinot noir', 'chardonnay', 'cabernet sauvignon', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec']
label_to_idx = {word: idx for idx, word in enumerate(varieties)}
print(label_to_idx)

{'pinot noir': 0, 'chardonnay': 1, 'cabernet sauvignon': 2, 'riesling': 3, 'sauvignon blanc': 4, 'syrah': 5, 'rosé': 6, 'merlot': 7, 'nebbiolo': 8, 'zinfandel': 9, 'sangiovese': 10, 'malbec': 11}


In [50]:
# Extract rows with just the valid varieties

def process_description(des):
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
        word = stemmer.stem(word)
        if word not in excluded_words:
            processed_description.append(word)
            
    return " ".join(processed_description)

data, labels = [], []
for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety.lower() in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
                labels.append(variety)

print(len(data), len(labels))

66338 66338


In [51]:
# Print a sample of the data

print(data[:5])

['pineappl rind lemon pith and orang blossom start off the aroma the palat is a bit more opul with note of honeydrizzl guava and mango give way to a slight astring semidri finish', 'much like the regular bottl from 2012 this come across as rather rough and tannic with rustic earthi herbal characterist nonetheless if you think of it as a pleasant unfussi countri its a good companion to a hearti winter stew', 'soft suppl plum envelop an oaki structur in this support by 15 coffe and chocol complet the pictur finish strong at the end result in a valuepr of attract and immedi access', 'slight reduc this offer a chalki tannic backbon to an otherwis juici explos of rich black cherri the whole accent throughout by firm oak and cigar box', 'build on 150 year and six generat of winemak tradit the wineri trend toward a leaner style with the classic california buttercream aroma cut by tart green appl in this good everyday sip that rang from pear to bare ripe pineappl prove approach but not distinc

In [52]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = stacked[:train_split, 1:].reshape(-1,)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = stacked[train_split:, 1:].reshape(-1,)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(53070,) (53070,)
(13268,) (13268,)


In [53]:
def resample(data, labels):
    return SMOTE().fit_sample(train_data, train_labels)

In [54]:
# Compile vocabulary

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

tf_idf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))

# tokenize and build vocab
tf_idf_vectorizer.fit(train_data)

# summarize
print(len(tf_idf_vectorizer.vocabulary_))

count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
count_vectorizer.fit(train_data)

print(len(count_vectorizer.vocabulary_))

19792
380249


In [55]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

def multinomial_nb(vectorizer, train_data, train_labels, test_data, test_labels):
    clf = MultinomialNB()
    clf.fit(vectorizer.transform(train_data), train_labels)
    return clf.score(vectorizer.transform(test_data), test_labels)

In [56]:
# Dummy Classifier

from sklearn.dummy import DummyClassifier

def dummy_classifier(vectorizer, train_data, train_labels, test_data, test_labels):
    dummy_clf = DummyClassifier()
    dummy_clf.fit(vectorizer.transform(train_data), train_labels)
    return dummy_clf.score(vectorizer.transform(test_data), test_labels)

In [57]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

def logistic_regression(vectorizer, train_data, train_labels, test_data, test_labels):
    lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    lr.fit(vectorizer.transform(train_data), train_labels)
    return lr.score(vectorizer.transform(test_data), test_labels)

In [58]:
from sklearn.svm import LinearSVC

def svc(vectorizer, train_data, train_labels, test_data, test_labels):
    clf = LinearSVC()
    clf.fit(vectorizer.transform(train_data), train_labels)
    return clf.score(vectorizer.transform(test_data), test_labels)

In [59]:
def run_experiments():
    # Bag of words
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(svc(count_vectorizer, train_data, train_labels, test_data, test_labels))

    # Bigrams
    count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 2))
    count_vectorizer.fit(train_data)

    print(multinomial_nb(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(logistic_regression(count_vectorizer, train_data, train_labels, test_data, test_labels))
    print(svc(count_vectorizer, train_data, train_labels, test_data, test_labels))


In [60]:
run_experiments()

  (0, 50)	1
  (0, 407)	1
  (0, 558)	1
  (0, 787)	1
  (0, 1180)	1
  (0, 1448)	1
  (0, 1609)	1
  (0, 2994)	1
  (0, 3112)	1
  (0, 5191)	1
  (0, 5316)	1
  (0, 8164)	1
  (0, 11387)	1
  (0, 11913)	1
  (0, 12048)	1
  (0, 12359)	1
  (0, 13178)	1
  (0, 13858)	1
  (0, 15565)	1
  (0, 16500)	1
  (0, 17177)	1
  (0, 17223)	1
  (0, 18856)	1
  (0, 19378)	1
  (1, 1828)	1
  :	:
  (53068, 15987)	1
  (53068, 16116)	1
  (53068, 16219)	1
  (53068, 16610)	1
  (53068, 17659)	1
  (53068, 17718)	1
  (53068, 18643)	1
  (53068, 18708)	1
  (53068, 19068)	1
  (53068, 19446)	1
  (53069, 216)	1
  (53069, 655)	1
  (53069, 2122)	1
  (53069, 4317)	1
  (53069, 5191)	1
  (53069, 6688)	1
  (53069, 6811)	1
  (53069, 9497)	1
  (53069, 12359)	1
  (53069, 13460)	1
  (53069, 14081)	1
  (53069, 14378)	1
  (53069, 15428)	1
  (53069, 17576)	1
  (53069, 19633)	1


ValueError: Expected 2D array, got 1D array instead:
array=['this open with aroma of char earth violet and wild berri the straightforward palat offer black raspberri tart cherri and a hint of anis alongsid polish tannin its rather simpl for a barolo and near access drink 2016–2021'
 'this bold and fruiti carri scent and of cherri candi its a fine bottl for chill and share with summer picnic food'
 'a touch green and veget on the nose with hint of can pea and bell pepper fortun there enough lime and miner to keep it fresh feel crisp tight and wet with green note pervad core citrus finish pithi and zesti with mild bitter'
 ...
 'dilut oddbal aroma of blueberri and currant are render earthi and less than fulli clean by a whiff of compost this feel basic and jammi while floral berri settl on herbal and pepperi it finish lean'
 'this fabul has been with cab and petit verdot its incred complex in both and structur offer wave of blackberri and cherri currant dark chocol and toast that go on and on into a long spici finish the grape come from various sourc includ st helena and atlas peak show winemak tom hind masteri of the art of'
 'fruiti and ripe this is a tight very fresh the crispest appl and brightest lemon give a refresh shock to the palat the should probabl age for a few year so drink after 2014'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.