In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [3]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'red blend', 'bordeaux-style red blend', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz', 'flavor', 'wine'])

label_to_idx = {word: idx for idx, word in enumerate(valid_varieties)}
print(label_to_idx)

{'merlot': 0, 'sangiovese': 1, 'sparkling blend': 2, 'riesling': 3, 'portuguese white': 4, 'syrah': 5, 'grüner veltliner': 6, 'white blend': 7, 'portuguese red': 8, 'pinot noir': 9, 'red blend': 10, 'zinfandel': 11, 'gamay': 12, 'chardonnay': 13, 'gewürztraminer': 14, 'bordeaux-style white blend': 15, 'rhône-style red blend': 16, 'shiraz': 17, 'viognier': 18, 'pinot gris': 19, 'bordeaux-style red blend': 20, 'sauvignon blanc': 21, 'malbec': 22, 'cabernet franc': 23, 'rosé': 24, 'nebbiolo': 25, 'champagne blend': 26, 'pinot grigio': 27, 'tempranillo': 28, 'cabernet sauvignon': 29}


In [4]:
# Extract rows with just the valid varieties

def process_description(des):
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
#         word = stemmer.stem(word)
        if word not in excluded_words:
            processed_description.append(word)
            
    return " ".join(processed_description)

data, labels = [], []

for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
                labels.append(label_to_idx[variety])

print(len(data), len(labels))

105154 105154


In [5]:
# Print a sample of the data

print(data[:5])

['aromas include tropical fruit broom brimstone and dried herb the palate isnt overly expressive offering unripened apple citrus and dried sage alongside brisk acidity', 'this is ripe and fruity a that is smooth while still structured firm tannins are filled out with juicy red berry fruits and freshened with acidity its already drinkable although it will certainly be better from 2016', 'tart and snappy the flavors of lime flesh and rind dominate some green pineapple pokes through with crisp acidity underscoring the flavors the was all stainlesssteel fermented', 'pineapple rind lemon pith and orange blossom start off the aromas the palate is a bit more opulent with notes of honeydrizzled guava and mango giving way to a slightly astringent semidry finish', 'much like the regular bottling from 2012 this comes across as rather rough and tannic with rustic earthy herbal characteristics nonetheless if you think of it as a pleasantly unfussy country its a good companion to a hearty winter ste

In [6]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = np.array(stacked[:train_split, 1:].reshape(-1,), dtype=np.int32)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = np.array(stacked[train_split:, 1:].reshape(-1,), dtype= np.int32)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(84123,) (84123,)
(21031,) (21031,)


In [34]:
from torch import nn
from torch.autograd import Variable
import torch
import torch.utils.data
import torch.nn.functional as F

In [8]:
glove_path = 'data/glove/glove.6B.50d.txt'

In [9]:
def load_glove(path):
    """
    creates a dictionary mapping words to vectors from a file in glove format.
    """
    with open(path) as f:
        glove = {}
        for line in f.readlines():
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            glove[word] = vector
        return glove

In [10]:
def load_glove_embeddings(path, word2idx, embedding_dim=50):
    with open(path) as f:
        embeddings = np.zeros((len(word2idx), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = word2idx.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

In [11]:
%time glove = load_glove(glove_path)

CPU times: user 7.22 s, sys: 950 ms, total: 8.17 s
Wall time: 9.15 s


In [12]:
count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
count_vectorizer.fit(train_data)
vocab = count_vectorizer.vocabulary_.keys()
word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index

In [13]:
def longest_sequence(data):
    max_len = 0
    
    for seq in data:
        if len(seq) > max_len:
            max_len = len(seq)
    
    return max_len

def pad_with_max_length(max_len, data):
    res = np.zeros((len(data), max_len))
    for i, row in enumerate(data):
        for j, num in enumerate(row):
            res[i, j] = num
    return np.array(res, dtype=np.int64)

def convert_to_embedding_vocab(data):
    res = []
    for des in data:
        converted = []
        for word in des.split(" "):
            if word in word2idx:
                converted.append(word2idx[word])
        res.append(np.array(converted, dtype=np.int64))
        
    res = np.array(res)
    
    max_len = longest_sequence(res)
    
    return pad_with_max_length(max_len, res)

In [14]:
train_data = convert_to_embedding_vocab(train_data)
train_data = torch.from_numpy(train_data)
train_labels = torch.from_numpy(train_labels)

In [15]:
test_data = convert_to_embedding_vocab(test_data)
test_data = torch.from_numpy(test_data)
test_labels = torch.from_numpy(test_labels)

In [16]:
train_data = torch.utils.data.TensorDataset(train_data, train_labels)
test_data = torch.utils.data.TensorDataset(test_data, test_labels)

In [17]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=True)

In [18]:
glove_embeddings = load_glove_embeddings(glove_path, word2idx)

In [29]:
class CNN(nn.Module):
    
    def __init__(self, embeddings, num_outputs, kernel_num, kernel_sizes, static):
        super(CNN, self).__init__()
        
        self.static = static
        
        V = embeddings.shape[0]
        D = embeddings.shape[1]
        C = num_outputs
        Ci = 1
        Co = kernel_num
        Ks = kernel_sizes

        self.embed = nn.Embedding(V, D, padding_idx=0)
        self.embed.weight = nn.Parameter(embeddings)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        
        if self.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit

In [37]:
import torch.optim as optim

model = CNN(embeddings=glove_embeddings, num_outputs=30, kernel_num=100, kernel_sizes=[3,4,5], static=False)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [38]:
def train(model, trainloader, testloader, optimizer, epochs):
    print("Start training")
    validate(model, testloader)
    
    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Start epoch: " + str(epoch + 1))
        
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs), Variable(labels)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            if i % 100 == 0:    # print every 100 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0
                validate(model, testloader)

    print('Finished Training')

In [39]:
def validate(model, testloader):
    correct = 0
    total = 0
    for data in testloader:
        descriptions, labels = data
        outputs = model(Variable(descriptions))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy: %d %%' % (
        100 * correct / total))

In [40]:
train(model, trainloader, testloader, optimizer, 1)

Start training
Accuracy: 3 %
Start epoch: 1
[1,     1] loss: 0.035
Accuracy: 3 %
[1,   101] loss: 3.170
Accuracy: 11 %
[1,   201] loss: 3.097
Accuracy: 11 %
[1,   301] loss: 3.092
Accuracy: 12 %
[1,   401] loss: 3.058
Accuracy: 13 %
[1,   501] loss: 3.053
Accuracy: 13 %
[1,   601] loss: 3.044
Accuracy: 14 %
Finished Training


Accuracy: 20 %
