In [1]:
import numpy as np
import pandas as pd
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from torch import nn
from torch.autograd import Variable
import torch
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

USE_CUDA = False

if torch.cuda.is_available():
    USE_CUDA = True

In [2]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [3]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'red blend', 'bordeaux-style red blend', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot']) #, 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz', 'flavor', 'wine'])

label_to_idx = {word: idx for idx, word in enumerate(valid_varieties)}
print(label_to_idx)

{'sauvignon blanc': 0, 'rosé': 1, 'red blend': 2, 'cabernet sauvignon': 3, 'bordeaux-style red blend': 4, 'merlot': 5, 'chardonnay': 6, 'pinot noir': 7, 'syrah': 8, 'riesling': 9}


In [4]:
# Extract rows with just the valid varieties

def process_description(des):
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
        if word not in excluded_words:
            processed_description.append(word)
            
    return " ".join(processed_description)

data, labels = [], []

for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
                labels.append(label_to_idx[variety])

print(len(data), len(labels))

71322 71322


In [5]:
# Print a sample of the data

print(data[:5])

['pineapple rind lemon pith and orange blossom start off the aromas the palate is a bit more opulent with notes of honeydrizzled guava and mango giving way to a slightly astringent semidry finish', 'much like the regular bottling from 2012 this comes across as rather rough and tannic with rustic earthy herbal characteristics nonetheless if you think of it as a pleasantly unfussy country its a good companion to a hearty winter stew', 'soft supple plum envelopes an oaky structure in this supported by 15 coffee and chocolate complete the picture finishing strong at the end resulting in a valuepriced of attractive and immediate accessibility', 'slightly reduced this offers a chalky tannic backbone to an otherwise juicy explosion of rich black cherry the whole accented throughout by firm oak and cigar box', 'building on 150 years and six generations of winemaking tradition the winery trends toward a leaner style with the classic california buttercream aroma cut by tart green apple in this g

In [6]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = np.array(stacked[:train_split, 1:].reshape(-1,), dtype=np.int32)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = np.array(stacked[train_split:, 1:].reshape(-1,), dtype= np.int32)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(57057,) (57057,)
(14265,) (14265,)


In [7]:
glove_path = 'data/glove/glove.6B.300d.txt'

In [8]:
def load_glove(path):
    """
    creates a dictionary mapping words to vectors from a file in glove format.
    """
    with open(path) as f:
        glove = {}
        for line in f.readlines():
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            glove[word] = vector
        return glove

In [9]:
def load_glove_embeddings(path, word2idx, embedding_dim=50):
    with open(path) as f:
        embeddings = np.zeros((len(word2idx), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = word2idx.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

In [10]:
%time glove = load_glove(glove_path)

CPU times: user 24.6 s, sys: 880 ms, total: 25.5 s
Wall time: 25.5 s


In [11]:
count_vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
count_vectorizer.fit(train_data)
vocab = count_vectorizer.vocabulary_.keys()
word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index

In [12]:
def longest_sequence(data):
    max_len = 0
    
    for seq in data:
        if len(seq) > max_len:
            max_len = len(seq)
    
    return max_len

def pad_with_max_length(max_len, data):
    res = np.zeros((len(data), max_len))
    for i, row in enumerate(data):
        for j, num in enumerate(row):
            res[i, j] = num
    return np.array(res, dtype=np.int64)

def convert_to_embedding_vocab(data):
    res = []
    for des in data:
        converted = []
        for word in des.split(" "):
            if word in word2idx:
                converted.append(word2idx[word])
        res.append(np.array(converted, dtype=np.int64))
        
    res = np.array(res)
    
    max_len = longest_sequence(res)
    
    return pad_with_max_length(max_len, res)

In [13]:
train_data = convert_to_embedding_vocab(train_data)
train_data = torch.from_numpy(train_data)
train_labels = torch.from_numpy(train_labels)

In [14]:
test_data = convert_to_embedding_vocab(test_data)
test_data = torch.from_numpy(test_data)
test_labels = torch.from_numpy(test_labels)

In [15]:
train_data = torch.utils.data.TensorDataset(train_data, train_labels)
test_data = torch.utils.data.TensorDataset(test_data, test_labels)

In [16]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=50, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)

In [17]:
glove_embeddings = load_glove_embeddings(glove_path, word2idx, embedding_dim=300)

In [18]:
class CNN(nn.Module):
    
    def __init__(self, embeddings, num_outputs, kernel_num, kernel_sizes, static):
        super(CNN, self).__init__()
        
        self.static = static
        
        V = embeddings.shape[0]
        D = embeddings.shape[1]
        C = num_outputs
        Ci = 1
        Co = kernel_num
        Ks = kernel_sizes

        self.embed = nn.Embedding(V, D)
#         self.embed.weight = nn.Parameter(embeddings)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        
        if self.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit

In [19]:
def train(model, trainloader, testloader, optimizer, epochs=10, scheduler=None):
    print("Start training")
    model.train()
    
    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Start epoch: " + str(epoch + 1))
        
        running_loss = 0.0
        total_batches = 0
        for i, data in enumerate(trainloader):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs), Variable(labels)
            
            if USE_CUDA:
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            total_batches += 1

        print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))

        val_loss = validate(model, testloader)
        
        if scheduler:
            scheduler.step(val_loss)

    print('Finished Training')

In [20]:
def validate(model, testloader):
    correct = 0
    total = 0
    model.eval()
    
    for data in testloader:
        descriptions, labels = data
        
        if USE_CUDA:
            descriptions, labels = descriptions.cuda(), labels.cuda()
            
        outputs = model(Variable(descriptions))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy: %f%%' % (
        100 * float(correct) / total))
    
    return float(correct) / total

In [25]:
model = CNN(embeddings=glove_embeddings, num_outputs=len(label_to_idx), kernel_num=100, kernel_sizes=[3,4,5], static=False)

if USE_CUDA:
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, mode='min', verbose=True)

train(model, trainloader, testloader, optimizer=optimizer, scheduler=scheduler, epochs=120)

Start training
Start epoch: 1
[1,  1142] loss: 20.702
Accuracy: 55.646688%
Start epoch: 2
[2,  1142] loss: 12.351
Accuracy: 53.753943%
Start epoch: 3
[3,  1142] loss: 9.693
Accuracy: 59.298984%
Start epoch: 4
[4,  1142] loss: 8.988
Accuracy: 54.293726%
Start epoch: 5
[5,  1142] loss: 11.395
Accuracy: 46.147914%
Start epoch: 6
[6,  1142] loss: 14.318
Accuracy: 56.459867%
Start epoch: 7
[7,  1142] loss: 16.294
Accuracy: 59.880827%
Start epoch: 8
[8,  1142] loss: 19.933
Accuracy: 59.011567%
Start epoch: 9
[9,  1142] loss: 22.409
Accuracy: 62.467578%
Start epoch: 10
[10,  1142] loss: 22.400
Accuracy: 60.708027%
Start epoch: 11
[11,  1142] loss: 23.710
Accuracy: 57.539432%
Start epoch: 12
[12,  1142] loss: 25.128
Accuracy: 60.161234%
Start epoch: 13
[13,  1142] loss: 26.242
Accuracy: 59.284963%
Start epoch: 14
[14,  1142] loss: 24.744
Accuracy: 61.738521%
Start epoch: 15
[15,  1142] loss: 21.975
Accuracy: 62.509639%
Start epoch: 16
[16,  1142] loss: 20.392
Accuracy: 63.841570%
Epoch    15: 

KeyboardInterrupt: 

In [26]:
validate(model, testloader)

Accuracy: 65.348756%


0.6534875569575885

In [23]:
class LSTMClassifier(nn.Module):

    def __init__(self, embeddings, hidden_dim, label_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.lstm = nn.LSTM(embeddings.shape[1], hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # the first is the hidden h
        # the second is the cell  c
        return (torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                torch.autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = embeds.view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        log_probs = F.log_softmax(y)
        return log_probs

In [24]:
lstm_model = LSTMClassifier(glove_embeddings, 100, 20)

if USE_CUDA:
    lstm_model = lstm_model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(lstm_model.parameters(), lr=0.01, momentum=0.9)

# train(model, trainloader, testloader, optimizer=optimizer, scheduler=None, epochs=50)