In [479]:
import numpy as np
import pandas as pd
import string
import nltk
import random

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE

from torch import nn
from torch.autograd import Variable
import torch
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

USE_CUDA = False

if torch.cuda.is_available():
    USE_CUDA = True

print(USE_CUDA)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
True


In [480]:
def load_glove_embeddings(path, word2idx, embedding_dim=100):
    with open(path) as f:
        embeddings = np.zeros((len(word2idx), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = word2idx.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

In [481]:
def load_glove(path):
    """
    creates a dictionary mapping words to vectors from a file in glove format.
    """
    with open(path) as f:
        glove = {}
        for line in f.readlines():
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            glove[word] = vector
        return glove

In [482]:
# embedding_path = 'data/glove/glove.6B.300d.txt'
embedding_path = 'data/GoogleNews-vectors-negative300.txt'

In [483]:
# %time glove = load_glove(embedding_path)

In [484]:
raw_data = pd.read_csv("data/winemag-data-130k-v2.csv")
raw_descriptions = raw_data['description']
raw_varieties = raw_data['variety']
raw_provinces = raw_data['province']
raw_points = raw_data['points']

In [485]:
valid_varieties = set(['pinot noir', 'chardonnay', 'cabernet sauvignon', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec']) #, 'portuguese red', 'white blend', 'sparkling blend', 'tempranillo', 'rhône-style red blend', 'pinot gris', 'champagne blend', 'cabernet franc', 'grüner veltliner', 'portuguese white', 'bordeaux-style white blend', 'pinot grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz'])
excluded_words = set(['pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'bordeaux-style', 'blend', 'riesling', 'sauvignon',  'blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec', 'portuguese', 'tempranillo', 'rhône-style', 'pinot', 'gris', 'champagne', 'franc', 'grüner',  'veltliner', 'portuguese', 'grigio', 'gamay', 'gewürztraminer', 'viognier', 'shiraz', 'flavor', 'wine'])

varieties = ['pinot noir', 'chardonnay', 'cabernet sauvignon', 'riesling', 'sauvignon blanc', 'syrah', 'rosé', 'merlot', 'nebbiolo', 'zinfandel', 'sangiovese', 'malbec']
label_to_idx = {word: idx for idx, word in enumerate(varieties)}
print(label_to_idx)

{'pinot noir': 0, 'chardonnay': 1, 'cabernet sauvignon': 2, 'riesling': 3, 'sauvignon blanc': 4, 'syrah': 5, 'rosé': 6, 'merlot': 7, 'nebbiolo': 8, 'zinfandel': 9, 'sangiovese': 10, 'malbec': 11}


In [486]:
# Extract rows with just the valid varieties

def process_description(des):
    processed_description = []
    
    table = str.maketrans({key: None for key in string.punctuation})
    des = des.translate(table)
    
    for word in des.split():
        word = word.lower()
        if word not in excluded_words:
            processed_description.append(word)
            
    return processed_description

data, labels = [], []

for i, variety in enumerate(raw_varieties):
    if type(variety) is not float:
        variety = variety.lower()
        if variety in valid_varieties:
            if type(raw_descriptions[i]) is not float:                
                data.append(process_description(raw_descriptions[i]))
#                 if variety == 'bordeaux-style red blend' or variety == 'rhône-style red blend':
#                     variety = 'red blend'
                labels.append(label_to_idx[variety])

print(len(data), len(labels))
print(Counter(labels).most_common())

66338 66338
[(0, 13272), (1, 11753), (2, 9472), (3, 5189), (4, 4967), (5, 4142), (6, 3564), (7, 3102), (8, 2804), (9, 2714), (10, 2707), (11, 2652)]


In [487]:
# Print a sample of the data

print(data[:5])

[['pineapple', 'rind', 'lemon', 'pith', 'and', 'orange', 'blossom', 'start', 'off', 'the', 'aromas', 'the', 'palate', 'is', 'a', 'bit', 'more', 'opulent', 'with', 'notes', 'of', 'honeydrizzled', 'guava', 'and', 'mango', 'giving', 'way', 'to', 'a', 'slightly', 'astringent', 'semidry', 'finish'], ['much', 'like', 'the', 'regular', 'bottling', 'from', '2012', 'this', 'comes', 'across', 'as', 'rather', 'rough', 'and', 'tannic', 'with', 'rustic', 'earthy', 'herbal', 'characteristics', 'nonetheless', 'if', 'you', 'think', 'of', 'it', 'as', 'a', 'pleasantly', 'unfussy', 'country', 'its', 'a', 'good', 'companion', 'to', 'a', 'hearty', 'winter', 'stew'], ['soft', 'supple', 'plum', 'envelopes', 'an', 'oaky', 'structure', 'in', 'this', 'supported', 'by', '15', 'coffee', 'and', 'chocolate', 'complete', 'the', 'picture', 'finishing', 'strong', 'at', 'the', 'end', 'resulting', 'in', 'a', 'valuepriced', 'of', 'attractive', 'and', 'immediate', 'accessibility'], ['slightly', 'reduced', 'this', 'offers'

In [488]:
# Split 80/20 training-test

stacked = np.hstack([np.array(data).reshape(-1, 1), np.array(labels).reshape(-1, 1)])
np.random.shuffle(stacked)

train_split = int(len(stacked) * 0.8)

train_data = stacked[:train_split, :1].reshape(-1,)
train_labels = np.array(stacked[:train_split, 1:].reshape(-1,), dtype=np.int32)

test_data = stacked[train_split:, :1].reshape(-1,)
test_labels = np.array(stacked[train_split:, 1:].reshape(-1,), dtype= np.int32)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(53070,) (53070,)
(13268,) (13268,)


In [489]:
from nltk.corpus import wordnet as wn

POS = {
    'v': 'verb', 'a': 'adjective', 's': 'satellite adjective', 
    'n': 'noun', 'r': 'adverb'}

def synonym(word, pos):
    syns = word
    synset = None
    if pos == 'NN':
        synset = wn.synsets(word, 'n')
    elif pos == 'VB':
        synset = wn.synsets(word, 'v')
    elif pos == 'JJ':
        synset = wn.synsets(word, 'a')
        
    if synset and len(synset) > 0:
        synset = synset[0]
        syns = [n.replace('_', ' ') for n in synset.lemma_names()]
        
        if word in syns:
            syns.remove(word)

        if len(syns) > 0:
            return random.choice(syns).split(' ')
        
    return [word]

def augment_data(data, labels, labels_to_augment):
    new_data = []
    new_labels = []
    
    for i, des in enumerate(data):
        if labels[i] in labels_to_augment:
            pos = nltk.pos_tag(des)
            new_des = []

            # Replace every possible word in the description with a synonym, if possible
            for j in range(0, len(des)):
                new_des += synonym(des[j], pos[j][1])
            
            new_data.append(new_des)
            new_labels.append(labels[i])
    
    return new_data, new_labels

In [490]:
# synonym_data, synonym_labels = augment_data(train_data, train_labels, range(3, 12))

In [491]:
# train_data = np.concatenate((train_data, synonym_data), axis=0)
# train_labels = np.concatenate((train_labels, synonym_labels), axis=0)

# train_data.shape, train_labels.shape

In [492]:
count_vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False, stop_words='english', token_pattern='[a-z]+', ngram_range=(1, 1))
count_vectorizer.fit(train_data)
vocab = count_vectorizer.vocabulary_.keys()
word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index

In [493]:
def longest_sequence(data):
    max_len = 0
    
    for seq in data:
        if len(seq) > max_len:
            max_len = len(seq)
    
    return max_len

def pad_with_max_length(max_len, data):
    res = np.zeros((len(data), max_len))
    for i, row in enumerate(data):
        for j, num in enumerate(row):
            res[i, j] = num
    return np.array(res, dtype=np.int64)

def convert_to_embedding_vocab(data):
    res = []
    for des in data:
        converted = []
        for word in des:
            if word in word2idx:
                converted.append(word2idx[word])
                
        res.append(np.array(converted, dtype=np.int64))
        
    res = np.array(res)
    
    max_len = longest_sequence(res)
    
    return pad_with_max_length(max_len, res)

In [494]:
train_data_embedded = convert_to_embedding_vocab(train_data)
train_data = train_data_embedded

In [495]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)

train_data_resampled, train_labels_resampled = ros.fit_sample(train_data_embedded, train_labels)
train_data = train_data_resampled.astype('int64')
train_labels = train_labels_resampled

In [496]:
train_data = torch.from_numpy(train_data)
train_labels = torch.from_numpy(train_labels)

In [497]:
test_data = convert_to_embedding_vocab(test_data)
test_data = torch.from_numpy(test_data)
test_labels = torch.from_numpy(test_labels)

In [498]:
train_data = torch.utils.data.TensorDataset(train_data, train_labels)
test_data = torch.utils.data.TensorDataset(test_data, test_labels)

In [499]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=100, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=100, shuffle=True)

In [500]:
glove_embeddings = load_glove_embeddings(embedding_path, word2idx, embedding_dim=300)

In [501]:
class CNN(nn.Module):
    
    def __init__(self, embeddings, num_outputs, kernel_num, kernel_sizes, static):
        super(CNN, self).__init__()
        
        self.static = static
        
        V = embeddings.shape[0]
        D = embeddings.shape[1]
        C = num_outputs
        Ci = 1
        Co = kernel_num
        Ks = kernel_sizes

        self.embed = nn.Embedding(V, D)
        self.embed.weight = nn.Parameter(embeddings)
        
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        
        if self.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit

In [502]:
def train(model, trainloader, testloader, optimizer, epochs=10, scheduler=None):
    print("Start training")
    model.train()
    
    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Start epoch: " + str(epoch + 1))
        
        running_loss = 0.0
        total_batches = 0
        for i, data in enumerate(trainloader):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs), Variable(labels)
            
            if USE_CUDA:
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            total_batches += 1

        print('[%d, %5d] loss: %.7f' %
                      (epoch + 1, i + 1, running_loss / total_batches))

        val_loss = validate(model, testloader)
        
        if scheduler:
            scheduler.step(val_loss)

    print('Finished Training')

In [503]:
def validate(model, testloader):
    correct = 0
    total = 0
    model.eval()
    
    for data in testloader:
        descriptions, labels = data
        
        if USE_CUDA:
            descriptions, labels = descriptions.cuda(), labels.cuda()
            
        outputs = model(Variable(descriptions))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy: %f%%' % (
        100 * float(correct) / total))
    
    return float(correct) / total

In [None]:
class_weights = np.zeros(len(label_to_idx))
class_counter = Counter(train_labels).most_common()
print(class_counter)

for label, ct in class_counter:
    class_weights[label] = float(ct) / class_counter[0][1]
    
class_weights = torch.from_numpy(class_weights.astype(np.float32))

if USE_CUDA:
    class_weights = class_weights.cuda()

[(5, 10649), (1, 10649), (11, 10649), (0, 10649), (6, 10649), (3, 10649), (7, 10649), (2, 10649), (9, 10649), (4, 10649), (10, 10649), (8, 10649)]


In [None]:
model = CNN(embeddings=glove_embeddings, num_outputs=len(label_to_idx), kernel_num=100, kernel_sizes=[3,4,5], static=False)

if USE_CUDA:
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, mode='min', verbose=True)

train(model, trainloader, testloader, optimizer=optimizer, scheduler=scheduler, epochs=150)

Start training
Start epoch: 1
[1,  1278] loss: 1.2774162
Accuracy: 61.817908%
Start epoch: 2
[2,  1278] loss: 0.4519409
Accuracy: 69.264396%
Start epoch: 3
[3,  1278] loss: 0.1504534
Accuracy: 70.711486%
Start epoch: 4
[4,  1278] loss: 0.0449013
Accuracy: 70.560748%
Start epoch: 5
[5,  1278] loss: 0.0109693
Accuracy: 71.533012%
Start epoch: 6
[6,  1278] loss: 0.0035853
Accuracy: 71.826952%
Start epoch: 7
[7,  1278] loss: 0.0028958
Accuracy: 71.774194%
Start epoch: 8
[8,  1278] loss: 0.0026631
Accuracy: 71.932469%
Start epoch: 9
[9,  1278] loss: 0.0019512
Accuracy: 71.766657%
Start epoch: 10
[10,  1278] loss: 0.0021344
Accuracy: 71.713898%
Start epoch: 11
[11,  1278] loss: 0.0019246
Accuracy: 71.646066%
Start epoch: 12
[12,  1278] loss: 0.0015050
Accuracy: 71.872174%
Epoch    11: reducing learning rate of group 0 to 1.0000e-02.
Start epoch: 13
[13,  1278] loss: 0.0017351
Accuracy: 71.751583%
Start epoch: 14
[14,  1278] loss: 0.0009091
Accuracy: 71.857100%
Start epoch: 15
[15,  1278] los

In [None]:
torch.save(model, 'models/cnn_google_dropout_0.5_oversample.pth')

In [None]:
Counter(train_labels)