In [1]:
dataType = 'beauty'
dataCol = 'Benefits'
#Beauty ['Benefits', 'Brand', 'Colour_group', 'Product_texture', 'Skin_type']
#Fashion ['Pattern', 'Collar Type', 'Fashion Trend', 'Clothing Material', 'Sleeves']

MAIN_DIR=''

In [2]:
#Define data files
json_mobile=MAIN_DIR+'data_full/mobile_profile_train.json'
json_fashion=MAIN_DIR+'data_full/fashion_profile_train.json'
json_beauty=MAIN_DIR+'data_full/beauty_profile_train.json'
train_mobile=MAIN_DIR+'data_full/mobile_data_info_train_competition.csv'
train_fashion=MAIN_DIR+'data_full/fashion_data_info_train_competition.csv'
train_beauty=MAIN_DIR+'data_full/beauty_data_info_train_competition.csv'
val_mobile=MAIN_DIR+'data_full/mobile_data_info_val_competition.csv'
val_fashion=MAIN_DIR+'data_full/fashion_data_info_val_competition.csv'
val_beauty=MAIN_DIR+'data_full/beauty_data_info_val_competition.csv'

files_dict={'mobile':[json_mobile,train_mobile,val_mobile],'fashion':[json_fashion,train_fashion,val_fashion],'beauty':[json_beauty,train_beauty,val_beauty]}
#files = [json_fashion,train_fashion,val_fashion]
#files = [json_mobile,train_mobile,val_mobile]
#files = [json_beauty,train_beauty,val_beauty]

# Helper functions

In [3]:
import json

def write_json(data,filename):
    print("")
    
def read_json(filename):
    if filename:
        with open(filename, 'r') as f:
            datastore = json.load(f)
    
    return datastore

In [4]:
import os
import pickle
import torch
import pandas as pd


SPECIAL_WORDS = {'PADDING': '<PAD>'}


def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data


def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data
    """
    text = load_data(dataset_path)
    
    # Ignore notice, since we don't use it for analysing the data
    #text = text[81:]

    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))


def load_preprocess():
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less
    """
    return pickle.load(open('preprocess.p', mode='rb'))


def save_model(filename, decoder):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    torch.save(decoder, save_filename)


def load_model(filename):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    return torch.load(save_filename,map_location=lambda storage, loc: storage)


# Get data

In [5]:
import pandas as pd
df=pd.read_csv(files_dict[dataType][1])
df.head()

Unnamed: 0,itemid,title,image_path,Benefits,Brand,Colour_group,Product_texture,Skin_type
0,307504,nyx sex bomb pallete natural palette,beauty_image/6b2e9cbb279ac95703348368aa65da09.jpg,1.0,157.0,,,
1,461203,etude house precious mineral any cushion pearl...,beauty_image/20450222d857c9571ba8fa23bdedc8c9.jpg,,73.0,11.0,7.0,
2,3592295,milani rose powder blush,beauty_image/6a5962bed605a3dd6604ca3a4278a4f9.jpg,,393.0,20.0,6.0,
3,4460167,etude house baby sweet sugar powder,beauty_image/56987ae186e8a8e71fcc5a261ca485da.jpg,,73.0,,6.0,
4,5853995,bedak revlon color stay aqua mineral make up,beauty_image/9c6968066ebab57588c2f757a240d8b9.jpg,3.0,47.0,,6.0,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286583 entries, 0 to 286582
Data columns (total 8 columns):
itemid             286583 non-null int64
title              286583 non-null object
image_path         286583 non-null object
Benefits           113556 non-null float64
Brand              238128 non-null float64
Colour_group       121324 non-null float64
Product_texture    244295 non-null float64
Skin_type          58410 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 17.5+ MB


In [7]:
df2 = df[['itemid','title',dataCol]]
df2=df2.dropna()
df2=df2.reset_index(drop=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113556 entries, 0 to 113555
Data columns (total 3 columns):
itemid      113556 non-null int64
title       113556 non-null object
Benefits    113556 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.6+ MB


In [8]:
df2.head(10)

Unnamed: 0,itemid,title,Benefits
0,307504,nyx sex bomb pallete natural palette,1.0
1,5853995,bedak revlon color stay aqua mineral make up,3.0
2,6208490,dr pure whitening cream,6.0
3,9184082,sunprise all proof spf 50,6.0
4,11425757,milani rose powder blush tea,1.0
5,14954162,giordani gold age defying compact foundation d...,6.0
6,15584552,the body shop refill moisture white perfect fo...,6.0
7,16397989,lancome blush subtil long lasting powder blush...,1.0
8,19764106,cream dr biru original,6.0
9,22836092,make over cleansing cream,3.0


## Set classes

In [9]:
categories = read_json(files_dict[dataType][0])
#print(categories.keys())
#print(categories.values())
classes=list(categories[dataCol].keys())
print(classes)

['high pigmentation', 'natural', 'light', 'hydrating', 'durable', 'oil control', 'spf']


# Pre-process text

## Create Dictionary

### Create text file

In [10]:
df.title.head()

0                 nyx sex bomb pallete natural palette
1    etude house precious mineral any cushion pearl...
2                             milani rose powder blush
3                  etude house baby sweet sugar powder
4         bedak revlon color stay aqua mineral make up
Name: title, dtype: object

In [11]:
df['title'].to_csv(dataType+'_title.txt',header=None,index=None, sep=' ', mode='a')

### Create lookup table

In [10]:
from string import punctuation
from collections import Counter

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # TODO: Implement Function

    counts = Counter(text)
    vocab = sorted(counts, key=counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    
    # return tuple
    return (vocab_to_int, int_to_vocab)

In [11]:
import re

def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    dict = {
        #"." : "<PERIOD>",
        "," : "<COMMA>",
        '"' : "<QUOTATION_MARK>",
        ";" : "<SEMICOLON>",
        "!" : "<EXCLAMATION_MARK>",
        "?" : "<QUESTION_MARK>",
        "(" : "<LEFT_PAREN>",
        ")" : "<RIGHT_PAREN>",
        "-" : "<DASH>",
        "\n" : "<RETURN>"
    }

    return dict

In [12]:
data_dir = MAIN_DIR+'data_full/beauty_title.txt'

# pre-process training data
preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

## Checkpoint

In [13]:
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

In [14]:
len(vocab_to_int)

30977

## Vectorize title

In [15]:
#Set equal length (around 3/4 between avg length and max length)
pad = 20

In [16]:
#Number of words in title
max_words = 0
min_words = 100
num_words = 0
for sentence in df2.title:
    temp = len(sentence.split())
    if temp > max_words:
        max_words = temp
    if temp < min_words:
        min_words = temp
    num_words += temp
avg = num_words/len(df2.title)
print("max",max_words,"min",min_words,"avg",avg)

max 29 min 1 avg 8.121754905068865


In [17]:
import numpy as np
arr=[]
for sentence in df2.title:
    words = sentence.split()
    #print(sentence)
    temp= [vocab_to_int[word] for word in words]
    if pad > len(temp):
        padding = np.zeros(pad-len(temp)).astype(int)
        temp.extend(padding)
    arr.extend(temp[:pad])
    #print(len(arr))
arr[:10]

[44, 8382, 3002, 541, 14, 121, 0, 0, 0, 0]

In [18]:
len(arr)

2271120

In [19]:
len(arr) == pad*len(df2.title)

True

# Dataset and Batching

In [20]:
import torch
from torch.utils.data import Dataset

class Dataset(Dataset):

    def __init__(self, dataframe, arr, pad):
        
        self.itemid = dataframe.iloc[:,0]
        self.labels = dataframe.iloc[:,-1]
        self.data = arr
        self.pad = pad        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, idx):
        'Generates one sample of data'
        X = self.data[idx]
        y = self.labels[idx]
        itemid = self.itemid[idx]

        return X, y, itemid

dataset = Dataset(df2, arr, pad)

In [21]:
len(dataset)

2271120

In [22]:
batch_size = 50

In [23]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size,target):
    """
    Batch the neural network data using DataLoader
    :param words: The word ids of the TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: DataLoader with batched data
    """
    # TODO: Implement function
    n_batches = len(words)//(batch_size*sequence_length)
    words = words[:n_batches*(batch_size*sequence_length)] #cut short words to ensure full batches only
    feature_tensors, target_tensors = [], []
    
    print("No. of batches: ",n_batches)
    print("Batch size: ",batch_size)
    
    i=0
    for idx in range(0, len(words), pad):     
        batch_x = words[idx:idx+sequence_length]
        #batch_y = words[idx+sequence_length]
        feature_tensors.append(batch_x)
        target_tensors.append(target[i])
        i+=1
    
    feature_tensors = np.asarray(feature_tensors)
    target_tensors = np.asarray(target_tensors)
    print(feature_tensors.shape)
    print(target_tensors.shape)
    
    data = TensorDataset(torch.LongTensor(feature_tensors), torch.LongTensor(target_tensors))
    #data_loader = DataLoader(data, batch_size=batch_size,shuffle=True)
    
    # return a dataloader
    return data

In [24]:
target = df2.iloc[:,-1]
dataset = batch_data(arr, pad, batch_size,target)

No. of batches:  2271
Batch size:  50
(113550, 20)
(113550,)


In [25]:
len(target)

113556

# DataLoader

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [26]:
validation_split = 0.2
shuffle_dataset = True
random_seed= 42
num_workers = 0

from torch.utils.data.sampler import SubsetRandomSampler

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, test_indices = indices[split:], indices[:split]
train_indices = train_indices[:len(train_indices)-len(train_indices)%batch_size]
test_indices = test_indices[:len(test_indices)-len(test_indices)%batch_size]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,num_workers=num_workers, sampler=test_sampler)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)

In [27]:
len(test_indices)

22700

In [28]:
print(len(train_loader),len(test_loader),len(data_loader))

1816 454 2271


In [29]:
len(dataset)/batch_size*validation_split

454.20000000000005

In [30]:
#test that it is loaded correctly
dataiter = iter(test_loader)
X,y= dataiter.next()
X

tensor([[   15,    22,   124,    33,     4,     2,   270,    10,    56,    25,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  232,   661,    29,   435,   122,   862,    13,   149,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  380,  1131,  1096,     3,  2348,    10,  1854,   691,  1231,   819,
           823,   205,    80,   178,     0,     0,     0,     0,     0,     0],
        [   70,    42,    11,   175,     4,     2,    10,    79,   358,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [   11,   177,     2,    10,    79,   137,   131,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [   44,   196,     8,   412,   494,   502,     3,     9,     7,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  565,   294,     6,    74,    76,   

# Model

In [31]:
import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.3):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of embeddings, should you choose to use them        
        :param hidden_dim: The size of the hidden layer outputs
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(RNN, self).__init__()
        # TODO: Implement function
        
        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # define model layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
    
    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network
        :param hidden: The hidden state        
        :return: Two Tensors, the output of the neural network and the latest hidden state
        """
        # TODO: Implement function   
        #print('nn_input',nn_input)
        batch_size = nn_input.size(0)
        embeds = self.embedding(nn_input)
        #print('hidden',hidden)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function - cross entropy loss used, maybe dont need sigmoid function
        # sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = out.view(batch_size, -1, self.output_size)
        sig_out = sig_out[:, -1] # get last batch of labels
      
        # return one batch of output word scores and the hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU
        :param batch_size: The batch_size of the hidden state
        :return: hidden state of dims (n_layers, batch_size, hidden_dim)
        '''
        # Implement function
        
        # initialize hidden state with zero weights, and move to GPU if available
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param decoder: The PyTorch Module that holds the neural network
    :param decoder_optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :return: The loss and the latest hidden state Tensor
    """
    
    # TODO: Implement Function
    
    # move data to GPU, if available
    if(train_on_gpu):
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    rnn.train()
    
    # perform backpropagation and optimization
    rnn.zero_grad()
    
    h = tuple([each.data for each in hidden])
    
    #print('inp ',inp.shape)
    #print('h ',h)
    output, h = rnn(inp, h)
    
    loss = criterion(output, target)
    #loss = criterion(output.squeeze(), target)
    #print("output",output.size())
    #print("target",target.size())
    loss.backward()
    
    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    
    optimizer.step()    
    
    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), h


# Train

In [32]:
import torch

# Check for a GPU
#train_on_gpu=False
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')

In [33]:
num_epochs = 5
learning_rate = 0.001
vocab_size = len(vocab_to_int)
output_size = len(classes)
embedding_dim = len(vocab_to_int)//10
hidden_dim = 256
n_layers = 3

# Show stats for every n number of batches
show_every_n_batches = 500

In [34]:
len(train_loader.dataset)

113550

In [35]:
import time

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)
        start = time.time()
        for batch_i, (inputs, labels) in enumerate(train_loader):
            
            # make sure you iterate over completely full batches, only
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break
            
            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Batch: {}  Loss: {}'.format(
                    epoch_i, n_epochs, batch_i, np.average(batch_losses)))
                batch_losses = []
            
        end = time.time()
        print("Batch: ",batch_i, "Time taken:", end - start)
            
    # returns a trained rnn
    return rnn

In [44]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""

# create model and move to gpu if available
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)

if train_on_gpu:
    print('training on gpu')
    rnn.cuda()
    #rnn.cpu()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
save_model('./save/trained_rnn_'+dataType, trained_rnn)
print('Model Trained and Saved')

training on gpu
Training for 5 epoch(s)...
Epoch:    1/5     Batch: 0  Loss: 1.945650577545166
Epoch:    1/5     Batch: 200  Loss: 1.1336316707730294
Epoch:    1/5     Batch: 400  Loss: 0.699049841761589
Epoch:    1/5     Batch: 600  Loss: 0.5819503055512905
Epoch:    1/5     Batch: 800  Loss: 0.5377893857657909
Epoch:    1/5     Batch: 1000  Loss: 0.4789429884403944
Epoch:    1/5     Batch: 1200  Loss: 0.4810584168881178
Epoch:    1/5     Batch: 1400  Loss: 0.4440986513346434
Epoch:    1/5     Batch: 1600  Loss: 0.437048032656312
Epoch:    1/5     Batch: 1800  Loss: 0.4450546015053988
Batch:  1815 Time taken: 125.61521244049072
Epoch:    2/5     Batch: 0  Loss: 0.43425910733640194
Epoch:    2/5     Batch: 200  Loss: 0.36346681989729407
Epoch:    2/5     Batch: 400  Loss: 0.3696157050132751
Epoch:    2/5     Batch: 600  Loss: 0.3851727339625359
Epoch:    2/5     Batch: 800  Loss: 0.37698832374066116
Epoch:    2/5     Batch: 1000  Loss: 0.38326532069593666
Epoch:    2/5     Batch: 1200 

  "type " + obj.__name__ + ". It won't be checked "


Model Trained and Saved


## Checkpoint

In [36]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import torch
import os

_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

filename = './save/trained_rnn_'+dataType
save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'

rnn = torch.load(save_filename, map_location=lambda storage, loc: storage)
criterion = nn.CrossEntropyLoss()

#helper.load_model('./save/trained_rnn2')

# Test

In [37]:
#torch.cuda.empty_cache()
show_every=10

In [44]:
import time
import torch.nn.functional as F

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""

def test_rnn(test_loader,rnn,criterion,batch_size):

    if train_on_gpu:
        print('training on gpu')
        rnn.cuda()

    rnn.eval()
    results=[]
    correct=[]
    end = time.time()
    #start = time.time()
    for batch_i, (inputs, labels) in enumerate(test_loader):
        
        total_batches = len(test_loader)
        
        hidden = rnn.init_hidden(batch_size)
        output,_ = rnn(inputs,hidden)
        loss = criterion(output, labels)
        
        # get the next word probabilities
        p = F.softmax(output, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        #_, indices = p.max(1)
        
        top_k = 3
        p, top_i = p.topk(top_k)
        top_i = top_i.numpy().squeeze()
        
        temp = top_i[:,0]==y.numpy().squeeze()
        
        if batch_i == 0:
            results = top_i
            correct = temp
        else:
            results = np.vstack((results,top_i))
            correct = np.vstack((correct,temp))
        
        acc = correct.sum()/len(correct)/batch_size*100
        
        if batch_i % show_every == 0:
                print('Batch: {:>4}/{:<4}  Batch: {}  Accuracy: {}'.format(
                    batch_i, total_batches, batch_i, acc))
        
    
    # returns a trained rnn
    return {'results': results, 'num_correct': correct, 'loss': loss, 'accuracy': acc}

In [42]:
len(test_loader)

454

In [45]:
#test that the function is working
dataiter = iter(test_loader)
X,y= dataiter.next()
hidden = rnn.init_hidden(batch_size)
output,_=rnn(X,hidden)
p = F.softmax(output, dim=1).data
_, top_i = p.topk(3)
top_i = top_i.numpy().squeeze()
#print(top_i,labels)
top_i[0:5,0]==y.numpy().squeeze()[0:5]

RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cpu and hidden tensor at cuda:0

In [None]:
test = test_rnn(test_loader,rnn,criterion,batch_size)

In [168]:
#accuracy
test['num_correct'].sum()/len(correct)/batch_size*100

24.110132158590307

In [182]:
test['results'][:,1]==

array([[4, 1, 3],
       [6, 1, 3],
       [4, 3, 1],
       ...,
       [4, 1, 3],
       [3, 4, 1],
       [4, 1, 3]], dtype=int64)

# Results

    num_epochs = 5
    learning_rate = 0.001
    vocab_size = len(vocab_to_int)
    output_size = len(classes)
    embedding_dim = len(vocab_to_int)//10
    hidden_dim = 256
    n_layers = 3
    
    Training for 5 epoch(s)...
    Epoch:    1/5     Batch: 0  Loss: 1.945650577545166
    Epoch:    1/5     Batch: 200  Loss: 1.1336316707730294
    Epoch:    1/5     Batch: 400  Loss: 0.699049841761589
    Epoch:    1/5     Batch: 600  Loss: 0.5819503055512905
    Epoch:    1/5     Batch: 800  Loss: 0.5377893857657909
    Epoch:    1/5     Batch: 1000  Loss: 0.4789429884403944
    Epoch:    1/5     Batch: 1200  Loss: 0.4810584168881178
    Epoch:    1/5     Batch: 1400  Loss: 0.4440986513346434
    Epoch:    1/5     Batch: 1600  Loss: 0.437048032656312
    Epoch:    1/5     Batch: 1800  Loss: 0.4450546015053988
    Batch:  1815 Time taken: 125.61521244049072
    Epoch:    2/5     Batch: 0  Loss: 0.43425910733640194
    Epoch:    2/5     Batch: 200  Loss: 0.36346681989729407
    Epoch:    2/5     Batch: 400  Loss: 0.3696157050132751
    Epoch:    2/5     Batch: 600  Loss: 0.3851727339625359
    Epoch:    2/5     Batch: 800  Loss: 0.37698832374066116
    Epoch:    2/5     Batch: 1000  Loss: 0.38326532069593666
    Epoch:    2/5     Batch: 1200  Loss: 0.3759563161060214
    Epoch:    2/5     Batch: 1400  Loss: 0.36842311747372153
    Epoch:    2/5     Batch: 1600  Loss: 0.3625161216035485
    Epoch:    2/5     Batch: 1800  Loss: 0.3810826962813735
    Batch:  1815 Time taken: 124.71759033203125
    Epoch:    3/5     Batch: 0  Loss: 0.37221965892240405
    Epoch:    3/5     Batch: 200  Loss: 0.30008154425770045
    Epoch:    3/5     Batch: 400  Loss: 0.3219192175939679
    Epoch:    3/5     Batch: 600  Loss: 0.32561181324534116
    Epoch:    3/5     Batch: 800  Loss: 0.32122907355427743
    Epoch:    3/5     Batch: 1000  Loss: 0.3211130641773343
    Epoch:    3/5     Batch: 1200  Loss: 0.3305688297003508
    Epoch:    3/5     Batch: 1400  Loss: 0.3334200155735016
    Epoch:    3/5     Batch: 1600  Loss: 0.3218043975159526
    Epoch:    3/5     Batch: 1800  Loss: 0.33003114476799966
    Batch:  1815 Time taken: 126.0070869922638
    Epoch:    4/5     Batch: 0  Loss: 0.31227382412180305
    Epoch:    4/5     Batch: 200  Loss: 0.285961125921458
    Epoch:    4/5     Batch: 400  Loss: 0.2930843724682927
    Epoch:    4/5     Batch: 600  Loss: 0.29513030499219894
    Epoch:    4/5     Batch: 800  Loss: 0.2962474745884538
    Epoch:    4/5     Batch: 1000  Loss: 0.2932254296541214
    Epoch:    4/5     Batch: 1200  Loss: 0.3029736075922847
    Epoch:    4/5     Batch: 1400  Loss: 0.29722723722457883
    Epoch:    4/5     Batch: 1600  Loss: 0.29825182490050794
    Epoch:    4/5     Batch: 1800  Loss: 0.30689597155898807
    Batch:  1815 Time taken: 126.30501437187195
    Epoch:    5/5     Batch: 0  Loss: 0.35189917031675577
    Epoch:    5/5     Batch: 200  Loss: 0.28429420363157987
    Epoch:    5/5     Batch: 400  Loss: 0.2752413671463728
    Epoch:    5/5     Batch: 600  Loss: 0.27909615349024536
    Epoch:    5/5     Batch: 800  Loss: 0.2784998277574778
    Epoch:    5/5     Batch: 1000  Loss: 0.2707124951481819
    Epoch:    5/5     Batch: 1200  Loss: 0.279227427393198
    Epoch:    5/5     Batch: 1400  Loss: 0.28303340304642916
    Epoch:    5/5     Batch: 1600  Loss: 0.2837376609072089
    Epoch:    5/5     Batch: 1800  Loss: 0.27834274955093863
    Batch:  1815 Time taken: 125.30132818222046

    num_epochs = 3
    learning_rate = 0.001
    vocab_size = len(vocab_to_int)
    output_size = len(classes)
    embedding_dim = len(vocab_to_int)//10
    hidden_dim = 256
    n_layers = 1

accuracy = 24.110132158590307

    training on gpu
    Training for 3 epoch(s)...
    Epoch:    1/3     Batch: 0  Loss: 1.948210597038269
    Epoch:    1/3     Batch: 100  Loss: 1.2629362106323243
    Epoch:    1/3     Batch: 200  Loss: 0.9147671985626221
    Epoch:    1/3     Batch: 300  Loss: 0.7619446340203285
    Epoch:    1/3     Batch: 400  Loss: 0.6443701910972596
    Epoch:    1/3     Batch: 500  Loss: 0.5483935502171516
    Epoch:    1/3     Batch: 600  Loss: 0.5587578392028809
    Epoch:    1/3     Batch: 700  Loss: 0.5181590831279754
    Epoch:    1/3     Batch: 800  Loss: 0.5057365174591542
    Epoch:    1/3     Batch: 900  Loss: 0.4920829881727695
    Epoch:    1/3     Batch: 1000  Loss: 0.4556227692961693
    Epoch:    1/3     Batch: 1100  Loss: 0.47841481506824496
    Epoch:    1/3     Batch: 1200  Loss: 0.44066833704710007
    Epoch:    1/3     Batch: 1300  Loss: 0.45724679514765737
    Epoch:    1/3     Batch: 1400  Loss: 0.43142399355769157
    Epoch:    1/3     Batch: 1500  Loss: 0.4343838082253933
    Epoch:    1/3     Batch: 1600  Loss: 0.4362485967576504
    Epoch:    1/3     Batch: 1700  Loss: 0.43836094304919243
    Epoch:    1/3     Batch: 1800  Loss: 0.41377399206161497
    Batch:  1815 Time taken: 115.52667593955994
    Epoch:    2/3     Batch: 0  Loss: 0.37961600814014673
    Epoch:    2/3     Batch: 100  Loss: 0.37738112553954123
    Epoch:    2/3     Batch: 200  Loss: 0.35926029592752456
    Epoch:    2/3     Batch: 300  Loss: 0.3498434928059578
    Epoch:    2/3     Batch: 400  Loss: 0.34742066502571106
    Epoch:    2/3     Batch: 500  Loss: 0.3530371794104576
    Epoch:    2/3     Batch: 600  Loss: 0.37153015077114104
    Epoch:    2/3     Batch: 700  Loss: 0.34142743326723574
    Epoch:    2/3     Batch: 800  Loss: 0.35123591259121895
    Epoch:    2/3     Batch: 900  Loss: 0.35717400774359703
    Epoch:    2/3     Batch: 1000  Loss: 0.3624216615408659
    Epoch:    2/3     Batch: 1100  Loss: 0.3542009150981903
    Epoch:    2/3     Batch: 1200  Loss: 0.3638387350738049
    Epoch:    2/3     Batch: 1300  Loss: 0.35564811766147614
    Epoch:    2/3     Batch: 1400  Loss: 0.3456290701031685
    Epoch:    2/3     Batch: 1500  Loss: 0.35080010920763016
    Epoch:    2/3     Batch: 1600  Loss: 0.36891542047262194
    Epoch:    2/3     Batch: 1700  Loss: 0.35634012162685397
    Epoch:    2/3     Batch: 1800  Loss: 0.3570907928049564
    Batch:  1815 Time taken: 116.11233305931091
    Epoch:    3/3     Batch: 0  Loss: 0.4040226051583886
    Epoch:    3/3     Batch: 100  Loss: 0.2952263676375151
    Epoch:    3/3     Batch: 200  Loss: 0.30173510633409023
    Epoch:    3/3     Batch: 300  Loss: 0.30685905784368517
    Epoch:    3/3     Batch: 400  Loss: 0.3028623998910189
    Epoch:    3/3     Batch: 500  Loss: 0.30551651425659654
    Epoch:    3/3     Batch: 600  Loss: 0.3014995303750038
    Epoch:    3/3     Batch: 700  Loss: 0.322334857955575
    Epoch:    3/3     Batch: 800  Loss: 0.31686400577425955
    Epoch:    3/3     Batch: 900  Loss: 0.3158319129794836
    Epoch:    3/3     Batch: 1000  Loss: 0.3012016559392214
    Epoch:    3/3     Batch: 1100  Loss: 0.31715072341263295
    Epoch:    3/3     Batch: 1200  Loss: 0.31085215888917445
    Epoch:    3/3     Batch: 1300  Loss: 0.2972731766849756
    Epoch:    3/3     Batch: 1400  Loss: 0.30479449197649955
    Epoch:    3/3     Batch: 1500  Loss: 0.30092962466180323
    Epoch:    3/3     Batch: 1600  Loss: 0.31835667058825495
    Epoch:    3/3     Batch: 1700  Loss: 0.32387007847428323
    Epoch:    3/3     Batch: 1800  Loss: 0.32105328977108
    Batch:  1815 Time taken: 115.94001650810242

In [None]:
import re


def clean_str(string):
  """
  Tokenization/string cleaning for all datasets except for SST.
  """
  string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
  string = re.sub(r"\'s", " \'s", string)
  string = re.sub(r"\'ve", " \'ve", string)
  string = re.sub(r"n\'t", " n\'t", string)
  string = re.sub(r"\'re", " \'re", string)
  string = re.sub(r"\'d", " \'d", string)
  string = re.sub(r"\'ll", " \'ll", string)
  string = re.sub(r",", " , ", string)
  string = re.sub(r"!", " ! ", string)
  string = re.sub(r"\(", " ( ", string)
  string = re.sub(r"\)", " ) ", string)
  string = re.sub(r"\?", " ? ", string)
  string = re.sub(r"\s{2,}", " ", string)
  return string.lower().strip().split()


def clean_str_sst(string):
  """
  Tokenization/string cleaning for the SST dataset
  """
  string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
  string = re.sub(r"\s{2,}", " ", string)
  return string.lower().strip().split()