In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from lstm_classifier import LSTMClassifier
from torch.utils.data import DataLoader, TensorDataset
from custom_dataset import CustomDataset
import argparse
from string import punctuation




In [2]:
'''Funciones'''
def get_data_splitted(data):
    """Read a labelled data file and split it into token lists and labels.

    Every line of the file is handed to ``get_sentence_splitted`` (which
    yields the tokens after the first one) and to ``get_label`` (which yields
    the label taken from the first token).

    Args:
        data: Path of the text file to read, one instance per line.

    Returns:
        A tuple ``(instances, labels)`` of two parallel lists: the token list
        of each line and the label of each line, in file order.
    """
    instances, labels = [], []
    with open(data, 'r') as handle:
        for raw_line in handle:
            instances.append(get_sentence_splitted(raw_line))
            labels.append(get_label(raw_line))
    return instances, labels

def get_label(line):
    """Return the label carried by the first whitespace-separated token.

    Every occurrence of the literal ``__label__`` is stripped from that token.

    Args:
        line: One raw line of the data file.

    Returns:
        The first token with ``__label__`` removed.

    Raises:
        IndexError: If the line contains no tokens at all.
    """
    # Only the first token matters, so a single split is enough.
    first_token = line.split(None, 1)[0]
    return first_token.replace('__label__', '')

def get_sentence_splitted(line):
    """Return the tokens of a line, excluding the first one.

    Args:
        line: One raw line of the data file; it is split on whitespace.

    Returns:
        A new list with every token after the first. Empty when the line has
        zero or one token.
    """
    return line.split()[1:]

def add_words_to_map(sentences, word_to_ix):
    """Register every unseen word of the given sentences in the word index.

    Each sentence is split on whitespace; a word that is not yet a key of
    ``word_to_ix`` is assigned the current size of the mapping as its index.

    Args:
        sentences: Iterable of strings.
        word_to_ix: Dict mapping word -> integer index. It is updated in place.

    Returns:
        The same ``word_to_ix`` object that was passed in.
    """
    for text in sentences:
        for token in text.split():
            # len() is evaluated before insertion, so a new word receives the
            # next free index; an existing word keeps the index it already has.
            word_to_ix.setdefault(token, len(word_to_ix))
    return word_to_ix

def create_map(txt_list):
    """Build a word -> index mapping from a list of text files.

    Each file is read line by line and its lines are passed to
    ``add_words_to_map``, which extends the mapping with the unseen words.

    Args:
        txt_list: Iterable of file paths to read.

    Returns:
        The dict produced after all files have been processed.
    """
    vocabulary = {}
    for path in txt_list:
        with open(path, 'r') as infile:
            lines = infile.readlines()
            vocabulary = add_words_to_map(lines, vocabulary)
    return vocabulary

def make_bow_vector(sentence, word_to_ix):
    """Build a bag-of-words count vector for a sentence.

    Args:
        sentence: String; it is split on whitespace.
        word_to_ix: Dict mapping word -> position in the vector.

    Returns:
        A tensor of shape ``(1, len(word_to_ix))`` whose entry at a word's
        position is the number of times that word occurs in the sentence.

    Raises:
        KeyError: If the sentence contains a word missing from ``word_to_ix``.
    """
    counts = torch.zeros(len(word_to_ix))  # one slot per known word
    for position in (word_to_ix[token] for token in sentence.split()):
        counts[position] += 1
    # Reshape to a single row; the column count is inferred.
    return counts.view(1, -1)

def make_target(label, label_to_ix):
    """Look up the value stored for a label in the label mapping.

    The value is returned exactly as stored in ``label_to_ix``; this function
    does not wrap it in a tensor.

    Args:
        label: Key to look up.
        label_to_ix: Dict mapping label -> index.

    Returns:
        ``label_to_ix[label]``.

    Raises:
        KeyError: If ``label`` is not a key of ``label_to_ix``.
    """
    target = label_to_ix[label]
    return target

def get_label_by_item(item, mapping=None):
    """Return the label whose index equals ``item`` (reverse lookup).

    Args:
        item: Index value to search for.
        mapping: Optional dict mapping label -> index. When omitted, the
            notebook-level ``label_to_ix`` is used, which keeps existing
            one-argument calls working. Passing it explicitly removes the
            dependency on a global that is only defined in a later cell.

    Returns:
        The first label (in dict order) whose value equals ``item``, or
        ``None`` when no label matches.
    """
    if mapping is None:
        # Backward-compatible default: resolve the global at call time.
        mapping = label_to_ix
    for label, value in mapping.items():
        if value == item:
            return label
    return None

def calculate_error_rate(predicted, label_batch):
    """Return the fraction of positions where prediction and label agree.

    Despite the name, the value computed is matches / total (i.e. an
    accuracy), not an error rate.

    Args:
        predicted: Iterable of elements exposing ``.item()``.
        label_batch: Iterable of elements exposing ``.item()``; paired with
            ``predicted`` position by position (the shorter one bounds the
            comparison).

    Returns:
        ``matches / total`` as a float.

    Raises:
        ZeroDivisionError: If there is nothing to compare.
    """
    pairs = list(zip(predicted, label_batch))
    hits = sum(1 for guess, truth in pairs if guess.item() == truth.item())
    return hits / len(pairs)

def define_batch_size(batch_size, file):
    """Shrink ``batch_size`` until it evenly divides the file's line count.

    Starting from ``batch_size``, the value is decremented by one until the
    number of lines in ``file`` is a multiple of it.

    Args:
        batch_size: Starting batch size (assumed to be a positive integer;
            the code does not validate it).
        file: Path of the text file whose lines are counted.

    Returns:
        The first value, counting down from ``batch_size``, that divides the
        line count with no remainder.
    """
    with open(file, 'r') as infile:
        size = sum(1 for _ in infile)
    while size % batch_size != 0:
        batch_size -= 1
    return batch_size

def prepare_sequence(seq, to_ix):
    """Translate a sequence of keys into their mapped values.

    Args:
        seq: Iterable of keys (e.g. the tokens of a sentence).
        to_ix: Dict mapping key -> index.

    Returns:
        A list with ``to_ix[key]`` for every key of ``seq``, in order.

    Raises:
        KeyError: If a key of ``seq`` is missing from ``to_ix``.
    """
    return [to_ix[token] for token in seq]

def get_result_label(result, label_to_ix):
    """Return the ``__label__``-prefixed label whose index equals ``result``.

    Args:
        result: Index value to search for.
        label_to_ix: Dict mapping label -> index.

    Returns:
        ``"__label__" + label`` for the first label (in dict order) whose
        value equals ``result``, or ``None`` when nothing matches.
    """
    matches = (
        "__label__" + label
        for label, number in label_to_ix.items()
        if result == number
    )
    return next(matches, None)

def get_tensor_data(data_inst, data_lab, word_to_ix, label_to_ix, use_labels=True):
    """Convert token lists and labels into index lists.

    Instances and labels are walked in parallel. Each instance is encoded
    with ``prepare_sequence``; each label is encoded with ``make_target``
    when ``use_labels`` is true, otherwise the placeholder ``0`` is stored.

    Args:
        data_inst: Iterable of token sequences.
        data_lab: Iterable of labels, parallel to ``data_inst``.
        word_to_ix: Dict mapping word -> index.
        label_to_ix: Dict mapping label -> index.
        use_labels: Whether to encode the real labels (default ``True``).

    Returns:
        A tuple ``(instances, labels)`` of two parallel Python lists. Despite
        the function name, no tensors are created here.
    """
    encoded, targets = [], []
    for tokens, tag in zip(data_inst, data_lab):
        encoded.append(prepare_sequence(tokens, word_to_ix))
        targets.append(make_target(tag, label_to_ix) if use_labels else 0)
    return encoded, targets

In [4]:
# Helpers used by the collate_fn that batches CustomDataset samples
def get_max_length(x):
    """Return the length of the longest element of ``x``.

    Args:
        x: Non-empty iterable of sized objects.

    Returns:
        The largest ``len(element)`` found.

    Raises:
        ValueError: If ``x`` is empty.
    """
    return max(len(item) for item in x)

def pad_sequence(seq):
    """Left-pad every list in ``seq`` with zeros up to the longest length.

    Args:
        seq: A sequence (list or tuple) of lists. It is traversed twice, so
            a one-shot iterator is not supported.

    Returns:
        A new list of new lists, all of the same length; shorter inputs get
        leading ``0`` entries. An empty ``seq`` yields ``[]``.
    """
    # The target length is computed once; it was previously re-derived for
    # every element, which made padding quadratic in the batch size.
    target_len = max((len(item) for item in seq), default=0)
    # NOTE(review): 0 is also the index the vocabulary builder assigns to the
    # first word it sees, so padding is indistinguishable from that word.
    # Confirm whether a dedicated pad index should be reserved.
    return [[0] * (target_len - len(item)) + item for item in seq]

def custom_collate(batch):
    """Collate a batch of sample tuples into one tensor per field.

    The batch is transposed so that each field of the samples becomes one
    column, and the type of the column's first value decides the conversion:

    * ``int``   -> ``torch.LongTensor`` of the column
    * ``float`` -> ``torch.DoubleTensor`` of the column
    * ``list``  -> ``torch.LongTensor`` of the column after ``pad_sequence``

    Columns whose first value has any other type are omitted from the result.

    Args:
        batch: Iterable of equally-shaped sample tuples.

    Returns:
        A list of tensors, one per converted column, in field order.
    """
    collated = []
    for column in zip(*batch):
        head = column[0]
        if isinstance(head, int):
            tensor = torch.LongTensor(column)
        elif isinstance(head, float):
            tensor = torch.DoubleTensor(column)
        elif isinstance(head, list):
            tensor = torch.LongTensor(pad_sequence(column))
        else:
            # Unsupported field type: leave this column out of the result.
            continue
        collated.append(tensor)
    return collated

#### hyperparams

In [5]:
# Tunable settings for the run. Everything below is read by later cells.
LOGGING = False  # not referenced by any cell visible in this notebook section
SHUFFLE = True # passed to the DataLoader: reshuffle the training set before each epoch
# DATA = 30000 # TODO: this datasize is hardcoded

EMBEDDING_DIM = 100  # passed to LSTMClassifier
HIDDEN_DIM = 50  # passed to LSTMClassifier
BATCH_SIZE = 50  # passed to both LSTMClassifier and the DataLoader
EPOCH_SIZE = 15  # number of passes over the training set (range(EPOCH_SIZE))
CLIP = 5 # max gradient norm given to clip_grad_norm_, to avoid exploding gradients
# NOTE(review): used as the SGD step size; 1 looks large for plain SGD, and the
# recorded run shows last-batch accuracy going 0.68 -> 0.66 -> 0.48 — confirm.
LEARNING_RATE = 1
DROPOUT=0.5  # passed to LSTMClassifier
NUM_LAYERS=2  # passed to LSTMClassifier


#### file paths

In [6]:
# Sentence files: all three are fed to create_map to build the vocabulary.
DEV_SENTENCES = "./dev_sentences.txt"
TRAIN_SENTENCES = "./train_sentences.txt"
TEST_SENTENCES = "./test_sentences.txt"

# Labelled data files. TRAIN_DATA is parsed by get_data_splitted for training;
# DEV_DATA is not read by any cell visible in this notebook section.
DEV_DATA = "./dev_data.txt"
TRAIN_DATA = "./train_data.txt"

#### model

In [7]:
''' Creación del modelo '''
# Define the label set and build the vocabulary; their sizes are the number
# of words and the number of labels handed to the model.
label_to_ix = { "neutral": 0, "contradiction": 1, "entailment": 2 }
# The vocabulary covers the dev, train and test sentence files.
word_to_ix = create_map([DEV_SENTENCES, TRAIN_SENTENCES, TEST_SENTENCES])
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)

# Create the model, the loss function and the optimizer.
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_LABELS, BATCH_SIZE, DROPOUT, NUM_LAYERS)
# NOTE(review): NLLLoss expects log-probabilities; presumably the model's
# forward ends in log_softmax — confirm in lstm_classifier.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

#### training

In [8]:
'''Entrenamiento'''
# Load and encode the training data.
# Usually you want to pass over the training data several times; somewhere
# between 5 and 30 epochs is reasonable (note: each pass takes a few minutes).
# First call: token lists and label strings parsed from TRAIN_DATA.
instances, labels = get_data_splitted(TRAIN_DATA)
# Second call rebinds the same names to lists of word indices / label indices,
# so re-running only this line on its own would fail (the inputs are gone).
instances, labels = get_tensor_data(instances, labels, word_to_ix, label_to_ix)

# instances = instances[0:DATA]
# labels = labels[0:DATA]

In [9]:
# Wrap the encoded instances/labels in the project dataset and batch them.
tensor_data = CustomDataset(instances, labels)
# custom_collate pads and tensorizes each batch; drop_last=True discards a
# final batch smaller than BATCH_SIZE.
train_loader = DataLoader(dataset=tensor_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=custom_collate, drop_last=True) #TODO shuffle?

# Pull one batch; `sentences` is the example input for add_graph below.
# NOTE: this rebinds `labels` from the full label list to one batch of labels.
sentences, labels = next(iter(train_loader))

In [None]:
# Send the model graph to TensorBoard, traced with the sample batch.
with SummaryWriter("./hello_tf_board/") as writer:
    writer.add_graph(model, sentences.transpose(0,1), True) # the transpose swaps the first two dims so they match what the model expects (catofthecannals)

# Train for EPOCH_SIZE epochs; after each epoch, log the last batch's loss and
# accuracy to TensorBoard and print the accuracy.
error_rates_per_epoch = []  # never appended to in this cell
for epoch in range(EPOCH_SIZE):
    
    print("EPOCH {} STARTED".format(epoch))

    # NOTE(review): error_rates_per_batch and i are never used below, and
    # running_loss is overwritten (not accumulated) after the batch loop.
    error_rates_per_batch = []
    i = 0
    running_loss = 0.0
    
    for instance_batch, label_batch in train_loader:
        # Step 1. PyTorch accumulates gradients. We need to clear them out
        # before each batch, and reset the model's hidden state.
        model.zero_grad()
        model.hidden = model.init_hidden()
        # Same transpose as for add_graph above.
        instance_batch = instance_batch.transpose(0,1)

        # Step 2. Run our forward pass
        log_probs = model(instance_batch)

        # Step 3. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(log_probs, label_batch) # loss for this batch
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP) # Gradient clip to avoid exploding gradients
        optimizer.step()
        
    # At the end of each epoch, log the loss and the accuracy of the LAST batch
    # only (loss, log_probs and label_batch still hold the final iteration's values).
    running_loss = loss.item()
    # Index of the highest log-probability along dim 1 = predicted class.
    _, predicted = torch.max(log_probs, 1)
    # Despite its name, calculate_error_rate returns matches / total, so
    # `error_rate` holds an accuracy; it is logged and printed as such.
    error_rate = calculate_error_rate(predicted, label_batch)

    # A new writer is opened on the same log directory every epoch.
    with SummaryWriter("./hello_tf_board/") as writer:
        writer.add_scalar('accuracy', error_rate, epoch)
        writer.add_scalar('loss function', running_loss, epoch)
    
    print("EPOCH {} ENDED:  accuracy: {}".format(epoch, error_rate))



  embeds = embeds.view(len(sentence), self.batch_size, -1)


graph(%input.1 : Long(16!, 50!),
      %1 : Float(31997, 100),
      %2 : Float(200, 100),
      %3 : Float(200, 50),
      %4 : Float(200),
      %5 : Float(200),
      %6 : Float(200, 100),
      %7 : Float(200, 50),
      %8 : Float(200),
      %9 : Float(200),
      %10 : Float(200, 100),
      %11 : Float(200, 50),
      %12 : Float(200),
      %13 : Float(200),
      %14 : Float(200, 100),
      %15 : Float(200, 50),
      %16 : Float(200),
      %17 : Float(200),
      %18 : Float(3, 100),
      %19 : Float(3)):
  %20 : Float(16, 50, 100) = onnx::Gather(%1, %input.1), scope: LSTMClassifier/Embedding[word_embeddings]
  %21 : Tensor = onnx::Constant[value= 16  50  -1 [ Variable[CPUType]{3} ]](), scope: LSTMClassifier
  %22 : Float(16, 50, 100) = onnx::Reshape(%20, %21), scope: LSTMClassifier
  %23 : Tensor = onnx::Constant[value=<Tensor>]()
  %24 : Tensor = onnx::Constant[value=<Tensor>]()
  %25 : Tensor? = prim::Constant(), scope: LSTMClassifier/LSTM[lstm]
  %26 : Tensor = onnx::

EPOCH 0 ENDED:  accuracy: 0.68
EPOCH 1 STARTED
EPOCH 1 ENDED:  accuracy: 0.66
EPOCH 2 STARTED
EPOCH 2 ENDED:  accuracy: 0.48
EPOCH 3 STARTED
