<a href="https://colab.research.google.com/github/ronibarylko/meliXos/blob/master/src/train_model_with_tensorboard_use_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from torch.utils.data import DataLoader, Dataset, TensorDataset

import time
import argparse
from string import punctuation


In [0]:
'''Funciones'''
# Función que levanta el archivo data y lo transforma en una lista de (sentence, label)
def get_data_splitted(data):
    instances = []
    labels = []
    with open(data, 'r') as sentences:
        for line in sentences:
            instances.append(get_sentence_splitted(line))
            labels.append(get_label(line))
    return instances, labels

def get_label(line):
    return line.split()[0].replace('__label__', '')

def get_sentence_splitted(line):
    line_split = line.split();
    res = []
    for val in range(1, len(line_split)):
        res.append(line_split[val])
    return res;

### Función que toma un jsonl y agrega las palabras a mi mapa de word_to_integer
def add_words_to_map(sentences, word_to_ix):
    for sentence in sentences:
        for word in sentence.split():
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix

### Función que recibe la lista de archivos txt, convierte cada uno en una lista de oraciones de Python y se encarga de llamar a add_words_to_map
def create_map(txt_list):
    word_to_ix = {}
    for input_file in txt_list:
        with open(input_file, 'r') as infile:
            sentences = []
            for line in infile:
                sentences.append(line)
            word_to_ix = add_words_to_map(sentences, word_to_ix)
    return word_to_ix

# Función que crea un vector contando la cantidad de apariciones de las palabras en una oración.
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix)) # Vector de ceros
    for word in sentence.split():
        vec[word_to_ix[word]] += 1 # Por cada aparición de una palabra, le sumo uno
    return vec.view(1, -1) # Vector de tamaño 1 x n, donde n es inferido por el tamaño de palabras

# Función que wrappea la variable en un tensor. Básicamente, le pasas la lista de labels y tu label en particular, y te devuelve un tensor con el valor 0, 1 ó 2 adentro.
def make_target(label, label_to_ix):
    return label_to_ix[label]

def get_label_by_item(item):
    for label, value in label_to_ix.items():
        if(value == item):
            return label
    return None

def calculate_error_rate(predicted, label_batch):
    counter = 0
    ok = 0
    for instance,label in zip(predicted, label_batch):
        if(instance.item() == label.item()):
            ok += 1
        counter += 1

    return ok / counter

def define_batch_size(batch_size, file):
    size = 0
    with open(file, 'r') as infile:
        size = len(infile.readlines())
    while True:
        if(size % batch_size == 0):
            return batch_size
        batch_size = batch_size - 1

def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w: to_ix[w], seq))
    return idxs

def get_result_label(result, label_to_ix):
    for label, number in label_to_ix.items():    # for name, age in dictionary.iteritems():  (for Python 2.x)
        if result == number:
            return "__label__"+label

def get_tensor_data(data_inst, data_lab, word_to_ix, label_to_ix, use_labels=True):
    instances = []
    labels = []
    for instance, label in zip(data_inst, data_lab):
        instances.append(prepare_sequence(instance, word_to_ix))
        if(use_labels):
            labels.append(make_target(label, label_to_ix))
        else:
            labels.append(0)
    return instances, labels

In [0]:
# Funciones utilizadas para el collate_fn de CustomDataset
def get_max_length(x):
    return len(max(x, key=len))

def pad_sequence(seq):
    def _pad(_it, _max_len):
        return [0] * (_max_len - len(_it)) + _it
    return [_pad(it, get_max_length(seq)) for it in seq]

def custom_collate(batch):
    transposed = zip(*batch)
    lst = []
    for samples in transposed:
        if isinstance(samples[0], int):
            lst.append(torch.LongTensor(samples))
        elif isinstance(samples[0], float):
            lst.append(torch.DoubleTensor(samples))
        elif isinstance(samples[0], list):
            lst.append(torch.LongTensor(pad_sequence(samples)))
    return lst

In [0]:
class CustomDataset(Dataset):
    def __init__(self, instances, labels):
        self.instances = instances
        self.labels = labels

    def __getitem__(self, index):
        return (self.instances[index], self.labels[index])

    def __len__(self):
        return len(self.instances)

In [0]:
class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels, batch_size, dropout=0.5, num_layers=2):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = dropout
        self.num_layers = num_layers
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, bidirectional=True, dropout=dropout, num_layers=self.num_layers) #TODO aca le podemos meter layers, dropout, batch_first

        # The linear layer that maps from hidden state space to tag space
        self.hidden2label = nn.Linear(hidden_dim*2, num_labels)
        self.batch_size = batch_size
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2*self.num_layers, self.batch_size, self.hidden_dim)),
                autograd.Variable(torch.zeros(2*self.num_layers, self.batch_size, self.hidden_dim))) # La primera dimensión es 2 porque es bidireccional

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        embeds = embeds.view(len(sentence), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        label_space = self.hidden2label(lstm_out[-1])
        label_scores = F.log_softmax(label_space, dim=1) # Softmax es la única con probabilidad. No tendría sentido algo como ReLu o Sigmoid si tenemos 3 posibilidades
        return label_scores

#### hyperparams

In [0]:
LOGGING = False
SHUFFLE = True # used to shuffle the trainset before each epoc
# DATA = 30000 # TODO: this datasize is hardcoded

EMBEDDING_DIM = 100
HIDDEN_DIM = 50
BATCH_SIZE = 50
EPOCH_SIZE = 15
CLIP = 5 # normalizing lstm vector values when backpropagating to avoid exploding gradients
LEARNING_RATE = 1
DROPOUT=0.5
NUM_LAYERS=2

use_cuda = True # use GPU


#### file paths
The following lines hardcode path to your specific google drive.

In [7]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
dataset_path= "/content/drive/My Drive/eci_19_nlp/dataset/"

In [0]:
DEV_SENTENCES = dataset_path + "dev_sentences.txt"
TRAIN_SENTENCES = dataset_path + "/train_sentences.txt"
TEST_SENTENCES = dataset_path + "test_sentences.txt"

DEV_DATA = dataset_path + "dev_data.txt"
TRAIN_DATA = dataset_path + "train_data.txt"

#### model

In [0]:
''' Creación del modelo '''
### Defino la cantidad de palabras y la cantidad de labels
label_to_ix = { "neutral": 0, "contradiction": 1, "entailment": 2 }
word_to_ix = create_map([DEV_SENTENCES, TRAIN_SENTENCES, TEST_SENTENCES])
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)

# Creo mi modelo, defino la loss function, y la función de optimización
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_LABELS, BATCH_SIZE, DROPOUT, NUM_LAYERS)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [11]:
if use_cuda and torch.cuda.is_available():
  model = model.cuda()
  print("Model loaded to GPU")

Model loaded to GPU


In [12]:
model

LSTMClassifier(
  (word_embeddings): Embedding(53345, 100)
  (lstm): LSTM(100, 50, num_layers=2, dropout=0.5, bidirectional=True)
  (hidden2label): Linear(in_features=100, out_features=3, bias=True)
)

#### training

In [0]:
'''Entrenamiento'''
# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable (NOTA DE MARISCO: tarda algunos minutos cada vuelta).
instances, labels = get_data_splitted(TRAIN_DATA)
instances, labels = get_tensor_data(instances, labels, word_to_ix, label_to_ix)

# instances = instances[0:DATA]
# labels = labels[0:DATA]

In [0]:
tensor_data = CustomDataset(instances, labels)
train_loader = DataLoader(dataset=tensor_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=custom_collate, drop_last=True)

sentences, labels = next(iter(train_loader))

In [16]:
# send model to tensorboard
'''with SummaryWriter("./hello_tf_board/") as writer:
    writer.add_graph(model, sentences.transpose(0,1), True) # the transpose makes dims compatible (catofthecannals)
'''
error_rates_per_epoch = []
for epoch in range(EPOCH_SIZE):
    t = time.time()
    print("EPOCH {} STARTED".format(epoch))

    error_rates_per_batch = []
    i = 0
    running_loss = 0.0
    
    for instance_batch, label_batch in train_loader:
      
        # load data to GPU
        if use_cuda and torch.cuda.is_available():
            instance_batch, label_batch = instance_batch.cuda(), label_batch.cuda()
      
        # Step 1. Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
        
        if use_cuda and torch.cuda.is_available():
            # model.hidden = tuple((elem.cuda() for elem in model.hidden))
            model.hidden = (model.hidden[0].cuda(), model.hidden[1].cuda())

        instance_batch = instance_batch.transpose(0,1)

        # Step 2. Run our forward pass
        log_probs = model(instance_batch)

        # Step 3. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(log_probs, label_batch) # gets the a scalar value held in the loss
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(model.parameters(), CLIP) # Gradient clip to avoid exploding gradients
        optimizer.step()
        
    # at the end of each epoc, log the loss and the acc of the last batch.
    running_loss = loss.item()
    _, predicted = torch.max(log_probs, 1)
    error_rate = calculate_error_rate(predicted, label_batch)

    with SummaryWriter("./hello_tf_board/") as writer:
        writer.add_scalar('accuracy', error_rate, epoch)
        writer.add_scalar('loss function', running_loss, epoch)
        
    elapsed_time = time.time() - t
    
    print("EPOCH {} ENDED:  accuracy: {}, took {} s".format(epoch, error_rate, elapsed_time))



EPOCH 0 STARTED


RuntimeError: ignored