In [1]:
from DataLoader import read_examples_from_file, ReviewDataset

from BaseEncoder import BaseEncoder
from SpanGenerator import SpanGenerator

from SpanMltriLite import SpanMltriLite


import torch
from torch.utils.data import DataLoader
from torch import nn
from transformers import AutoModel

In [2]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


In [3]:
TRAIN_FILE_PATH = "dataset/train.tsv"
DEV_FILE_PATH = "dataset/dev.tsv"

# Load Data

In [4]:
MODEL_NAME_OR_PATH = "indolem/indobert-base-uncased"
BATCH_SIZE = 32
MAX_SPAN_LENGTH = 8
MAX_SENTENCE_LENGTH = 40

NUM_OF_TE_LABELS = 3
NUM_OF_PAOTE_LABELS = 4

In [5]:
train_data = ReviewDataset(TRAIN_FILE_PATH, max_sentence_length=MAX_SENTENCE_LENGTH)
dev_data = ReviewDataset(DEV_FILE_PATH, max_sentence_length=MAX_SENTENCE_LENGTH)

In [6]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)
dev_dataloader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

base_encoder = BaseEncoder(MODEL_NAME_OR_PATH).to(device)
span_generator = SpanGenerator(MAX_SPAN_LENGTH).to(device)
model = SpanMltriLite(d_hidden=768, max_sentence_length=MAX_SENTENCE_LENGTH, num_of_te_class=3).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

lambda_t = 0.5
lambda_r = 0.5

In [7]:
def train(train_dataloader, train_data, model, loss_fn, optimizer):
    size = len(train_dataloader.dataset)
    for batch, X in enumerate(train_dataloader):
        X_tokenized = X.to(device)
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        current_te_label_dict = train_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_relation_dict = train_data.relation_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        sentences = train_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]

        with torch.no_grad():
            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)
        
        logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

        # Term Scorer
        y_te_true = []
        for i in range(CURRENT_BATCH_SIZE):
            y_ = []
            for span_range in span_ranges:
                if span_range in current_te_label_dict[i]:
                    label = current_te_label_dict[i][span_range]
                    if label == 'ASPECT':
                        y_.append(1)
                    elif label == 'SENTIMENT':
                        y_.append(2)
                else: # label is O
                    y_.append(0)        
            y_te_true.append(torch.Tensor(y_))
        y_te_true = torch.stack(y_te_true)
        y_te_true = y_te_true.to(torch.long).to(device)
        
        # Relation Scorer
        y_paote_true = []
        CURRENT_BATCH_SIZE = min(len(current_relation_dict), BATCH_SIZE)
        for i in range(CURRENT_BATCH_SIZE):
            y_ = []
            for span_pair_range in span_pair_ranges[i]:
                if span_pair_range not in current_relation_dict[i]:
                    y_.append(0)
                else:
                    label = current_relation_dict[i][span_pair_range]
                    if label == 'PO':
                        y_.append(1)
                    elif label == 'NG':
                        y_.append(2)
                    elif label == 'NT':
                        y_.append(3)
            if len(y_) == 12995:
                print(sentences[i])
                print(i)
            y_paote_true.append(torch.Tensor(y_))
        y_paote_true = torch.stack(y_paote_true)
        y_paote_true = y_paote_true.to(torch.long).to(device)
                
        te_loss = loss_fn(logits_term_scorer.view(-1, NUM_OF_TE_LABELS), y_te_true.view(-1))  
        paote_loss = loss_fn(logits_relation_scorer.view(-1, NUM_OF_PAOTE_LABELS), y_paote_true.view(-1))
        total_loss = lambda_t*te_loss + lambda_r*paote_loss
        
        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if batch % 2 == 0:
            total_loss, current = total_loss.item(), batch * len(X)
            print(f"loss: {te_loss:>7f}  [{current:>5d}/{size:>5d}]")

In [8]:
def test(dev_dataloader, model):
    size = len(dev_dataloader.dataset)
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch, X in enumerate(dev_dataloader):
            X_tokenized = X.to(device)
            CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)
            X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])

            current_te_label_dict = dev_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
            current_relation_dict = dev_data.relation_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]

            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)

            logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

            y_te_true = []
            for i in range(CURRENT_BATCH_SIZE):
                y_ = []
                for span_range in span_ranges:
                    if span_range in current_te_label_dict[i]:
                        label = current_te_label_dict[i][span_range]
                        if label == 'ASPECT':
                            y_.append(1)
                        elif label == 'SENTIMENT':
                            y_.append(2)
                    else: # label is O
                        y_.append(0)        
                y_te_true.append(torch.Tensor(y_))
            y_te_true = torch.stack(y_te_true)
            y_te_true = y_te_true.to(torch.long)

            te_loss = loss_fn(logits_term_scorer.view(-1), y_te_true.view(-1))

            y_paote_true = []
            CURRENT_BATCH_SIZE = min(len(current_relation_dict), BATCH_SIZE)
            for i in range(CURRENT_BATCH_SIZE):
                y_ = []
                for span_pair_range in span_pair_ranges[i]:
                    if span_pair_range not in current_relation_dict[i]:
                        y_.append(0)
                    else:
                        label = current_relation_dict[i][span_pair_range]
                        if label == 'PO':
                            y_.append(1)
                        elif label == 'NG':
                            y_.append(2)
                        elif label == 'NT':
                            y_.append(3)
                y_paote_true.append(torch.Tensor(y_))
            y_paote_true = torch.stack(y_paote_true)
            y_paote_true = y_paote_true.to(torch.long)

            paote_loss = loss_fn(logits_relation_scorer.view(-1, 4), y_paote_true.view(-1))
            total_loss += lambda_t*te_loss.item() + lambda_r*paote_loss.item()

    total_loss /= batch
    print(f"Test Error: \n Avg loss: {total_loss:>8f} \n")

In [9]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, train_data, model, loss_fn, optimizer)
    test(dev_dataloader, model)
print("Done!")

Epoch 1
-------------------------------
loss: 0.893322  [    0/ 3000]
loss: 0.955730  [   64/ 3000]
loss: 1.010346  [  128/ 3000]
loss: 0.762366  [  192/ 3000]
loss: 0.866403  [  256/ 3000]
loss: 0.742545  [  320/ 3000]
loss: 0.754140  [  384/ 3000]
loss: 0.772195  [  448/ 3000]
loss: 0.853727  [  512/ 3000]
loss: 0.699944  [  576/ 3000]
loss: 0.834684  [  640/ 3000]
loss: 0.434484  [  704/ 3000]
loss: 0.584678  [  768/ 3000]
loss: 0.627223  [  832/ 3000]
loss: 0.605031  [  896/ 3000]
loss: 0.642448  [  960/ 3000]
loss: 0.528821  [ 1024/ 3000]
loss: 0.428929  [ 1088/ 3000]
loss: 0.471235  [ 1152/ 3000]
loss: 0.348709  [ 1216/ 3000]
loss: 0.411387  [ 1280/ 3000]
loss: 0.460095  [ 1344/ 3000]
loss: 0.445073  [ 1408/ 3000]
loss: 0.371337  [ 1472/ 3000]
loss: 0.603436  [ 1536/ 3000]
loss: 0.342959  [ 1600/ 3000]
loss: 0.318633  [ 1664/ 3000]
loss: 0.287195  [ 1728/ 3000]
loss: 0.393272  [ 1792/ 3000]
loss: 0.487518  [ 1856/ 3000]
loss: 0.454249  [ 1920/ 3000]
loss: 0.338928  [ 1984/ 3000]


RuntimeError: shape '[32, 40]' is invalid for input of size 960

In [None]:
with torch.no_grad():
    for batch, X in enumerate(dev_dataloader):
        X_tokenized = X.to(device)
        X_tokenized = X_tokenized.reshape(BATCH_SIZE, X_tokenized.shape[-1])

        current_te_label_dict = dev_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_relation_dict = dev_data.relation_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        sentences = train_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        
        X_encoded = base_encoder(X_tokenized)
        X_spans, span_ranges = span_generator(X_encoded)
        
        logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)
        
        print(sentences[:5])
        print(logits_term_scorer[:5].argmax(dim=-1))
        print(logits_relation_scorer[:5].argmax(dim=-1))
        print(span_ranges)
        print(span_pair_ranges[0])
        break