In [1]:
from DataLoader import ReviewDataset

from BaseEncoder import BaseEncoder
from SpanGenerator import SpanGenerator

from SpanMltriLite import SpanMltriLite
from SpanMltri import SpanMltri

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch import nn
from transformers import AutoModel

from seqeval.metrics import classification_report

In [2]:
# Pick the accelerator when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Seed PyTorch's global RNG so that runs start from the same random state.
torch.manual_seed(42)

Using cuda device


<torch._C.Generator at 0x26ab426f110>

In [3]:
# Relative paths of the train/dev splits; both are handed to ReviewDataset further down.
TRAIN_FILE_PATH = "dataset/train.tsv"
DEV_FILE_PATH = "dataset/dev.tsv"

# Load Data

In [4]:
def te_prediction_to_iob(prediction, tokens, span_ranges=None, label_map=None):
    """Convert per-span class predictions into a token-level IOB tag sequence.

    Args:
        prediction: Sequence of predicted class ids, one per candidate span,
            in the same order as ``span_ranges``. Class id 0 means "no term".
        tokens: The sentence; only ``len(tokens)`` is used, to size the output
            and to clip spans that run past the end of the sentence.
        span_ranges: Sequence of ``"start-end"`` strings (inclusive token
            offsets), one per candidate span. When omitted, the module-level
            ``span_ranges`` is used, which is how the evaluation cells call
            this function.
        label_map: Mapping from class id to label name. When omitted, the
            module-level ``label_map`` is used.

    Returns:
        A list with one tag per token: ``"B-<label>"`` on the first token of a
        predicted span, ``"I-<label>"`` on the rest, ``"O"`` elsewhere. When
        predicted spans overlap, the span that comes later in ``prediction``
        overwrites the tags written by earlier ones.

    Raises:
        KeyError: If a fallback global is not defined, or a predicted class id
            is missing from ``label_map``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if span_ranges is None:
        span_ranges = globals()["span_ranges"]
    if label_map is None:
        label_map = globals()["label_map"]

    # Collect every span predicted as something other than O, keyed by its range.
    predicted_spans = {
        span_ranges[idx]: label_map[pred_class_id]
        for idx, pred_class_id in enumerate(prediction)
        if pred_class_id != 0
    }

    n_tokens = len(tokens)
    pred_iob = ['O'] * n_tokens
    for key, label in predicted_spans.items():
        start_offset, end_offset = (int(offset) for offset in key.split('-'))
        for i in range(start_offset, end_offset + 1):
            if i >= n_tokens:
                # Candidate spans may extend beyond the real sentence; clip them.
                break
            prefix = "B" if i == start_offset else "I"
            pred_iob[i] = f"{prefix}-{label}"
    return pred_iob

In [5]:
def te_label_to_iob(current_te_label_dict, tokens):
    """Turn a gold span->label mapping into a token-level IOB tag list.

    Keys of ``current_te_label_dict`` are ``"start-end"`` strings with
    inclusive token offsets; values are the label names. Only ``len(tokens)``
    is used. Positions at or beyond the sentence length are ignored, and
    spans processed later overwrite tags from earlier ones.
    """
    n_tokens = len(tokens)
    iob_tags = ['O'] * n_tokens

    for span_key, span_label in current_te_label_dict.items():
        first, last = [int(part) for part in span_key.split('-')]
        for pos in range(first, last + 1):
            if pos >= n_tokens:
                # Offsets only grow from here on, so nothing else can fit.
                break
            prefix = "B" if pos == first else "I"
            iob_tags[pos] = f"{prefix}-{span_label}"

    return iob_tags

In [6]:
# Class id -> term label; the list position is the class id.
label_map = dict(enumerate(['O', 'ASPECT', 'SENTIMENT']))

def print_prediction(sentence, class_ids, te_label_dict, span_ranges):
    """Print the predicted and the gold term spans of one sentence.

    ``class_ids[i]`` is the predicted class of the span described by
    ``span_ranges[i]`` (a ``"start-end"`` string, inclusive offsets); spans
    with class 0 are not printed. ``te_label_dict`` maps gold span keys of the
    same format to their labels. Class ids are rendered through the
    module-level ``label_map``.
    """
    def _span_slice(span_key):
        # "start-end" (inclusive) -> slice usable on the sentence
        first, last = span_key.split('-')
        return slice(int(first), int(last) + 1)

    print(sentence)
    print("Predicted")
    for idx, class_id in enumerate(class_ids):
        if class_id == 0:
            continue
        print(sentence[_span_slice(span_ranges[idx])], label_map[class_id])
    print()
    print("True")
    for key, gold_label in te_label_dict.items():
        print(sentence[_span_slice(key)], gold_label)

In [7]:
# Passed as max_sentence_length to ReviewDataset and to the span models below.
MAX_SENTENCE_LENGTH = 40

# Size of the last logits dimension of each head; used to flatten logits before the loss.
NUM_OF_TE_LABELS = 3  # term classes in label_map: O, ASPECT, SENTIMENT
NUM_OF_PAOTE_LABELS = 4  # relation ids as encoded in train()/test(): 0 = none, PO = 1, NG = 2, NT = 3

In [8]:
# Build both splits once; every DataLoader created later wraps these same objects.
# The cells below also read .te_label_dict, .relation_dict and .texts from them,
# sliced by batch position.
train_data = ReviewDataset(TRAIN_FILE_PATH, max_sentence_length=MAX_SENTENCE_LENGTH)
dev_data = ReviewDataset(DEV_FILE_PATH, max_sentence_length=MAX_SENTENCE_LENGTH)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transfor

# Full Training

## Hyperparameter Setting

In [None]:
# Encoder weights handed to BaseEncoder. The commented line is the alternative
# setting; 'checkpoint-31641' is presumably a local checkpoint directory — TODO confirm.
# MODEL_NAME_OR_PATH = "indolem/indobert-base-uncased"
MODEL_NAME_OR_PATH = 'checkpoint-31641'

# BATCH_SIZE is used both by the DataLoaders and to slice the gold labels per batch.
BATCH_SIZE = 8
# Passed to SpanGenerator and SpanMltri as the maximum span length.
MAX_SPAN_LENGTH = 4
# Passed to SpanMltri as k.
K_CANDIDATE = 0.4

In [None]:
# shuffle must stay False: train()/test() and the evaluation cells fetch gold labels
# by slicing the dataset with the batch index, so batches must arrive in dataset order.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)
dev_dataloader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

base_encoder = BaseEncoder(MODEL_NAME_OR_PATH).to(device)
span_generator = SpanGenerator(MAX_SPAN_LENGTH).to(device)
# num_of_te_class uses the same constant the loss uses to flatten the term logits,
# so the two cannot drift apart.
model = SpanMltri(
    d_hidden=768,
    max_span_length=MAX_SPAN_LENGTH,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    num_of_te_class=NUM_OF_TE_LABELS,
    k=K_CANDIDATE,
).to(device)

loss_fn = nn.CrossEntropyLoss()
# Only the SpanMltri parameters are optimised; base_encoder runs under no_grad in train().
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Weights of the term loss and the relation loss in the joint objective.
lambda_t = 0.5
lambda_r = 0.5

In [None]:
def train(train_dataloader, train_data, model, loss_fn, optimizer):
    """Run one training epoch of the joint term/relation model.

    The encoder and span generator (module-level ``base_encoder`` and
    ``span_generator``) are run without gradients; only ``model`` is updated.
    Gold labels are fetched by slicing ``train_data`` with the batch index, so
    ``train_dataloader`` must iterate ``train_data`` in order (no shuffling).

    Args:
        train_dataloader: DataLoader yielding tokenised batches.
        train_data: Dataset exposing ``te_label_dict`` and ``relation_dict``,
            one entry per sentence, keyed by span / span-pair range.
        model: Module returning ``(term_logits, relation_logits,
            span_pair_ranges)`` for ``(X_spans, span_ranges)``.
        loss_fn: Loss applied to both heads on flattened logits.
        optimizer: Optimizer stepping ``model``'s parameters.

    Raises:
        KeyError: If a gold label is not one of the known term labels
            (ASPECT, SENTIMENT) or relation labels (PO, NG, NT). Previously
            such a label was skipped, leaving a target row shorter than the
            logits.
    """
    # test() switches the model to eval mode and nothing switched it back, so
    # every epoch after the first ran with train-only behaviour disabled.
    model.train()

    te_label_ids = {'ASPECT': 1, 'SENTIMENT': 2}
    relation_label_ids = {'PO': 1, 'NG': 2, 'NT': 3}

    size = len(train_dataloader.dataset)
    for batch, X in enumerate(train_dataloader):
        X_tokenized = X.to(device)

        current_te_label_dict = train_data.te_label_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
        current_relation_dict = train_data.relation_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]

        # The last batch may be smaller than BATCH_SIZE.
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        with torch.no_grad():
            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

        # Term Scorer targets: 0 (O) for every candidate span without a gold label.
        y_te_true = torch.tensor(
            [
                [
                    te_label_ids[current_te_label_dict[i][span_range]]
                    if span_range in current_te_label_dict[i] else 0
                    for span_range in span_ranges
                ]
                for i in range(CURRENT_BATCH_SIZE)
            ],
            dtype=torch.long,
            device=device,
        )

        # Relation Scorer targets: 0 for every candidate pair without a gold relation.
        y_paote_true = torch.tensor(
            [
                [
                    relation_label_ids[current_relation_dict[i][span_pair_range]]
                    if span_pair_range in current_relation_dict[i] else 0
                    for span_pair_range in span_pair_ranges[i]
                ]
                for i in range(CURRENT_BATCH_SIZE)
            ],
            dtype=torch.long,
            device=device,
        )

        te_loss = loss_fn(logits_term_scorer.view(-1, NUM_OF_TE_LABELS), y_te_true.view(-1))
        paote_loss = loss_fn(logits_relation_scorer.view(-1, NUM_OF_PAOTE_LABELS), y_paote_true.view(-1))
        total_loss = lambda_t * te_loss + lambda_r * paote_loss

        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if batch % 2 == 0:
            # Report the objective that is actually optimised (previously the
            # term loss alone was printed while total_loss.item() went unused).
            loss_value, current = total_loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test(dev_dataloader, model):
    """Compute the average joint loss of ``model`` over ``dev_dataloader``.

    Gold labels are read from ``dev_dataloader.dataset`` (sliced by batch
    index, so the loader must not shuffle) rather than from the global
    ``dev_data``, so the function evaluates whichever loader it is given.
    The model is left in eval mode.

    Args:
        dev_dataloader: DataLoader over a dataset exposing ``te_label_dict``
            and ``relation_dict``.
        model: Module returning ``(term_logits, relation_logits,
            span_pair_ranges)`` for ``(X_spans, span_ranges)``.

    Returns:
        float: ``lambda_t * te_loss + lambda_r * paote_loss`` averaged over
        the number of batches.

    Raises:
        ValueError: If the loader yields no batches.
        KeyError: If a gold label is not a known term or relation label.
    """
    dataset = dev_dataloader.dataset
    te_label_ids = {'ASPECT': 1, 'SENTIMENT': 2}
    relation_label_ids = {'PO': 1, 'NG': 2, 'NT': 3}

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch, X in enumerate(dev_dataloader):
            X_tokenized = X.to(device)

            current_te_label_dict = dataset.te_label_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
            current_relation_dict = dataset.relation_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
            CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

            X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)

            logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

            # Term targets: 0 (O) for spans without a gold label.
            y_te_true = torch.tensor(
                [
                    [
                        te_label_ids[current_te_label_dict[i][span_range]]
                        if span_range in current_te_label_dict[i] else 0
                        for span_range in span_ranges
                    ]
                    for i in range(CURRENT_BATCH_SIZE)
                ],
                dtype=torch.long,
                device=device,
            )
            te_loss = loss_fn(logits_term_scorer.view(-1, NUM_OF_TE_LABELS), y_te_true.view(-1))

            # Relation targets: 0 for span pairs without a gold relation.
            y_paote_true = torch.tensor(
                [
                    [
                        relation_label_ids[current_relation_dict[i][span_pair_range]]
                        if span_pair_range in current_relation_dict[i] else 0
                        for span_pair_range in span_pair_ranges[i]
                    ]
                    for i in range(CURRENT_BATCH_SIZE)
                ],
                dtype=torch.long,
                device=device,
            )
            paote_loss = loss_fn(logits_relation_scorer.view(-1, NUM_OF_PAOTE_LABELS), y_paote_true.view(-1))

            total_loss += lambda_t * te_loss.item() + lambda_r * paote_loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("dev_dataloader yielded no batches; cannot compute an average loss")

    # Divide by the batch count; the last enumerate index is one too small
    # (and is zero when there is a single batch).
    total_loss /= num_batches
    print(f"Test Error: \n Avg loss: {total_loss:>8f} \n")
    return total_loss

In [None]:
# Train for up to `epochs` epochs, stopping early once the validation loss has
# failed to improve for `patience` consecutive epochs.
epochs = 200
patience = 5
# Initialised under the same name the loop updates; previously a differently
# spelled counter was initialised, so a first-epoch non-improvement (e.g. a NaN
# loss) raised NameError.
epochs_no_improve = 0
early_stop = False
# np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
min_val_loss = np.inf

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, train_data, model, loss_fn, optimizer)
    val_loss = test(dev_dataloader, model)
    if val_loss < min_val_loss:
        epochs_no_improve = 0
        min_val_loss = val_loss
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        early_stop = True
        print('Early stopping')
        break
print("Done!")

## Evaluation

In [None]:
# Term-extraction report (seqeval) of the full model on the dev split.
# Put the model in eval mode explicitly so the report does not depend on
# which cell happened to run last.
model.eval()

y_true = []
y_pred = []
with torch.no_grad():
    for batch, X in enumerate(dev_dataloader):
        # Gold labels and sentences are aligned to batches by position; this
        # relies on dev_dataloader being created with shuffle=False.
        current_te_label_dict = dev_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_sentences = dev_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

        X_tokenized = X.to(device)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        X_encoded = base_encoder(X_tokenized)
        # span_ranges has to stay a module-level name: te_prediction_to_iob
        # looks it up as a global when called with two arguments.
        X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

        for idx in range(CURRENT_BATCH_SIZE):
            prediction = logits_term_scorer[idx].argmax(-1).tolist()
            y_pred.append(te_prediction_to_iob(prediction, current_sentences[idx]))
            y_true.append(te_label_to_iob(current_te_label_dict[idx], current_sentences[idx]))

print(classification_report(y_true, y_pred))

In [None]:
# Term-extraction report (seqeval) of the full model on the training split.
# Put the model in eval mode explicitly so the report does not depend on
# which cell happened to run last.
model.eval()

y_true = []
y_pred = []
with torch.no_grad():
    for batch, X in enumerate(train_dataloader):
        # Gold labels and sentences are aligned to batches by position; this
        # relies on train_dataloader being created with shuffle=False.
        current_te_label_dict = train_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_sentences = train_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

        X_tokenized = X.to(device)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        X_encoded = base_encoder(X_tokenized)
        # span_ranges has to stay a module-level name: te_prediction_to_iob
        # looks it up as a global when called with two arguments.
        X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer, logits_relation_scorer, span_pair_ranges = model(X_spans, span_ranges)

        for idx in range(CURRENT_BATCH_SIZE):
            prediction = logits_term_scorer[idx].argmax(-1).tolist()
            y_pred.append(te_prediction_to_iob(prediction, current_sentences[idx]))
            y_true.append(te_label_to_iob(current_te_label_dict[idx], current_sentences[idx]))

print(classification_report(y_true, y_pred))

# Lite Training

## Hyperparameter Setting

In [25]:
# Lite configuration. These rebind the names set in the Full Training section,
# and the functions below read them as globals at call time.
# The commented line is the alternative encoder setting.
MODEL_NAME_OR_PATH = "indolem/indobert-base-uncased"
# MODEL_NAME_OR_PATH = 'checkpoint-31641'

# Used by the DataLoaders and to slice gold labels per batch.
BATCH_SIZE = 4
# Passed to SpanGenerator and SpanMltriLite as the maximum span length.
MAX_SPAN_LENGTH = 2

In [26]:
# shuffle must stay False: train()/test() and the evaluation cells fetch gold labels
# by slicing the dataset with the batch index, so batches must arrive in dataset order.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)
dev_dataloader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

base_encoder = BaseEncoder(MODEL_NAME_OR_PATH).to(device)
span_generator = SpanGenerator(MAX_SPAN_LENGTH).to(device)
# num_of_te_class uses the same constant the loss uses to flatten the term logits,
# so the two cannot drift apart.
model = SpanMltriLite(
    d_hidden=768,
    max_span_length=MAX_SPAN_LENGTH,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    num_of_te_class=NUM_OF_TE_LABELS,
).to(device)

loss_fn = nn.CrossEntropyLoss()
# Only the SpanMltriLite parameters are optimised; base_encoder runs under no_grad in train().
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Kept for parity with the full setup; the Lite train()/test() below only use the term loss.
lambda_t = 1
lambda_r = 1

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
def train(train_dataloader, train_data, model, loss_fn, optimizer):
    """Run one training epoch of the term-extraction-only (Lite) model.

    The encoder and span generator (module-level ``base_encoder`` and
    ``span_generator``) are run without gradients; only ``model`` is updated.
    Gold labels are fetched by slicing ``train_data`` with the batch index, so
    ``train_dataloader`` must iterate ``train_data`` in order (no shuffling).

    Args:
        train_dataloader: DataLoader yielding tokenised batches.
        train_data: Dataset exposing ``te_label_dict``, one mapping of span
            range to label per sentence.
        model: Module returning term logits for ``(X_spans, span_ranges)``.
        loss_fn: Loss applied to the flattened term logits.
        optimizer: Optimizer stepping ``model``'s parameters.

    Raises:
        KeyError: If a gold label is neither ASPECT nor SENTIMENT. Previously
            such a label was skipped, leaving a target row shorter than the
            logits.
    """
    # test() switches the model to eval mode and nothing switched it back, so
    # every epoch after the first ran with train-only behaviour disabled.
    model.train()

    te_label_ids = {'ASPECT': 1, 'SENTIMENT': 2}

    size = len(train_dataloader.dataset)
    for batch, X in enumerate(train_dataloader):
        X_tokenized = X.to(device)

        current_te_label_dict = train_data.te_label_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]

        # The last batch may be smaller than BATCH_SIZE.
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        with torch.no_grad():
            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer = model(X_spans, span_ranges)

        # Term Scorer targets: 0 (O) for every candidate span without a gold label.
        y_te_true = torch.tensor(
            [
                [
                    te_label_ids[current_te_label_dict[i][span_range]]
                    if span_range in current_te_label_dict[i] else 0
                    for span_range in span_ranges
                ]
                for i in range(CURRENT_BATCH_SIZE)
            ],
            dtype=torch.long,
            device=device,
        )

        te_loss = loss_fn(logits_term_scorer.view(-1, NUM_OF_TE_LABELS), y_te_true.view(-1))

        # Backpropagation
        optimizer.zero_grad()
        te_loss.backward()
        optimizer.step()

        if batch % 2 == 0:
            loss_value, current = te_loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [28]:
def test(dev_dataloader, model):
    """Compute the average term-extraction loss of ``model`` over ``dev_dataloader``.

    Gold labels are read from ``dev_dataloader.dataset`` (sliced by batch
    index, so the loader must not shuffle) rather than from the global
    ``dev_data``, so the function evaluates whichever loader it is given.
    The model is left in eval mode.

    Args:
        dev_dataloader: DataLoader over a dataset exposing ``te_label_dict``.
        model: Module returning term logits for ``(X_spans, span_ranges)``.

    Returns:
        float: The term loss averaged over the number of batches. This is a
        plain Python float (as in the full-model ``test``), not a tensor.

    Raises:
        ValueError: If the loader yields no batches.
        KeyError: If a gold label is neither ASPECT nor SENTIMENT.
    """
    dataset = dev_dataloader.dataset
    te_label_ids = {'ASPECT': 1, 'SENTIMENT': 2}

    model.eval()
    te_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch, X in enumerate(dev_dataloader):
            X_tokenized = X.to(device)
            current_te_label_dict = dataset.te_label_dict[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
            CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

            X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
            X_encoded = base_encoder(X_tokenized)
            X_spans, span_ranges = span_generator(X_encoded)

            logits_term_scorer = model(X_spans, span_ranges)

            # Term targets: 0 (O) for spans without a gold label.
            y_te_true = torch.tensor(
                [
                    [
                        te_label_ids[current_te_label_dict[i][span_range]]
                        if span_range in current_te_label_dict[i] else 0
                        for span_range in span_ranges
                    ]
                    for i in range(CURRENT_BATCH_SIZE)
                ],
                dtype=torch.long,
                device=device,
            )

            # .item() keeps the running sum a host-side float instead of a device tensor.
            te_loss += loss_fn(logits_term_scorer.view(-1, NUM_OF_TE_LABELS), y_te_true.view(-1)).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("dev_dataloader yielded no batches; cannot compute an average loss")

    # Divide by the batch count; the last enumerate index is one too small
    # (and is zero when there is a single batch).
    te_loss /= num_batches
    print(f"Test Error: \n Avg loss: {te_loss:>8f} \n")
    return te_loss

In [None]:
# Train for up to `epochs` epochs, stopping early once the validation loss has
# failed to improve for `patience` consecutive epochs.
epochs = 200

patience = 5
# Initialised under the same name the loop updates; previously a differently
# spelled counter was initialised, so the loop relied on the first epoch
# improving (or on a value left over from an earlier cell).
epochs_no_improve = 0
early_stop = False
# np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
min_val_loss = np.inf

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, train_data, model, loss_fn, optimizer)
    val_loss = test(dev_dataloader, model)
    if val_loss < min_val_loss:
        epochs_no_improve = 0
        min_val_loss = val_loss
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        early_stop = True
        print('Early stopping')
        break
print("Done!")

Epoch 1
-------------------------------
loss: 1.194126  [    0/ 3000]
loss: 1.096619  [    8/ 3000]
loss: 1.012350  [   16/ 3000]
loss: 0.914243  [   24/ 3000]
loss: 1.021622  [   32/ 3000]
loss: 0.856235  [   40/ 3000]
loss: 0.728925  [   48/ 3000]
loss: 0.751521  [   56/ 3000]
loss: 0.708204  [   64/ 3000]
loss: 0.649737  [   72/ 3000]
loss: 0.672062  [   80/ 3000]
loss: 0.561165  [   88/ 3000]
loss: 0.638634  [   96/ 3000]
loss: 0.499108  [  104/ 3000]
loss: 0.517019  [  112/ 3000]
loss: 0.419730  [  120/ 3000]
loss: 0.353486  [  128/ 3000]
loss: 0.475609  [  136/ 3000]
loss: 0.469111  [  144/ 3000]
loss: 0.365101  [  152/ 3000]
loss: 0.358242  [  160/ 3000]
loss: 0.295200  [  168/ 3000]
loss: 0.301968  [  176/ 3000]
loss: 0.359490  [  184/ 3000]
loss: 0.311110  [  192/ 3000]
loss: 0.250811  [  200/ 3000]
loss: 0.294973  [  208/ 3000]
loss: 0.293733  [  216/ 3000]
loss: 0.263731  [  224/ 3000]
loss: 0.343708  [  232/ 3000]
loss: 0.279297  [  240/ 3000]
loss: 0.258930  [  248/ 3000]


## Evaluation

In [None]:
# Term-extraction report (seqeval) of the Lite model on the dev split.
# Put the model in eval mode explicitly so the report does not depend on
# which cell happened to run last.
model.eval()

y_true = []
y_pred = []
with torch.no_grad():
    for batch, X in enumerate(dev_dataloader):
        # Gold labels and sentences are aligned to batches by position; this
        # relies on dev_dataloader being created with shuffle=False.
        current_te_label_dict = dev_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_sentences = dev_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

        X_tokenized = X.to(device)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        X_encoded = base_encoder(X_tokenized)
        # span_ranges has to stay a module-level name: te_prediction_to_iob
        # looks it up as a global when called with two arguments.
        X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer = model(X_spans, span_ranges)

        for idx in range(CURRENT_BATCH_SIZE):
            prediction = logits_term_scorer[idx].argmax(-1).tolist()
            y_pred.append(te_prediction_to_iob(prediction, current_sentences[idx]))
            y_true.append(te_label_to_iob(current_te_label_dict[idx], current_sentences[idx]))
print(classification_report(y_true, y_pred))

In [None]:
# Term-extraction report (seqeval) of the Lite model on the training split.
# Put the model in eval mode explicitly so the report does not depend on
# which cell happened to run last.
model.eval()

y_true = []
y_pred = []
with torch.no_grad():
    for batch, X in enumerate(train_dataloader):
        # Gold labels and sentences are aligned to batches by position; this
        # relies on train_dataloader being created with shuffle=False.
        current_te_label_dict = train_data.te_label_dict[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        current_sentences = train_data.texts[(batch)*BATCH_SIZE:(batch+1)*BATCH_SIZE]
        CURRENT_BATCH_SIZE = min(len(current_te_label_dict), BATCH_SIZE)

        X_tokenized = X.to(device)
        X_tokenized = X_tokenized.reshape(CURRENT_BATCH_SIZE, X_tokenized.shape[-1])
        X_encoded = base_encoder(X_tokenized)
        # span_ranges has to stay a module-level name: te_prediction_to_iob
        # looks it up as a global when called with two arguments.
        X_spans, span_ranges = span_generator(X_encoded)

        logits_term_scorer = model(X_spans, span_ranges)

        for idx in range(CURRENT_BATCH_SIZE):
            prediction = logits_term_scorer[idx].argmax(-1).tolist()
            y_pred.append(te_prediction_to_iob(prediction, current_sentences[idx]))
            y_true.append(te_label_to_iob(current_te_label_dict[idx], current_sentences[idx]))
print(classification_report(y_true, y_pred))