In [1]:
from data_utils import set_up_data_loader, MultimodalConfig
# from bert import MAG_BertForSequenceClassification
# from miniLM import MAG_BertForSequenceClassification
# import prompt_based.minilm.minilm
import prompt_based.distilbert
import normal.bert
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import AdamW
from globals import *

# import torch.nn as nn
from torch.optim import Adam
from torch.nn import MSELoss
from tqdm.notebook import tqdm


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score

import numpy as np

In [2]:
def prep_for_training(model, num_train_optimization_steps: int, learning_rate):
    """Move ``model`` to ``DEVICE`` and build its optimizer and LR scheduler.

    Parameters are split into two Adam parameter groups by name: any parameter
    whose name contains one of the ``no_decay`` markers is given
    ``weight_decay=0.0``; every other parameter is given ``weight_decay=0.01``.
    The scheduler comes from ``get_linear_schedule_with_warmup`` with
    ``warmup_proportion * num_train_optimization_steps`` warmup steps.

    Args:
        model: module exposing ``to`` and ``named_parameters``.
        num_train_optimization_steps: total step count handed to the scheduler.
        learning_rate: learning rate handed to ``Adam``.

    Returns:
        The tuple ``(model, optimizer, scheduler)``.
    """
    model.to(DEVICE)

    # Name fragments identifying parameters that must not be weight-decayed.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # Single pass over the named parameters, preserving their original order
    # inside each group.
    with_decay = []
    without_decay = []
    for param_name, param in list(model.named_parameters()):
        is_exempt = any(marker in param_name for marker in no_decay)
        (without_decay if is_exempt else with_decay).append(param)

    optimizer_grouped_parameters = [
        {"params": with_decay, "weight_decay": 0.01},
        {"params": without_decay, "weight_decay": 0.0},
    ]
    optimizer = Adam(optimizer_grouped_parameters, lr=learning_rate)

    warmup_steps = warmup_proportion * num_train_optimization_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )
    return model, optimizer, scheduler

In [3]:
def train_epoch(model, train_dataloader, optimizer, scheduler):
    """Run one training pass over ``train_dataloader``.

    Each batch is unpacked as
    ``(input_ids, visual, acoustic, input_mask, segment_ids, label_ids)``,
    moved to ``DEVICE``, and fed to the model; ``segment_ids`` is unpacked but
    not passed on. The loss is the MSE between the flattened first model
    output and the flattened labels.

    Gradients are accumulated over ``gradient_accumulation_step`` batches
    before ``optimizer.step()`` / ``scheduler.step()`` are called.

    Args:
        model: the network to train; called as
            ``model(input_ids, visual, acoustic, attention_mask=..., labels=None)``.
        train_dataloader: iterable of 6-tuples of tensors.
        optimizer: optimizer stepped every ``gradient_accumulation_step`` batches.
        scheduler: LR scheduler stepped together with the optimizer.

    Returns:
        Mean per-batch MSE loss over the epoch. The reported value is the
        *unscaled* loss, so it does not depend on ``gradient_accumulation_step``.

    Raises:
        ZeroDivisionError: if ``train_dataloader`` yields no batches.
    """
    model.train()
    loss_fct = MSELoss()  # stateless criterion; build once, not per batch
    total_loss = 0.0
    num_batches = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(DEVICE) for t in batch)
        input_ids, visual, acoustic, input_mask, segment_ids, label_ids = batch

        outputs = model(
            input_ids,
            torch.squeeze(visual, 1),
            torch.squeeze(acoustic, 1),
            # token_type_ids=segment_ids,
            attention_mask=input_mask,
            labels=None,
        )
        logits = outputs[0]
        loss = loss_fct(logits.view(-1), label_ids.view(-1))

        # Record the loss before any accumulation scaling, so the reported
        # epoch loss is a true per-batch mean.
        total_loss += loss.item()
        num_batches += 1

        # Scale only the value that is back-propagated so the accumulated
        # gradient averages over the accumulation window.
        if gradient_accumulation_step > 1:
            loss = loss / gradient_accumulation_step
        loss.backward()

        if (step + 1) % gradient_accumulation_step == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # NOTE(review): when the number of batches is not a multiple of
    # gradient_accumulation_step, the gradients of the trailing batches are
    # never applied here and stay in .grad after this function returns.
    return total_loss / num_batches

In [4]:
def eval_epoch(model, dev_dataloader, optimizer):
    """Compute the mean MSE loss of ``model`` over ``dev_dataloader``.

    The model is put in eval mode and run under ``torch.no_grad()``. Each
    batch is unpacked as
    ``(input_ids, visual, acoustic, input_mask, segment_ids, label_ids)`` and
    moved to ``DEVICE``; ``segment_ids`` is unpacked but not passed to the
    model.

    Args:
        model: the network to evaluate.
        dev_dataloader: iterable of 6-tuples of tensors.
        optimizer: unused; kept so existing callers keep working.

    Returns:
        Mean per-batch MSE loss. The value is not divided by
        ``gradient_accumulation_step``: that scaling only makes sense for the
        gradients in training and would distort the validation metric.

    Raises:
        ZeroDivisionError: if ``dev_dataloader`` yields no batches.
    """
    model.eval()
    loss_fct = MSELoss()  # stateless criterion; build once, not per batch
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dev_dataloader:
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, visual, acoustic, input_mask, segment_ids, label_ids = batch

            outputs = model(
                input_ids,
                torch.squeeze(visual, 1),
                torch.squeeze(acoustic, 1),
                # token_type_ids=segment_ids,
                attention_mask=input_mask,
                labels=None,
            )
            logits = outputs[0]

            loss = loss_fct(logits.view(-1), label_ids.view(-1))
            total_loss += loss.item()
            num_batches += 1

    return total_loss / num_batches

In [5]:
def test_score_model(model, test_dataloader, use_zero=False):
    """Score ``model`` on ``test_dataloader`` with regression and binary metrics.

    Predictions and labels are obtained from ``test_epoch``. Unless
    ``use_zero`` is truthy, samples whose label equals 0 are dropped before
    any metric is computed. MAE and the correlation coefficient are computed
    on the raw values; accuracy and weighted F1 are computed after
    thresholding both predictions and labels at ``>= 0``.

    Args:
        model: the network to evaluate.
        test_dataloader: data loader forwarded to ``test_epoch``.
        use_zero: when truthy, keep samples whose label is exactly 0.

    Returns:
        The tuple ``(acc, mae, corr, f_score)``.

    Raises:
        ValueError: if no sample remains after filtering.
    """
    preds, y_test = test_epoch(model, test_dataloader)

    # Boolean mask instead of a Python-built index list: same selection and
    # order, but an empty selection no longer yields a float-typed index
    # array (which numpy rejects with an opaque IndexError).
    keep = (y_test != 0) | bool(use_zero)
    if not np.any(keep):
        raise ValueError("test_score_model: no samples left to score after filtering")

    preds = preds[keep]
    y_test = y_test[keep]

    mae = np.mean(np.absolute(preds - y_test))
    corr = np.corrcoef(preds, y_test)[0][1]

    # Binarise by sign for the classification metrics.
    preds = preds >= 0
    y_test = y_test >= 0

    f_score = f1_score(y_test, preds, average="weighted")
    acc = accuracy_score(y_test, preds)

    return acc, mae, corr, f_score

In [6]:
def test_epoch(model, test_dataloader):
    """Collect the model's predictions and the labels for every test sample.

    The model is put in eval mode and run under ``torch.no_grad()``. Each
    batch is unpacked as
    ``(input_ids, visual, acoustic, input_mask, segment_ids, label_ids)`` and
    moved to ``DEVICE``; ``segment_ids`` is unpacked but not passed to the
    model.

    Args:
        model: the network to evaluate.
        test_dataloader: iterable of 6-tuples of tensors.

    Returns:
        ``(preds, labels)``: two 1-D numpy arrays holding the flattened first
        model output and the flattened labels, in data-loader order.
    """
    model.eval()
    preds = []
    labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, visual, acoustic, input_mask, segment_ids, label_ids = batch

            outputs = model(
                input_ids,
                torch.squeeze(visual, 1),
                torch.squeeze(acoustic, 1),
                # token_type_ids=segment_ids,
                attention_mask=input_mask,
                labels=None,
            )
            logits = outputs[0]

            # Flatten with reshape(-1) rather than np.squeeze: squeezing a
            # batch of size 1 yields a 0-d array whose tolist() is a bare
            # float, and list.extend() then raises TypeError. Flattening
            # matches the view(-1) used for the loss elsewhere in this file.
            batch_preds = logits.detach().cpu().numpy().reshape(-1)
            batch_labels = label_ids.detach().cpu().numpy().reshape(-1)

            preds.extend(batch_preds.tolist())
            labels.extend(batch_labels.tolist())

    return np.array(preds), np.array(labels)

In [7]:
def train(
    model,
    train_dataloader,
    validation_dataloader,
    test_data_loader,
    optimizer,
    scheduler
):
    """Train ``model`` for ``n_epochs`` epochs, reporting progress per epoch.

    Every epoch runs ``train_epoch`` on the training loader, ``eval_epoch`` on
    the validation loader and ``test_score_model`` on the test loader, then
    prints the training loss, validation loss and test MAE. Nothing is
    returned.
    """
    total_epochs = int(n_epochs)
    report_template = "epoch:{}, train_loss:{} valid_loss: {}, test: {}"

    for epoch_i in range(total_epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer, scheduler)
        valid_loss = eval_epoch(model, validation_dataloader, optimizer)

        # Only the MAE is reported; the remaining test metrics are discarded.
        _acc, test_mae, _corr, _f_score = test_score_model(model, test_data_loader)

        print(report_template.format(epoch_i, train_loss, valid_loss, test_mae))

In [8]:
# Build the train / dev / test data loaders once; they are shared by every
# training and evaluation cell below. The fourth value is later handed to
# prep_for_training() as the scheduler's total step count.
(
    train_data_loader,
    dev_data_loader,
    test_data_loader,
    num_train_optimization_steps,
) = set_up_data_loader()

  all_visual = torch.tensor([f.visual for f in features], dtype=torch.float)


In [9]:
# Experiment 1: prompt-based DistilBERT with a single regression output
# (num_labels=1), trained with the shared data loaders.
# The two MultimodalConfig arguments presumably are beta_shift and
# hidden_prob (the load log prints "beta_shift:1.0 hidden_prob:0.5") —
# TODO confirm against data_utils.MultimodalConfig.
multimodal_config = MultimodalConfig(1.0, 0.5)
model = prompt_based.distilbert.AVPrompt_DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", multimodal_config=multimodal_config, num_labels=1,
)

# NOTE(review): the disabled freeze block below names `model.bert`, while the
# load warning for this model lists parameters under `distilbert.` — adjust
# the attribute names before re-enabling it.
# frozen_modules = [model.bert.embeddings, model.bert.encoder]
# for module in frozen_modules:
#     for param in module.parameters():
#         param.requires_grad = False

# NOTE(review): learning_rate=1e-2 is three orders of magnitude above the
# 1e-5 used for the BERT run later in this notebook — confirm it is intended.
model, optimizer, scheduler = prep_for_training(model, num_train_optimization_steps, learning_rate=1e-2)
train(model, train_data_loader, dev_data_loader, test_data_loader, optimizer, scheduler)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing AVPrompt_DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing AVPrompt_DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AVPrompt_DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AVPrompt_DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['distilbert.mag.Lay

Initializing MAG with beta_shift:1.0 hidden_prob:0.5


KeyboardInterrupt: 

In [10]:
# Result tuple is (accuracy, MAE, correlation, weighted F1); with the default
# use_zero=False, samples whose label is exactly 0 are excluded.
# NOTE(review): `model` is whatever the previous training cell left behind,
# and that cell's recorded output ends in KeyboardInterrupt.
test_score_model(model, test_data_loader)

(0.8229007633587786,
 0.8649354306810344,
 0.7418325494026404,
 0.8230645190397269)

In [11]:
# Experiment 2: MAG-BERT baseline with a single regression output
# (num_labels=1). This rebinds `model`, `optimizer` and `scheduler`, replacing
# the objects created by the DistilBERT cell above.
# NOTE(review): the output recorded under this cell mentions
# MAG_MiniLMForSequenceClassification and microsoft/MiniLM-L12-H384-uncased,
# which does not match this code — the output looks stale; re-run to refresh.
multimodal_config = MultimodalConfig(1.0, 0.5)
model = normal.bert.MAG_BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', multimodal_config=multimodal_config, num_labels=1,
)

# NOTE(review): the disabled freeze block below names `model.distilbert`,
# although this cell loads a BERT model — presumably the two freeze snippets
# in this notebook were swapped; verify before re-enabling.
# frozen_modules = [model.distilbert.embeddings, model.distilbert.transformer]
# for module in frozen_modules:
#     for param in module.parameters():
#         param.requires_grad = False

model, optimizer, scheduler = prep_for_training(model, num_train_optimization_steps, learning_rate=1e-5)
train(model, train_data_loader, dev_data_loader, test_data_loader, optimizer, scheduler)

Some weights of MAG_MiniLMForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['mag.W_a.bias', 'mag.W_a.weight', 'mag.LayerNorm.bias', 'mag.beta_shift', 'mag.W_ha.bias', 'classifier.weight', 'mag.W_v.weight', 'mag.W_v.bias', 'mag.LayerNorm.weight', 'classifier.bias', 'mag.W_ha.weight', 'mag.W_hv.bias', 'mag.W_hv.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing MAG with beta_shift:1.0 hidden_prob:0.5
epoch:0, train_loss:2.319864531842674 valid_loss: 2.7820749282836914, test: 1.4851906323165611
epoch:1, train_loss:2.4142271687344805 valid_loss: 2.7926435470581055, test: 1.49044872877252
epoch:2, train_loss:2.2572515912172273 valid_loss: 2.7407796382904053, test: 1.5007158292955114
epoch:3, train_loss:2.4387300479702834 valid_loss: 2.7242801189422607, test: 1.507470249083206
epoch:4, train_loss:2.306977463931572 valid_loss: 2.7018152475357056, test: 1.506412047466249
epoch:5, train_loss:2.275074072000457 valid_loss: 2.705891489982605, test: 1.5132590400126145
epoch:6, train_loss:2.2081837137116165 valid_loss: 2.710377812385559, test: 1.5169823122388535
epoch:7, train_loss:2.3671285873506127 valid_loss: 2.6926037073135376, test: 1.5130032903365507
epoch:8, train_loss:2.2341104774940304 valid_loss: 2.7082098722457886, test: 1.50752694860442
epoch:9, train_loss:2.1957710603388345 valid_loss: 2.627762794494629, test: 1.4893595833052304

In [12]:
# Result tuple is (accuracy, MAE, correlation, weighted F1); with the default
# use_zero=False, samples whose label is exactly 0 are excluded.
test_score_model(model, test_data_loader)

(0.7984732824427481,
 1.0128087346463372,
 0.7300915861120416,
 0.7996784048790345)

In [13]:
# multimodal_config = MultimodalConfig(1.0, 0.5)
# model = distilbert.MAG_DistilBertForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased", multimodal_config=multimodal_config, num_labels=1,
# )

# model, optimizer, scheduler = prep_for_training(model, num_train_optimization_steps, learning_rate=1e-5)
# train(model, train_data_loader, dev_data_loader, test_data_loader, optimizer, scheduler)

In [14]:
# test_score_model(model, test_data_loader)