# **Imports**

In [None]:
!pip install transformers

In [1]:
import json
import torch
import os
import re
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

%load_ext autoreload
%autoreload 2

In [None]:
from google.colab import drive
import os

gdrive_path='/content/gdrive/MyDrive/sup_model'

# This will mount your google drive under 'MyDrive'
drive.mount('/content/gdrive', force_remount=True)
# In order to access the files in this notebook we have to navigate to the correct folder
os.chdir(gdrive_path)
# Check manually if all files are present
print(sorted(os.listdir()))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## **Dataset**

In [None]:
from ArgEvi_dataset import ArgEvi


file_path = 'pos_neg_pairs_train.csv'

# list of [argument, positive evidence, 5 negative evidences]
dataset = ArgEvi(file_path)

# split dataset into training, validation dataset
size = len(dataset)
train_size = int(size * 0.8)
val_size = int(size * 0.2)
#print(size)

train_dataset = dataset[:train_size]
val_dataset = dataset[train_size:(train_size + val_size)]

train_arguments = []
train_pos_evi = []
train_neg_evis = []

for sample in train_dataset:
  train_arguments.append(sample[0])
  train_pos_evi.append(sample[1])
  train_neg_evis.append(sample[2:])     # list of 5 negative evidences

#print((train_arguments[0], train_pos_evi[0], train_neg_evis[0]))

val_arguments = []
val_pos_evi = []
val_neg_evis = []

for sample in val_dataset:
  val_arguments.append(sample[0])
  val_pos_evi.append(sample[1])
  val_neg_evis.append(sample[2:])


# **Tokenization**

In [None]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_arg_tokens = tokenizer(train_arguments, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
train_pos_tokens = tokenizer(train_pos_evi, return_tensors='pt', padding=True, truncation=True, max_length=512)

input_ids = torch.stack([
    torch.stack([tokenizer(neg, return_tensors='pt', padding='max_length', truncation=True, max_length=512)['input_ids'].squeeze(0) for neg in negs])
    for negs in train_neg_evis
])

attention_masks = torch.stack([
    torch.stack([tokenizer(neg, return_tensors='pt', padding='max_length', truncation=True, max_length=512)['attention_mask'].squeeze(0) for neg in negs])
    for negs in train_neg_evis
])

train_neg_tokens = {
    'input_ids': input_ids,
    'attention_mask': attention_masks
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
val_arg_tokens = tokenizer(val_arguments, truncation=True, padding='max_length', max_length=512, return_tensors="pt")

val_pos_tokens = tokenizer(val_pos_evi, return_tensors='pt', padding=True, truncation=True, max_length=512)

val_input_ids = torch.stack([
    torch.stack([tokenizer(neg, return_tensors='pt', padding='max_length', truncation=True, max_length=512)['input_ids'].squeeze(0) for neg in negs])
    for negs in val_neg_evis
])

val_attention_masks = torch.stack([
    torch.stack([tokenizer(neg, return_tensors='pt', padding='max_length', truncation=True, max_length=512)['attention_mask'].squeeze(0) for neg in negs])
    for negs in val_neg_evis
])

val_neg_tokens = {
    'input_ids': val_input_ids,
    'attention_mask': val_attention_masks
}

# **Dataloader**

In [None]:
from tokenized_data import TokenizedDataset

train_tokenized_dataset = TokenizedDataset(train_arg_tokens, train_pos_tokens, train_neg_tokens)
val_tokenized_dataset = TokenizedDataset(val_arg_tokens, val_pos_tokens, val_neg_tokens)

batch_size = 16

train_dataloader = DataLoader(train_tokenized_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_tokenized_dataset, batch_size=batch_size, shuffle=True)



# **Training**

In [None]:
from Roberta import ContrastiveRoberta
from torch import nn
import torch.optim as optim
from contrastive_loss import ContrastiveLoss
import torch.nn.functional as F

# training objective: learn embeddings such that those from supportive argument-evidence pairs are closer together
#.      than those from non-supportive pairs.

def evaluate(arg_emb, pos_emb, neg_embs):
    pos_sim = F.cosine_similarity(arg_emb.unsqueeze(1), pos_emb.unsqueeze(0), dim=-1)
    avg_pos_sim = torch.mean(pos_sim).item()

    batch_size, num_negatives, emb_dim = neg_embs.shape
    neg_embs_flat = neg_embs.view(batch_size * num_negatives, emb_dim)
    neg_sim = F.cosine_similarity(arg_emb.unsqueeze(1), neg_embs_flat.unsqueeze(0), dim=-1)
    avg_neg_sim = torch.mean(neg_sim).item()

    return avg_pos_sim, avg_neg_sim


def train(model: nn.Module,
          train_dataloader: DataLoader,
          val_dataloader: DataLoader,
          optimizer: optim.Optimizer,
          contrastive_loss,
          num_epochs: int,
          device: torch.device
          ):

    train_contrastive_loss = []
    val_contrastive_loss = []

    # early stopping setting
    patience = 2
    best_val_loss = float('inf')
    count = 0   # number of epoches with no reduction of val_loss


    for epoch in range(num_epochs):
        model.to(device)

        model.train()
        running_loss = 0.0
        epoch_loss = 0.0

        for i, data in enumerate(train_dataloader, 0):
            arg_tokens, pos_tokens, neg_tokens = data

            arg_tokens = {key: val.to(device) for key, val in arg_tokens.items()}
            pos_tokens = {key: val.to(device) for key, val in pos_tokens.items()}
            neg_tokens = {key: val.to(device) for key, val in neg_tokens.items()}


            optimizer.zero_grad()
            arg_emb, pos_emb, neg_embs = model.forward(arg_tokens, pos_tokens, neg_tokens)
            loss = contrastive_loss(arg_emb, pos_emb, neg_embs)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_loss += loss.item()

            if i % 10 == 9:  # print every 10 mini-batches
                running_loss /= 10   # average loss of 10 mini-batches
                print(
                    f"[Epoch {epoch+1}, Iteration {i+1}] Training Loss: {running_loss:.3f}"
                )
                running_loss = 0.0

        # training loss for this epoch
        epoch_loss /= len(train_dataloader)
        train_contrastive_loss.append(epoch_loss)

        # validation
        model.eval()
        val_running_loss = 0.0
        avg_pos_sim = 0.0
        avg_neg_sim = 0.0

        with torch.no_grad():
          for val_data in val_dataloader:
              arg_tokens, pos_tokens, neg_tokens = val_data

              arg_tokens = {key: val.to(device) for key, val in arg_tokens.items()}
              pos_tokens = {key: val.to(device) for key, val in pos_tokens.items()}
              neg_tokens = {key: val.to(device) for key, val in neg_tokens.items()}

              arg_emb, pos_emb, neg_embs = model.forward(arg_tokens, pos_tokens, neg_tokens)
              val_loss = contrastive_loss(arg_emb, pos_emb, neg_embs)
              val_running_loss += val_loss.item()

              pos_sim, neg_sim = evaluate(arg_emb, pos_emb, neg_embs)
              avg_pos_sim += pos_sim
              avg_neg_sim += neg_sim

        val_epoch_loss = val_running_loss / len(val_dataloader)
        val_contrastive_loss.append(val_epoch_loss)
        avg_pos_sim /= len(val_dataloader)
        avg_neg_sim /= len(val_dataloader)
        print(f"Epoch {epoch+1}, Training Loss: {epoch_loss:.3f}, Validation Loss: {val_epoch_loss:.3f}, Avg_Positive_Similarity: {avg_pos_sim:.3f}, Avg_Negative_Similarity: {avg_neg_sim:.3f}")

        # early stopping
        if val_epoch_loss < best_val_loss:
            best_val_loss = val_epoch_loss
            count = 0
            torch.save(model.state_dict(), 'best_model.pt')   # save best model
        else:
            count += 1

        if count == patience:
            print('Early stopping !')
            break

    print("FINISH.")

    return train_contrastive_loss, val_contrastive_loss


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ContrastiveRoberta()
# load the saved best model parameters to continue training
model.load_state_dict(torch.load('best_model.pt'))
#model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

loss_func = ContrastiveLoss(model, temperature=0.1)

train_loss, val_loss = train(model=model, train_dataloader=train_dataloader, val_dataloader=val_dataloader, optimizer=optimizer,
                   contrastive_loss=loss_func, num_epochs=5, device=device)






Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1, Iteration 10] Training Loss: 0.011
[Epoch 1, Iteration 20] Training Loss: 0.015
[Epoch 1, Iteration 30] Training Loss: 0.029
Epoch 1, Training Loss: 0.018, Validation Loss: 0.170, Avg_Positive_Similarity: 0.244, Avg_Negative_Similarity: -0.151
[Epoch 2, Iteration 10] Training Loss: 0.005
[Epoch 2, Iteration 20] Training Loss: 0.008
[Epoch 2, Iteration 30] Training Loss: 0.019
Epoch 2, Training Loss: 0.010, Validation Loss: 0.205, Avg_Positive_Similarity: 0.235, Avg_Negative_Similarity: -0.190
[Epoch 3, Iteration 10] Training Loss: 0.005
[Epoch 3, Iteration 20] Training Loss: 0.005
[Epoch 3, Iteration 30] Training Loss: 0.006
Epoch 3, Training Loss: 0.005, Validation Loss: 0.179, Avg_Positive_Similarity: 0.198, Avg_Negative_Similarity: -0.173
Early stopping !
FINISH.


In [None]:
import matplotlib.pyplot as plt

# visualization of the training process

epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, val_loss, label='Validation Loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

# **Test case**

In [4]:
from Roberta import SupportScoreModel
from transformers import RobertaTokenizer
from Roberta import ContrastiveRoberta

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

contrastive_model = ContrastiveRoberta()
contrastive_model.load_state_dict(torch.load('best_model.pt'))

support_model = SupportScoreModel(contrastive_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
arg = "Due to their wealth, high-income countries have the financial ability and a special obligation to take in refugees."
sup_evi = "According to the World Bank, high-income countries, which account for about 16% of the global population, hold over 75% of the world’s wealth."
bad_evi = "A 2019 study by the Center for Immigration Studies argued that the long-term fiscal impact of refugees on high-income countries can be negative, citing an average net fiscal cost of $60,000 per refugee over a 20-year period."
# tokenization
arg_tokens = tokenizer(arg, return_tensors='pt')
sup_evi_tokens = tokenizer(sup_evi, return_tensors='pt')
bad_evi_tokens = tokenizer(bad_evi, return_tensors='pt')

# compute support score for each evidence
support_model.eval()
with torch.no_grad():
    support_score_1 = support_model(arg_tokens, sup_evi_tokens)
    support_score_2 = support_model(arg_tokens, bad_evi_tokens)

print(f"Support Score 1: {support_score_1.item()}, Support Score 2: {support_score_2.item()}")

Support Score 1: 0.8579477071762085, Support Score 2: -0.4347463846206665
