# BART
BART is a denoising autoencoder for pretraining sequence-to-sequence models. 

BART uses a standard Transformer architecture (Encoder-Decoder) and is a combination of BERT, which is only encoder-model and GPT, which is a decoder-only model.</br>
# Pre-Training BART
BART is pre-trained by minimizing the cross-entropy loss between the decoder output and the original sequence.

# Set up the Environment

In [1]:
import torch
print(torch.__version__)

1.9.0+cu102


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 7.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.7 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 65.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 76.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [None]:
# Some helper functions here

In [4]:
def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""
    prev_output_tokens = input_ids.clone()
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens

In [5]:
def convert_to_features(example_batch):
  input_encodings=tokenizer.batch_encode_plus(example_batch['input_text'],pad_to_max_length=True, max_length=1024, truncation=True)
  target_encodings=tokenizer.batch_encode_plus(example_batch['target_text'],pad_to_max_length=True,max_length=64, truncation=True)

  labels=torch.tensor(target_encodings['input_ids'])
  print(len(labels))
  decoder_input_ids=shift_tokens_right(labels,model.config.pad_token_id)
  labels[labels[:,:]==model.config.pad_token_id]=-100
  encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': decoder_input_ids,
        'labels': labels,
    }
  return encodings

In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler,TensorDataset

In [8]:
class qaDataset(Dataset):
    def __init__(self, in_id, labels, de_id, atten_mask):
        self.input_ids = in_id
        self.labels=labels
        self.decoder_input_id=de_id
        self.attention_mask=atten_mask

    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        input_id = self.input_ids[idx]
        labels=self.labels[idx]
        decoder_input_id=self.decoder_input_id[idx]
        attention_mask=self.attention_mask[idx]
        sample = {'input_ids': input_id,
                  'attention_mask': attention_mask,
                  'decoder_input_ids': decoder_input_id,
                  'labels': labels}
        return sample

In [9]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import Trainer, TrainingArguments

In [10]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [11]:
q_train = pd.read_csv("/content/drive/MyDrive/newqa_train.csv")

In [12]:
q_test = pd.read_csv("/content/drive/MyDrive/newqa_test.csv")

In [13]:
input_encodings=tokenizer.batch_encode_plus([t for t in q_train['in_text']],pad_to_max_length=True, max_length=1024, truncation=True,return_tensors='pt')
target_encodings=tokenizer.batch_encode_plus([str(t) for t in q_train['out_text']],pad_to_max_length=True, max_length=64, truncation=True,return_tensors='pt')



In [14]:
labels=target_encodings['input_ids']
decoder_input_ids=shift_tokens_right(labels,model.config.pad_token_id)
labels[labels[:,:]==model.config.pad_token_id]=-100
encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': decoder_input_ids,
        'labels': labels,
}

In [15]:
train_dataset=qaDataset(encodings['input_ids'],
                        encodings['labels'],
                        encodings['decoder_input_ids'],
                        encodings['attention_mask'])

In [16]:
t_input_encodings=tokenizer.batch_encode_plus([t for t in q_test['in_text']],pad_to_max_length=True, max_length=1024, truncation=True,return_tensors='pt')
t_target_encodings=tokenizer.batch_encode_plus([str(t) for t in q_test['out_text']],pad_to_max_length=True, max_length=64, truncation=True,return_tensors='pt')



In [17]:
t_labels=t_target_encodings['input_ids']
t_decoder_input_ids=shift_tokens_right(t_labels,model.config.pad_token_id)
t_labels[t_labels[:,:]==model.config.pad_token_id]=-100
t_encodings = {
        'input_ids': t_input_encodings['input_ids'],
        'attention_mask': t_input_encodings['attention_mask'],
        'decoder_input_ids': t_decoder_input_ids,
        'labels': t_labels,
}

In [18]:
test_dataset=qaDataset(t_encodings['input_ids'],
                       t_encodings['labels'],
                       t_encodings['decoder_input_ids'],
                       t_encodings['attention_mask'])

In [19]:
training_args=TrainingArguments(
    output_dir='./model/bart_generator',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
)


In [20]:
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 14011
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 14011


Step,Training Loss
500,4.311
1000,4.1324
1500,4.2154
2000,3.9078
2500,3.799
3000,3.4773
3500,3.4328
4000,3.3364
4500,3.3131
5000,3.4422


Saving model checkpoint to ./model/bart_generator/checkpoint-500
Configuration saved in ./model/bart_generator/checkpoint-500/config.json
Model weights saved in ./model/bart_generator/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./model/bart_generator/checkpoint-1000
Configuration saved in ./model/bart_generator/checkpoint-1000/config.json
Model weights saved in ./model/bart_generator/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./model/bart_generator/checkpoint-1500
Configuration saved in ./model/bart_generator/checkpoint-1500/config.json
Model weights saved in ./model/bart_generator/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./model/bart_generator/checkpoint-2000
Configuration saved in ./model/bart_generator/checkpoint-2000/config.json
Model weights saved in ./model/bart_generator/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./model/bart_generator/checkpoint-2500
Configuration saved in ./model/bart_generator/checkpoint-

TrainOutput(global_step=14011, training_loss=3.02092201767093, metrics={'train_runtime': 10306.9053, 'train_samples_per_second': 1.359, 'train_steps_per_second': 1.359, 'total_flos': 3.2261008991453184e+16, 'train_loss': 3.02092201767093, 'epoch': 1.0})

In [22]:
torch.save(model.state_dict(), '/content/drive/MyDrive/trained_BART.pt')

In [23]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 4664
  Batch size = 1


{'epoch': 1.0,
 'eval_loss': 2.4169747829437256,
 'eval_runtime': 775.8637,
 'eval_samples_per_second': 6.011,
 'eval_steps_per_second': 6.011}

In [24]:
test_qa=pd.read_csv('/content/drive/MyDrive/test_qa.csv')

In [25]:
tokenized_test=tokenizer([t for t in test_qa['X']],truncation=True,padding=True, return_tensors='pt')

In [26]:
tokenized_test=tokenized_test.to(device)

In [27]:
tokenized_test['input_ids'][0]

tensor([   0, 2264,   16,   10, 6929,  116,    2,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [61]:
to_test=tokenized_test['input_ids'][0]
to_test=to_test.unsqueeze(dim=0)
to_test.shape

torch.Size([1, 181])

In [67]:
test_qa['Y'][1]

'avg_pool3d'

In [28]:
import os

In [34]:
os.makedirs("/content/drive/MyDrive/dataset/predictions", exist_ok=True)
with open(f"/content/drive/MyDrive/dataset/predictions/predictions_4.txt", "w") as f:
  pred_model=[]
  for i,t in enumerate(tokenized_test['input_ids']):
    to_test=t
    to_test=to_test.unsqueeze(dim=0)
    truth=test_qa['Y'][i]
    preds = model.generate(to_test,num_beams=4, max_length=10, early_stopping=True)
    question=test_qa['X'][i]
    f.write("Quetion : " + str(question) + "\n\n")

    f.write("Truth: "+ truth + "\n\n")
    pred=str([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in preds])
    pred_model.append(pred)
    f.write("Prediction: ")
    f.write(pred)
    f.write("\n ________________________________________________________________________________\n")


In [36]:
torch.save(pred_model,'/content/drive/MyDrive/pred_model.pt')

In [37]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.10-py3-none-any.whl (59 kB)
[?25l[K     |█████▌                          | 10 kB 34.2 MB/s eta 0:00:01[K     |███████████                     | 20 kB 33.6 MB/s eta 0:00:01[K     |████████████████▍               | 30 kB 17.2 MB/s eta 0:00:01[K     |██████████████████████          | 40 kB 16.3 MB/s eta 0:00:01[K     |███████████████████████████▍    | 51 kB 8.7 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 4.6 MB/s 
Installing collected packages: bert-score
Successfully installed bert-score-0.3.10


In [38]:
from bert_score import score

cands = [line.strip() for line in test_qa['Y']]

refs = [line.strip() for line in pred_model]

(P, R, F), hashname = score(cands, refs, lang="en", return_hash=True)
print(f"{hashname}: P={P.mean().item():.6f} R={R.mean().item():.6f} F={F.mean().item():.6f}")

Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/roberta-large/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp3pi1j7bg


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
creating metadata file for /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "h

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
creating metadata file for /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
https://huggingface.co/roberta-large/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpxhu2muqj


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/roberta-large/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpbe1ma8d2


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
creating metadata file for /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-large/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/robe

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
creating metadata file for /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
loading weights file https://huggingface.co/roberta-large/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are ini

roberta-large_L17_no-idf_version=0.3.10(hug_trans=4.10.2): P=0.825757 R=0.808534 F=0.816605


In [31]:
to_test=tokenized_test['input_ids'][0]
to_test=to_test.unsqueeze(dim=0)

preds = model.generate(to_test,num_beams=4, max_length=10, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=False, clean_up_tokenization_spaces=False) for g in preds])


['</s><s><s><s>torch.nn.</s>']


In [65]:
print([tokenizer.decode(g, skip_special_tokens=False, clean_up_tokenization_spaces=False) for g in preds])

['</s><s>Label</s>']


In [63]:
preds.shape

torch.Size([1, 4])

In [64]:
preds

tensor([[    2,     0, 47895,     2]], device='cuda:0')