In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Checkpoint identifier shared by the tokenizer and the model so both are
# loaded from the same pretrained source.
model_name = "facebook/bart-base"

# Load the tokenizer first, then the seq2seq model, from that checkpoint.
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [2]:
# Load the parallel corpus. Each non-blank line is expected to hold one source
# sentence and one target sentence separated by a single '@'.
with open('eng-slg.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

source_texts, target_texts = [], []
for line_no, line in enumerate(lines, start=1):
    # Blank lines (e.g. extra empty lines at the end of the file) carry no
    # sentence pair; skip them instead of failing on the unpack below.
    if not line.strip():
        continue

    parts = line.split('@')
    if len(parts) != 2:
        # Fail loudly, but say where: a bare "not enough values to unpack"
        # gives no hint about which line of the corpus is malformed.
        raise ValueError(
            f"eng-slg.txt line {line_no}: expected exactly one '@' separator, "
            f"found {len(parts) - 1}: {line!r}"
        )

    source, target = parts
    source_texts.append(source.strip())
    target_texts.append(target.strip())

# Quick visual sanity check of the first few pairs.
print(source_texts[0:10])
print(target_texts[0:10])

['membership of parliament see minutes', 'approval of minutes of previous sitting see minutes', 'membership of parliament see minutes', 'verification of credentials see minutes', 'documents received see minutes', 'written statements and oral questions tabling see minutes', 'petitions see minutes', 'texts of agreements forwarded by the council see minutes', "action taken on parliament's resolutions see minutes", 'agenda for next sitting see minutes']
['MEMBERSHIP PARLIAMENT SEE MINUTE', 'APPROVAL MINUTE DESC-PREVIOUS SIT SEE MINUTE', 'MEMBERSHIP PARLIAMENT SEE MINUTE', 'VERIFICATION CREDENTIALS SEE MINUTE', 'DOCUMENT RECEIVE SEE MINUTE', 'WRITE STATEMENT AND DESC-ORAL QUESTION TABLE SEE MINUTE', 'PETITION SEE MINUTE', 'TEXT AGREEMENT DESC-FORWARD BY COUNCIL SEE MINUTE', 'ACTION TAKE ON PARLIAMENT X-POSS RESOLUTION SEE MINUTE', 'AGENDA FOR NEXT SIT SEE MINUTE']


In [3]:
# Tokenize both sides of the corpus with the same tokenizer and identical
# options (truncation and padding enabled) so the two calls stay in sync.
tokenizer_options = {'truncation': True, 'padding': True}
input_encodings = tokenizer(source_texts, **tokenizer_options)
target_encodings = tokenizer(target_texts, **tokenizer_options)

In [4]:
import torch

class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source sequences with target labels.

    Args:
        input_encodings: Mapping of field name (e.g. ``'input_ids'``,
            ``'attention_mask'``) to a per-example sequence of token-level
            values for the source side. Must contain ``'input_ids'``.
        target_encodings: Mapping with the same layout for the target side.
            Must contain ``'input_ids'``; ``'attention_mask'`` is optional.

    Each item is a dict holding every field of ``input_encodings`` as a tensor
    plus a ``'labels'`` tensor built from the target ``'input_ids'``. When the
    target side provides an ``'attention_mask'``, label positions where the
    mask is 0 (padding) are replaced by ``IGNORE_INDEX`` so that padding does
    not contribute to the training loss.
    """

    # Label value skipped by PyTorch's cross-entropy loss by default
    # (``ignore_index=-100``).
    IGNORE_INDEX = -100

    def __init__(self, input_encodings, target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.input_encodings.items()}

        labels = torch.tensor(self.target_encodings['input_ids'][idx])
        if 'attention_mask' in self.target_encodings:
            # Mask out padded target positions; otherwise the model would be
            # trained to predict pad tokens after the end of every sentence.
            target_mask = torch.tensor(self.target_encodings['attention_mask'][idx])
            labels = labels.masked_fill(target_mask == 0, self.IGNORE_INDEX)
        item['labels'] = labels
        return item

    def __len__(self):
        """Return the number of examples (length of the source ``'input_ids'``)."""
        # Key access (rather than attribute access) matches __getitem__ and
        # works for plain dicts as well as tokenizer outputs.
        return len(self.input_encodings['input_ids'])

# Wrap the tokenized source/target encodings in the dataset used for training.
dataset = TranslationDataset(
    input_encodings=input_encodings,
    target_encodings=target_encodings,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: where artifacts go, followed by the schedule and
# optimizer settings.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=32,
    warmup_steps=500,   # learning-rate scheduler warmup
    # Regularization
    weight_decay=0.01,
)

# The trainer ties the model, the arguments above, and the training dataset together.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

trainer.train()



In [None]:
# Save the fine-tuned model together with the tokenizer files so that the
# directory can later be loaded with from_pretrained() on its own, without
# having to remember which base checkpoint the tokenizer came from.
save_dir = './my_bart_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)