# BERT Fine Tuning

In [1]:
#!pip install transformers
#!pip install datasets
from datasets import load_dataset
from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, BertModel, BertLMHeadModel


dataset = load_dataset("aslg_pc12")

Found cached dataset aslg_pc12 (C:/Users/Idener/.cache/huggingface/datasets/aslg_pc12/default/0.0.1/7ae5d117644e44ff4d2233f27f1d5df93f429efab7b2ea84828516fcbce48fb4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
def tokenize_function(example):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    input_tokens = tokenizer(example["gloss"], truncation=True, padding="max_length", max_length=512)
    output_tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    return {"input_ids": input_tokens.input_ids, "attention_mask": input_tokens.attention_mask,
            "decoder_input_ids": output_tokens.input_ids, "decoder_attention_mask": output_tokens.attention_mask}

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["gloss", "text"])

Loading cached processed dataset at C:\Users\Idener\.cache\huggingface\datasets\aslg_pc12\default\0.0.1\7ae5d117644e44ff4d2233f27f1d5df93f429efab7b2ea84828516fcbce48fb4\cache-5fce574275826262_*_of_00004.arrow


In [3]:
from transformers import BertConfig, default_data_collator

#decoder_config = BertConfig.from_pretrained('bert-base-uncased', add_cross_attention=True, is_decoder=True)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
"""model_checkpoint = "bert-base-uncased"

# Load the pre-trained BERT models for the encoder and decoder
encoder = BertModel.from_pretrained(model_checkpoint)
decoder = BertLMHeadModel.from_pretrained(model_checkpoint, config=decoder_config)

# Combine the encoder and decoder to create the EncoderDecoderModel
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)"""

encoder_config = BertConfig.from_pretrained("bert-base-uncased")
decoder_config = BertConfig.from_pretrained("bert-base-uncased")
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
decoder_config.return_dict = True  

model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased", encoder_config=encoder_config, decoder_config=decoder_config)

training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="logs",
    logging_steps=100,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_total_limit=2,
    fp16=False,
    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=default_data_collator,
    tokenizer=tokenizer,
)


trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relatio



ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values,encoder_last_hidden_state. For reference, the inputs it received are input_ids,attention_mask,decoder_input_ids,decoder_attention_mask.

In [None]:
def generate_translation(model, inputs):
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    labels = inputs["labels"]

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=input_ids, labels=labels)
    return outputs.loss, outputs.logits

test_file = "test_source_lang.txt"
output_file = "translated_output.txt"

with open(test_file, "r") as f:
    test_lines = f.readlines

with open(output_file, "w", encoding="utf-8") as out_f:
    for i, test_line in enumerate(test_lines):
        translation = generate_translation(model, tokenizer, test_line.strip())
        out_f.write(translation + "\n")
        print(f"Translated line {i + 1}")

In [None]:
###KERNEL KILL
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)