## Install Necessary Libraries


In [58]:
!pip install transformers datasets accelerate torch --quiet

 ## Load the Dataset

In [59]:
from datasets import load_dataset

# Load Bengali SQuAD and English SQuAD datasets
bengali_squad = load_dataset("csebuetnlp/squad_bn")
english_squad = load_dataset("squad")

# Use a subset for faster fine-tuning
bengali_train = bengali_squad["train"].select(range(500))  # Use 500 samples
bengali_valid = bengali_squad["validation"].select(range(100))  # Use 100 samples
english_train = english_squad["train"].select(range(500))  # Use 500 samples
english_valid = english_squad["validation"].select(range(100))  # Use 100 samples


## Preprocess the Data

In [60]:
from transformers import AutoTokenizer

# Load XLM-RoBERTa tokenizer (or replace with "LLaMA" tokenizer if needed)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def preprocess_function(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    start_positions = []
    end_positions = []

    for i in range(len(examples["answers"])):
        if len(examples["answers"][i]["text"]) > 0:
            start_char = examples["answers"][i]["answer_start"][0]
            end_char = start_char + len(examples["answers"][i]["text"][0])
            start_token_index = tokenized_examples.char_to_token(i, start_char)
            end_token_index = tokenized_examples.char_to_token(i, end_char - 1)
            start_positions.append(start_token_index if start_token_index is not None else 0)
            end_positions.append(end_token_index if end_token_index is not None else 0)
        else:
            start_positions.append(0)
            end_positions.append(0)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Preprocess datasets
bengali_train_tokenized = bengali_train.map(preprocess_function, batched=True, remove_columns=bengali_train.column_names)
bengali_valid_tokenized = bengali_valid.map(preprocess_function, batched=True, remove_columns=bengali_valid.column_names)
english_train_tokenized = english_train.map(preprocess_function, batched=True, remove_columns=english_train.column_names)
english_valid_tokenized = english_valid.map(preprocess_function, batched=True, remove_columns=english_valid.column_names)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Fine-Tune the Model

In [61]:
from transformers import AutoTokenizer

# Load XLM-RoBERTa tokenizer (or replace with "LLaMA" tokenizer if needed)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def preprocess_function(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    start_positions = []
    end_positions = []

    for i in range(len(examples["answers"])):
        if len(examples["answers"][i]["text"]) > 0:
            start_char = examples["answers"][i]["answer_start"][0]
            end_char = start_char + len(examples["answers"][i]["text"][0])
            start_token_index = tokenized_examples.char_to_token(i, start_char)
            end_token_index = tokenized_examples.char_to_token(i, end_char - 1)
            start_positions.append(start_token_index if start_token_index is not None else 0)
            end_positions.append(end_token_index if end_token_index is not None else 0)
        else:
            start_positions.append(0)
            end_positions.append(0)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Preprocess datasets
bengali_train_tokenized = bengali_train.map(preprocess_function, batched=True, remove_columns=bengali_train.column_names)
bengali_valid_tokenized = bengali_valid.map(preprocess_function, batched=True, remove_columns=bengali_valid.column_names)
english_train_tokenized = english_train.map(preprocess_function, batched=True, remove_columns=english_train.column_names)
english_valid_tokenized = english_valid.map(preprocess_function, batched=True, remove_columns=english_valid.column_names)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [62]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load XLM-RoBERTa model for question answering
model = AutoModelForQuestionAnswering.from_pretrained("xlm-roberta-base")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
)

# Combine datasets
from datasets import concatenate_datasets
train_dataset = concatenate_datasets([bengali_train_tokenized, english_train_tokenized])
valid_dataset = concatenate_datasets([bengali_valid_tokenized, english_valid_tokenized])

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train the model
trainer.train()

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.863244
2,No log,0.865622
3,No log,2.374269


TrainOutput(global_step=375, training_loss=1.1677897135416666, metrics={'train_runtime': 192.0823, 'train_samples_per_second': 15.618, 'train_steps_per_second': 1.952, 'total_flos': 195972567552000.0, 'train_loss': 1.1677897135416666, 'epoch': 3.0})

## Save the Fine-Tuned Model

In [63]:
# Save fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_multilingual")
tokenizer.save_pretrained("./fine_tuned_multilingual")

('./fine_tuned_multilingual/tokenizer_config.json',
 './fine_tuned_multilingual/special_tokens_map.json',
 './fine_tuned_multilingual/sentencepiece.bpe.model',
 './fine_tuned_multilingual/added_tokens.json',
 './fine_tuned_multilingual/tokenizer.json')

## Perform Inference

In [64]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
qa_pipeline = pipeline("question-answering", model="./fine_tuned_multilingual", tokenizer="./fine_tuned_multilingual")

# Bangla QA
bangla_context = "বাংলাদেশের রাজধানী ঢাকা।"
bangla_question = "বাংলাদেশের রাজধানী কী?"
print(qa_pipeline(question=bangla_question, context=bangla_context))

# English QA
english_context = "The capital of Bangladesh is Dhaka."
english_question = "What is the capital of Bangladesh?"
print(qa_pipeline(question=english_question, context=english_context))


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.2663009464740753, 'start': 19, 'end': 24, 'answer': 'ঢাকা।'}
{'score': 0.2433495968580246, 'start': 29, 'end': 35, 'answer': 'Dhaka.'}
