In [10]:
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset

# load the data
train_dataset = load_dataset('csv', data_files='train.csv')
validation_dataset = load_dataset('csv', data_files='val.csv')

train_dataset = train_dataset.rename_column("output", "training")
train_dataset = train_dataset.rename_column("input", "label")
validation_dataset = validation_dataset.rename_column("output", "training")
validation_dataset = validation_dataset.rename_column("input", "label")

# Preprocess the Data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
'''
def preprocess_data(examples):
    # Tokenize the texts
    texts = (examples['training'])
    questions = (examples['label'])
    tokenized_examples = tokenizer(texts, questions, truncation=True, padding=True)
    return tokenized_examples

tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
tokenized_validation_dataset = validation_dataset.map(preprocess_data, batched=True)
'''


def preprocess_data(examples):
    # Tokenize with offset_mapping
    tokenized_inputs = tokenizer(examples['label'], truncation=True, padding=True, return_offsets_mapping=True)
    start_positions = []
    end_positions = []

    for i, input_text in enumerate(examples['label']):
        answer = examples['training'][i]
        start_char = input_text.find(answer)
        end_char = start_char + len(answer)

        # Find the token index corresponding to the start and end characters of the answer
        start_token_index = None
        end_token_index = None

        for idx, (start, end) in enumerate(tokenized_inputs["offset_mapping"][i]):
            if start <= start_char and end >= start_char:
                start_token_index = idx
            if start <= end_char and end >= end_char:
                end_token_index = idx
                break

        # In case the answer is not found in the text
        if start_token_index is None or end_token_index is None:
            start_token_index = 0
            end_token_index = 0

        start_positions.append(start_token_index)
        end_positions.append(end_token_index)

    # Remove offset mappings to avoid issues during training
    tokenized_inputs.pop("offset_mapping")
    tokenized_inputs.update({'start_positions': start_positions, 'end_positions': end_positions})
    return tokenized_inputs


tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
tokenized_validation_dataset = validation_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [3]:
print(validation_dataset)

DatasetDict({
    train: Dataset({
        features: ['feature', 'text'],
        num_rows: 4
    })
})


In [11]:
# Fine-Tuning
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"  # Evaluate at the end of each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset['train'],
    eval_dataset=tokenized_validation_dataset['train'],
)

trainer.train()

# Step 4: Save & Load the Model
model_path = "BERTdistil"
model.save_pretrained(model_path)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 11:07:56] Energy consumed for RAM : 0.001050 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 11:07:56] Energy consumed for all CPUs : 0.007439 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 11:07:56] 0.008489 kWh of electricity used since the beginning.
[codecarbon INFO @ 11:07:57] [setup] RAM Tracking...
[codecarbon INFO @ 11:07:57] [setup] GPU Tracking...
[codecarbon INFO @ 11:07:57] No GPU found.
[codecarbon INFO @ 11:07:57] [setup] CPU Tracking...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the envir

  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: too many dimensions 'str'