In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForMultipleChoice
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, average_precision_score

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


In [3]:
train_data = pd.read_csv('train.csv')  # Update with your actual data file
test_data = pd.read_csv('test.csv')    # Update with your actual data file

In [25]:
train_ds = Dataset.from_pandas(train_data)

Downloading (…)okenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29.0/29.0 [00:00<00:00, 37.3kB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 2.65MB/s]
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 213k/213k [00:00<00:00, 8.67MB/s]
Downloading (…)/main/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 436k/436k [00:00<00:00, 7.39MB/s]


In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [26]:
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # tokenizer will turn text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 323.82 examples/s]


In [27]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [29]:
model = AutoModelForMultipleChoice.from_pretrained('bert-base-cased')

Downloading pytorch_model.bin: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 436M/436M [00:05<00:00, 79.5MB/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing

In [30]:
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none'
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.43236
2,No log,1.212898
3,No log,1.037825


TrainOutput(global_step=150, training_loss=1.4346463012695312, metrics={'train_runtime': 315.5364, 'train_samples_per_second': 1.902, 'train_steps_per_second': 0.475, 'total_flos': 134699317591920.0, 'train_loss': 1.4346463012695312, 'epoch': 3.0})

In [33]:
predictions = trainer.predict(tokenized_train_ds)

In [34]:
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

predictions_to_map_output(predictions.predictions)

array(['D B E', 'D A B', 'A C D', 'A C B', 'D E A', 'C B A', 'A C B',
       'B D E', 'C B D', 'A C E', 'E B A', 'D A B', 'C B A', 'E D A',
       'B A E', 'B C A', 'E B A', 'E B A', 'A D E', 'E A D', 'B D C',
       'D E C', 'C A B', 'C A B', 'E A B', 'E D C', 'A D C', 'D C B',
       'E B A', 'C B E', 'B D E', 'E B D', 'E B D', 'D E B', 'E C D',
       'B A E', 'A D E', 'A D B', 'E D A', 'E A C', 'E B D', 'B D E',
       'B C A', 'D C E', 'D C B', 'A B D', 'B C A', 'C E B', 'B E D',
       'B A E', 'B D E', 'E D B', 'C A E', 'A C E', 'B A D', 'B E D',
       'C A D', 'C B A', 'D A E', 'B A C', 'B D E', 'E C B', 'A C D',
       'C A B', 'A D E', 'E C D', 'C D A', 'E B D', 'D C E', 'D E C',
       'C B A', 'A E B', 'D A C', 'B D C', 'D C E', 'C E B', 'B A C',
       'B C E', 'C D B', 'A E D', 'D C E', 'A E C', 'C E D', 'A C B',
       'C B D', 'C B E', 'D E B', 'D B C', 'E A C', 'D B C', 'D B A',
       'B D C', 'E B C', 'E D B', 'D E B', 'C A D', 'C D A', 'D B E',
       'A C E', 'C D

In [35]:
test_data['answer'] = 'A'
test_ds = Dataset.from_pandas(test_data)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1448.92 examples/s]


In [36]:
test_predictions = trainer.predict(tokenized_test_ds)

In [38]:
submission_df = test_data[['id']]
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)


Unnamed: 0,id,prediction
0,0,D B E
1,1,D A B
2,2,A C D
3,3,A C B
4,4,D E A


In [39]:
submission_df.to_csv('submission.csv', index=False)