In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file it contains.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/haik-24-question-answering/sample_submission.csv
/kaggle/input/haik-24-question-answering/train.json
/kaggle/input/haik-24-question-answering/test.json


# **RoBERTa**

In [15]:
import json
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Load training data.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default text encoding.
with open('/kaggle/input/haik-24-question-answering/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

def pad_options(options, target_length=5, placeholder="No answer"):
    """Return a new list: ``options`` followed by enough ``placeholder`` fillers.

    Args:
        options: List of answer options. It is not modified.
        target_length: Number of entries the result should reach.
        placeholder: Value appended for every missing entry.

    Returns:
        A new list. When ``options`` already has ``target_length`` entries or
        more, the result is an unpadded copy (it is never shortened).
    """
    missing = target_length - len(options)
    # range() of a non-positive count is empty, so over-long inputs get no filler.
    filler = [placeholder for _ in range(missing)]
    return options + filler

# Apply padding to all items in train_data
for item in train_data:
    item['options'] = pad_options(item['options'])

# Verify the result
for item in train_data:
    if len(item['options']) != 5:
        # Raw records keep the question text under a key that starts with
        # 'question ' rather than under a plain 'question' key, so it is looked
        # up defensively: a failed check must report the offending question
        # instead of dying with a KeyError while building the message.
        question_text = item.get(
            'question',
            next((item[key] for key in item if key.startswith('question ')), '<unknown>'),
        )
        raise AssertionError(f"Question {question_text} does not have 5 options")

print("All questions now have exactly 5 options.")


All questions now have exactly 5 options.


In [1]:
import json
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Load training data.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default text encoding.
with open('/kaggle/input/haik-24-question-answering/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

def pad_options(options, target_length=5, placeholder="No answer"):
    """Return a new list: ``options`` followed by enough ``placeholder`` fillers.

    Args:
        options: List of answer options. It is not modified.
        target_length: Number of entries the result should reach.
        placeholder: Value appended for every missing entry.

    Returns:
        A new list. When ``options`` already has ``target_length`` entries or
        more, the result is an unpadded copy (it is never shortened).
    """
    missing = target_length - len(options)
    # range() of a non-positive count is empty, so over-long inputs get no filler.
    filler = [placeholder for _ in range(missing)]
    return options + filler

# Apply padding to all items in train_data
for item in train_data:
    item['options'] = pad_options(item['options'])

# Verify the result
for item in train_data:
    if len(item['options']) != 5:
        # Raw records keep the question text under a key that starts with
        # 'question ' rather than under a plain 'question' key, so it is looked
        # up defensively: a failed check must report the offending question
        # instead of dying with a KeyError while building the message.
        question_text = item.get(
            'question',
            next((item[key] for key in item if key.startswith('question ')), '<unknown>'),
        )
        raise AssertionError(f"Question {question_text} does not have 5 options")

print("All questions now have exactly 5 options.")

# Load the test split. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's locale default.
with open('/kaggle/input/haik-24-question-answering/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def preprocess_data(data):
    """Normalize raw question records into a uniform dict layout.

    Each raw record is expected to carry its question text under a key that
    starts with ``'question '``, plus an ``'options'`` list and, for labelled
    data, a ``'label'``.

    Args:
        data: Iterable of raw record dicts.

    Returns:
        A list with one dict per record, holding:
          * ``'question_id'`` - the original key the question text was stored under;
          * ``'question'`` - the question text;
          * ``'options'`` - the record's options (the same list object, not a copy);
          * ``'label'`` - the record's label, or ``None`` when it has none (test data).

    Raises:
        KeyError: If a record has no key starting with ``'question '`` or no
            ``'options'`` entry.
    """
    processed_data = []
    for position, item in enumerate(data):
        # A default of None avoids leaking a bare StopIteration to the caller.
        question_key = next((key for key in item if key.startswith('question ')), None)
        if question_key is None:
            raise KeyError(
                f"record {position} has no key starting with 'question '; "
                f"found keys: {sorted(item)}"
            )
        processed_data.append({
            # Keep the identifier: it is otherwise lost once the text is re-keyed.
            'question_id': question_key,
            'question': item[question_key],
            'options': item['options'],
            'label': item.get('label'),  # None for test data
        })
    return processed_data

# Normalize both splits into records keyed by 'question', 'options' and 'label'.
# NOTE: both names are rebound to the processed lists, so this cell cannot be
# re-run without reloading the JSON first - the processed records expose the
# text under 'question', which preprocess_data's 'question ' key lookup no
# longer matches.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

class QADataset(Dataset):
    """Multiple-choice dataset that tokenizes one (question, option) pair per option.

    Args:
        data: Sequence of dicts with ``'question'`` (str), ``'options'`` (list of
            str) and optionally ``'label'`` (1-indexed position of the correct
            option; ``None`` or absent for unlabelled data).
        tokenizer: Callable invoked as ``tokenizer(questions, options, ...)`` that
            returns a mapping of tensors.
        max_length: Length every tokenized pair is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of questions."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize question ``idx`` against each of its options.

        Returns:
            A dict of the tokenizer's tensors, each keeping a leading dimension
            of ``len(options)``, plus ``'labels'`` (a 0-indexed scalar tensor)
            when the item has a label.
        """
        item = self.data[idx]
        question = item['question']
        options = item['options']

        # The question is repeated so that it is paired with every option.
        encoded = self.tokenizer(
            [question] * len(options),
            options,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Keep the per-option leading dimension as returned. Squeezing dim 0 does
        # nothing for two or more options, but would drop the option dimension
        # for a single-option item and hand the model a wrongly shaped example.
        inputs = {name: tensor for name, tensor in encoded.items()}

        label = item.get('label')
        if label is not None:
            inputs['labels'] = torch.tensor(label - 1)  # Subtract 1 as labels are 1-indexed

        return inputs

# Hold out 20% of the training records for validation, with a fixed seed.
# NOTE: train_data is overwritten with the smaller training portion, so
# re-running this cell splits the already-reduced list again.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a QADataset that shares the one tokenizer.
train_dataset, val_dataset, test_dataset = (
    QADataset(split, tokenizer) for split in (train_data, val_data, test_data)
)

2024-06-28 11:55:17.933652: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 11:55:17.933756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 11:55:18.068150: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


All questions now have exactly 5 options.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
# Seed torch's global RNG before building the model. The checkpoint lacks the
# multiple-choice classifier and pooler weights, so they are freshly initialized
# when the model is built; without a seed they differ from run to run.
# 42 matches the seed used for the train/validation split.
torch.manual_seed(42)

# Initialize model
model = AutoModelForMultipleChoice.from_pretrained("roberta-base")

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # NOTE(review): the logged run finished after 900 steps in total, so a
    # 500-step warmup covers more than half of training - confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    # Evaluation and checkpointing share the same 100-step cadence, so every
    # evaluated step has a checkpoint that load_best_model_at_end can restore.
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    # Must match the key returned by compute_metrics.
    metric_for_best_model="accuracy",
)

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: Object exposing ``predictions`` (per-option scores, one row
            per example) and ``label_ids`` (the true option indices).

    Returns:
        ``{"accuracy": <float>}`` - the fraction of rows whose highest-scoring
        option equals the label.
    """
    scores, gold = eval_pred.predictions, eval_pred.label_ids
    picked = np.argmax(scores, axis=1)
    hits = (picked == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

# Wire the model, training arguments, datasets and metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune; the returned training summary is this cell's displayed output.
trainer.train()

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
100,1.5957,1.603777,0.375
200,1.4241,1.354814,0.4025
300,1.3506,1.334407,0.43375
400,1.4315,1.388075,0.39875
500,1.3905,1.311556,0.43
600,1.4538,1.300339,0.45125
700,1.4026,1.291692,0.49
800,1.3763,1.267532,0.48375
900,1.368,1.265761,0.4975




TrainOutput(global_step=900, training_loss=1.3960234525468613, metrics={'train_runtime': 2292.4092, 'train_samples_per_second': 3.141, 'train_steps_per_second': 0.393, 'total_flos': 9471912947712000.0, 'train_loss': 1.3960234525468613, 'epoch': 1.0})

# **Submission**

In [3]:
# Pad every test item's option list, using the same padding as the training data.
for item in test_data:
    padded = pad_options(item['options'])
    item['options'] = padded

# Confirm that each test item now carries exactly five options.
for item in test_data:
    option_count = len(item['options'])
    assert option_count == 5, f"Question {item['question']} does not have 5 options"

print("All questions now have exactly 5 options.")

All questions now have exactly 5 options.


In [4]:
# Rebuild the test dataset so it wraps the test items as they are after padding.
test_dataset = QADataset(data=test_data, tokenizer=tokenizer)

In [5]:
# Predict on test set
predictions = trainer.predict(test_dataset)

# Work on a float copy so masking leaves `predictions` itself untouched.
option_scores = np.array(predictions.predictions, dtype=np.float64)

# Options equal to the padding placeholder are filler, not real answers. Rule
# them out before taking the argmax, so a label can never point at an option
# the question does not have.
# NOTE(review): assumes no genuine option is literally "No answer" - confirm against the data.
is_placeholder = np.array(
    [[option == "No answer" for option in item['options']] for item in test_data],
    dtype=bool,
)
if is_placeholder.shape == option_scores.shape:
    option_scores[is_placeholder] = -np.inf
predicted_labels = np.argmax(option_scores, axis=1) + 1  # Add 1 to make it 1-indexed

# Create submission file.
# Use each item's own 'question_id' when it has one; otherwise fall back to the
# positional id.
# NOTE(review): the positional fallback assumes the test ids are 'question 0' ..
# 'question n-1' in file order - confirm against sample_submission.csv.
submission = pd.DataFrame({
    'question': [item.get('question_id', f'question {i}') for i, item in enumerate(test_data)],
    'label': predicted_labels
})

submission.to_csv('submission.csv', index=False)

In [None]:
# NOTE(review): this cell repeats the prediction/submission cell directly above
# it; one of the two can be dropped.
# Predict on test set
predictions = trainer.predict(test_dataset)

# Work on a float copy so masking leaves `predictions` itself untouched.
option_scores = np.array(predictions.predictions, dtype=np.float64)

# Options equal to the padding placeholder are filler, not real answers. Rule
# them out before taking the argmax, so a label can never point at an option
# the question does not have.
# NOTE(review): assumes no genuine option is literally "No answer" - confirm against the data.
is_placeholder = np.array(
    [[option == "No answer" for option in item['options']] for item in test_data],
    dtype=bool,
)
if is_placeholder.shape == option_scores.shape:
    option_scores[is_placeholder] = -np.inf
predicted_labels = np.argmax(option_scores, axis=1) + 1  # Add 1 to make it 1-indexed

# Create submission file.
# Use each item's own 'question_id' when it has one; otherwise fall back to the
# positional id.
# NOTE(review): the positional fallback assumes the test ids are 'question 0' ..
# 'question n-1' in file order - confirm against sample_submission.csv.
submission = pd.DataFrame({
    'question': [item.get('question_id', f'question {i}') for i, item in enumerate(test_data)],
    'label': predicted_labels
})

submission.to_csv('submission.csv', index=False)
