### Training Notebook
- https://www.kaggle.com/code/ravaghi/automated-essay-scoring-debertav3-training

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    AutoTokenizer,
    Trainer,
    set_seed
)
from tokenizers import AddedToken
from datasets import Dataset
import pandas as pd
import numpy as np
import warnings
import random
import torch
import glob
import os
import re

# Install a global 'ignore' filter: every Python warning raised after this
# point (including NumPy RuntimeWarnings) is suppressed for the whole session.
warnings.simplefilter('ignore')

2024-05-01 21:32:54.080279: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 21:32:54.080383: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 21:32:54.225519: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Root directory that is globbed for the per-fold model checkpoints.
CHECKPOINT_PATH = '/kaggle/input/automated-essay-scoring-pretrained-debertav3/deberta-v3-xsmall/'

# Default truncation length, in tokens, handed to the tokenizer.
MAX_TOKEN_LENGTH = 1024

# Seed passed to the Python, NumPy, PyTorch and transformers seeding calls.
SEED = 27

In [3]:
def seed_everything(seed):
    """Apply ``seed`` to every random number generator this notebook touches.

    Exports ``PYTHONHASHSEED`` to the environment and seeds Python's
    ``random`` module, NumPy, PyTorch (CPU and current CUDA device) and the
    transformers library.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    set_seed(seed)


seed_everything(SEED)

In [4]:
# Competition test set; the tokenization step reads its 'essay_id' and
# 'full_text' columns.
TEST_CSV_PATH = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
test = pd.read_csv(TEST_CSV_PATH)

In [5]:
def sort_key(path):
    """Return the fold number embedded in ``path``, for use as a sort key.

    The first ``fold<digits>`` occurrence in ``path`` is parsed as an integer.
    Paths that contain no such marker map to 0.
    """
    fold_match = re.search(r'fold(\d+)', path)
    return int(fold_match.group(1)) if fold_match else 0

# Collect every path matching '*fold*/*checkpoint*' under CHECKPOINT_PATH and
# order the matches by the fold number parsed from the path.
checkpoints = sorted(glob.glob(CHECKPOINT_PATH + '*fold*/*checkpoint*'), key=sort_key)

# Fail fast when nothing matched. Otherwise the prediction loop would not run,
# np.mean over the empty prediction list would yield NaN, and - because
# warnings are silenced in this notebook - a meaningless submission would be
# written without any visible sign of the problem.
if not checkpoints:
    raise FileNotFoundError(
        f"No checkpoints matching '*fold*/*checkpoint*' found under {CHECKPOINT_PATH!r}"
    )

In [6]:
class Tokenizer:
    """Tokenize the essays of a test DataFrame with a checkpoint's tokenizer.

    Parameters
    ----------
    test_df : DataFrame that is converted to a ``datasets.Dataset``; its
        'essay_id' and 'full_text' columns are dropped after tokenization.
    model_name : name or path handed to ``AutoTokenizer.from_pretrained``.
    max_length : truncation limit in tokens (defaults to MAX_TOKEN_LENGTH).
    """

    def __init__(self, test_df, model_name, max_length=MAX_TOKEN_LENGTH):
        self.test_df = test_df
        self.max_token_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Register the newline character as an extra token, matched against
        # the un-normalized text (presumably so that line breaks in the essays
        # are kept as tokens - confirm against the training notebook).
        newline_token = AddedToken('\n', normalized=False)
        self.tokenizer.add_tokens([newline_token])

    def tokenizer_fn(self, sample):
        """Encode ``sample['full_text']``, truncating to ``max_token_length``."""
        encode_kwargs = {'truncation': True, 'max_length': self.max_token_length}
        return self.tokenizer(sample['full_text'], **encode_kwargs)

    def tokenize(self):
        """Return ``(tokenized_dataset, tokenizer)`` for the stored DataFrame.

        The DataFrame is converted to a ``Dataset``, encoded in batches with
        ``tokenizer_fn``, and stripped of its 'essay_id' and 'full_text'
        columns.
        """
        raw_ds = Dataset.from_pandas(self.test_df)
        encoded_ds = raw_ds.map(self.tokenizer_fn, batched=True)
        encoded_ds = encoded_ds.remove_columns(['essay_id', 'full_text'])
        return encoded_ds, self.tokenizer

In [7]:
# The inference settings are the same for every checkpoint, so they are built
# once here rather than re-created on each loop iteration.
training_args = TrainingArguments(
    output_dir='.',
    per_device_eval_batch_size=1,
    report_to='none',
    fp16=True
)

# One prediction array per checkpoint, in checkpoint order.
tmp_preds = []
for checkpoint in checkpoints:
    # Tokenize the test set with the tokenizer stored alongside this checkpoint.
    tokenized_test, tokenizer = Tokenizer(test, checkpoint).tokenize()

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # The Trainer is used only as a prediction runner here; train() is never called.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    fold_preds = trainer.predict(tokenized_test).predictions
    tmp_preds.append(fold_preds)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [8]:
# Average the per-checkpoint predictions element-wise, then drop singleton axes.
fold_average = np.mean(tmp_preds, axis=0)
preds = np.squeeze(fold_average)
# Clamp to [0, 5], round to the nearest integer and shift by +1.
preds = np.clip(preds, 0, 5)
preds = np.round(preds, 0) + 1
preds = preds.astype(int)

In [9]:
# Pair each essay id with its predicted score and write the result to
# 'submission.csv' without the index column.
submission_columns = {
    'essay_id': test['essay_id'].values,
    'score': preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,5


# Credits
This work is based on the following notebooks:
- https://www.kaggle.com/code/cdeotte/deberta-v3-small-starter-cv-0-820-lb-0-800?scriptVersionId=174239814
- https://www.kaggle.com/code/idv2005/deberta-baseline-train
- https://www.kaggle.com/code/idv2005/deberta-baseline-inference
- https://www.kaggle.com/code/hashidoyuto/deberta-baseline-aes2-0-train
- https://www.kaggle.com/code/hashidoyuto/deberta-5fold-infer-aes2-0?scriptVersionId=173129662

And the following discussion posts:
- https://www.kaggle.com/competitions/learning-agency-lab-automated-essay-scoring-2/discussion/498571
- https://www.kaggle.com/competitions/learning-agency-lab-automated-essay-scoring-2/discussion/497832