In [None]:
import os
import datasets, transformers
import shutil

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import KFold, StratifiedKFold

import pandas as pd
import numpy as np

# Switch off Weights & Biases reporting for this session; this has to be set
# before the Trainer is constructed further down.
os.environ.update(WANDB_DISABLED="true")

In [None]:
class CFG:
    """Static settings for one fine-tuning run on a single validation fold.

    The attributes are read as plain class attributes (``CFG.fold_id``);
    the class is never instantiated in the visible cells.
    """

    # --- Cross-validation / schedule ---------------------------------------
    fold_id = 2          # rows whose 'fold' column equals this value are held out
    epochs = 5           # passed to TrainingArguments.num_train_epochs
    batch_size = 16      # per-device training batch size
    accumulate = 2       # gradient accumulation steps

    # --- Optimiser ----------------------------------------------------------
    learning_rate = 2e-5
    weight_decay = 0.01

    # --- Input locations ----------------------------------------------------
    input_path = '../input/uspppm-upsampled-input/train_5_folds.csv'
    model_path = '../input/deberta-v3-large/deberta-v3-large/'
    # NOTE(review): not referenced by any of the visible cells.
    context_path = '../input/cpc-codes/'

In [None]:
# Load the pre-folded training table. Every row carries a 'fold' id that the
# training cell uses to split it into train / validation parts.
train = pd.read_csv(CFG.input_path)

# Fail fast with a readable message if the file does not have the columns the
# later cells rely on ('input'/'target' are tokenised, 'score' is the label,
# 'fold' drives the split, 'id' is dropped before training). Without this
# check a wrong file only surfaces later as an error inside Dataset.map.
_required_columns = {'id', 'fold', 'target', 'score', 'input'}
_missing_columns = _required_columns - set(train.columns)
if _missing_columns:
    raise ValueError(
        f"{CFG.input_path} is missing required columns: {sorted(_missing_columns)}"
    )

In [None]:
# Only the tokenizer is loaded here. The model itself is instantiated in the
# training cell immediately before the Trainer is built; loading it here as
# well would read the large checkpoint from disk twice and the first copy
# would be discarded unused when `model` is rebound.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
def process(unit, eval = False):
    """Turn one dataset row into model-ready features.

    Args:
        unit: mapping for a single row; its 'input' and 'target' entries are
            tokenised together as a pair and its 'score' entry becomes the label.
        eval: accepted for call compatibility; it is not used by the body.

    Returns:
        dict with every field produced by the module-level ``tokenizer`` plus
        a 'label' entry holding ``unit['score']``.
    """
    encoded = tokenizer(unit['input'], unit['target'])
    features = dict(encoded)
    # Assigned last so it takes precedence over any same-named tokenizer field.
    features['label'] = unit['score']
    return features

def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` must be
            reshapeable to one value per row (e.g. shape ``(n, 1)``).

    Returns:
        dict with a single key, 'pearson', holding the correlation coefficient.
    """
    raw_preds, targets = eval_pred

    # Collapse the trailing singleton dimension so both inputs are 1-D.
    flat_preds = raw_preds.reshape(len(raw_preds))

    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is
    # the correlation between the two series.
    correlation_matrix = np.corrcoef(flat_preds, targets)
    return {'pearson': correlation_matrix[0][1]}

In [None]:
fold = CFG.fold_id

# Intermediate checkpoints go into a dedicated scratch directory. Writing them
# straight into "/tmp" and then removing "/tmp" afterwards would delete the
# whole system temp directory, including files owned by other processes.
checkpoint_dir = f"/tmp/uspppm_fold_{fold}_checkpoints"

args = TrainingArguments(
    output_dir=checkpoint_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=CFG.learning_rate,
    per_device_train_batch_size=CFG.batch_size,
    gradient_accumulation_steps=CFG.accumulate,
    # gradient_checkpointing=True,  # optional: trade compute for memory
    num_train_epochs=CFG.epochs,
    weight_decay=CFG.weight_decay,
    # Keep the checkpoint with the highest validation Pearson correlation
    # (the key returned by compute_metrics) and reload it when training ends.
    metric_for_best_model="pearson",
    greater_is_better=True,
    load_best_model_at_end=True,
    # fp16=True,  # optional: mixed-precision training
)

# Split the table: the selected fold is held out for validation, the rest is
# used for training.
tr_data = train[train['fold']!=fold].reset_index(drop=True)
va_data = train[train['fold']==fold].reset_index(drop=True)
if tr_data.empty or va_data.empty:
    # A fold id that is absent from the file would otherwise only show up
    # after training has started, as a failed or NaN evaluation.
    raise ValueError(
        f"fold {fold!r} yields an empty split "
        f"(train rows: {len(tr_data)}, validation rows: {len(va_data)})"
    )

# Raw columns are dropped once `process` has produced the tokenised features
# and the 'label' field.
raw_columns = ['id', 'fold', 'target', 'score', 'input']
tr_ds = datasets.Dataset.from_pandas(tr_data).map(process, remove_columns=raw_columns)
va_ds = datasets.Dataset.from_pandas(va_data).map(process, remove_columns=raw_columns)

# Seed the RNGs before the model is created so that the freshly initialised
# regression head is reproducible across runs (Trainer itself only seeds when
# it is constructed, which is after the weights already exist).
transformers.set_seed(args.seed)

# num_labels=1 gives a single-output head, i.e. a regression on 'score'.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
trainer = Trainer(
    model,
    args,
    train_dataset=tr_ds,
    eval_dataset=va_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Persist the final (best, because of load_best_model_at_end) model, then
# remove only this run's checkpoint directory.
trainer.save_model(f"uspppm_{fold}")
shutil.rmtree(checkpoint_dir, ignore_errors=True)