# ライブラリのインストール＆インポート

In [None]:
# Uninstall the existing pyarrow, then install/upgrade datasets, pyarrow,
# transformers and wandb together.
!pip uninstall -y pyarrow
!pip install -U datasets pyarrow transformers wandb

In [None]:
import gc
import os
import shutil
import warnings

import datasets
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import trange
from transformers import (
    RobertaPreTrainedModel,
    RobertaTokenizerFast,
    RobertaModel,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed
)
from transformers.modeling_outputs import SequenceClassifierOutput
import yaml

warnings.filterwarnings("ignore")

# データセット作成

In [None]:
def create_df(path):
    """Read a CSV file into a DataFrame, exposing ``target`` as ``labels``.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like).

    Returns:
        The loaded DataFrame. When a ``target`` column is present it is
        renamed to ``labels``; otherwise the frame is returned unchanged.
    """
    frame = pd.read_csv(path)
    has_target = "target" in frame.columns
    return frame.rename(columns={"target": "labels"}) if has_target else frame

def create_dataset(df, tokenizer, tokenizer_kwargs=None):
    """Build a tokenized ``datasets.Dataset`` from the ``excerpt`` column of ``df``.

    Args:
        df: DataFrame with an ``excerpt`` column and, optionally, ``labels``.
            All other columns are dropped.
        tokenizer: Callable applied to each batch of excerpts.
        tokenizer_kwargs: Keyword arguments forwarded to ``tokenizer``;
            ``None`` is treated as no extra arguments.

    Returns:
        The mapped dataset with the raw ``excerpt`` column removed.
    """
    text_column = "excerpt"

    def _tokenize_batch(batch, tokenizer, tokenizer_kwargs=None):
        # Called by ``Dataset.map``; the keyword arguments arrive via ``fn_kwargs``.
        return tokenizer(batch[text_column], **tokenizer_kwargs)

    # Keep only the text and, when available, the regression target.
    kept_columns = [text_column, "labels"] if "labels" in df.columns else [text_column]
    dataset = datasets.Dataset.from_pandas(df[kept_columns])

    map_kwargs = {
        "tokenizer": tokenizer,
        "tokenizer_kwargs": {} if tokenizer_kwargs is None else tokenizer_kwargs,
    }
    return dataset.map(
        _tokenize_batch,
        batched=True,
        remove_columns=[text_column],
        fn_kwargs=map_kwargs,
    )

# モデル

In [None]:
class RobertaForSequenceRegression(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a pooled regression head with one output.

    The constructor overrides several config fields (no hidden dropout,
    ``layer_norm_eps=1e-7``, a single label, regression problem type) before
    building the encoder, so those values apply regardless of what the loaded
    config contained. The training loss is the root of the mean squared error
    between the flattened predictions and labels.
    """
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        # These overrides must happen before the sub-modules are built,
        # because RobertaModel and the head read them from ``config``.
        config.hidden_dropout_prob = 0.0
        config.layer_norm_eps = 1e-7
        config.num_labels = 1
        config.problem_type = "regression"
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.regressor = RobertaRegressionHead(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        """Run the encoder and regression head, optionally computing the loss.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states: forwarded
                to the RoBERTa encoder. When ``attention_mask`` is ``None``
                every position is included in the pooling.
            labels: Optional regression targets; when given, an RMSE loss is
                computed against the flattened predictions.
            return_dict: When ``False`` a plain tuple is returned instead of a
                ``SequenceClassifierOutput``. ``None`` falls back to
                ``config.use_return_dict``.

        Returns:
            ``SequenceClassifierOutput`` (or its tuple form) holding ``loss``,
            ``logits``, ``hidden_states`` and ``attentions``.
        """
        # Invariant established in __init__; guards against later config edits.
        assert self.config.problem_type == "regression" and self.num_labels == 1

        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Always request a ModelOutput from the encoder: the attribute access
        # below (hidden_states / attentions) does not work on a tuple.
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        last_hidden_states = outputs.last_hidden_state

        if attention_mask is None:
            # The pooling head needs an explicit mask; default to "all tokens".
            attention_mask = torch.ones(
                last_hidden_states.shape[:2],
                dtype=torch.long,
                device=last_hidden_states.device,
            )
        logits = self.regressor(last_hidden_states, attention_mask)

        loss = None
        if labels is not None:
            # RMSE rather than MSE is used as the optimisation target.
            loss = torch.sqrt(F.mse_loss(logits.view(-1), labels.view(-1)))

        result = SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        return result if return_dict else result.to_tuple()

class RobertaRegressionHead(nn.Module):
    """Sentence-level regression head.

    Pools the token states with ``mean_pooling`` and then applies
    dense -> tanh -> layer norm -> output projection.
    """

    def __init__(self, config):
        """Create the layers from ``config.hidden_size`` and ``config.num_labels``."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()
        self.layer_norm = nn.LayerNorm(width)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, last_hidden_states, attention_mask):
        """Return the head output for the pooled representation of each input."""
        pooled = mean_pooling(last_hidden_states, attention_mask)
        transformed = self.layer_norm(self.activation(self.dense(pooled)))
        return self.out_proj(transformed)

@torch.jit.script
def mean_pooling(last_hidden_state, attention_mask):
    """Average ``last_hidden_state`` along dimension 1, weighted by the mask.

    ``attention_mask`` gets a trailing axis and is expanded to the shape of
    ``last_hidden_state`` (presumably (batch, seq, hidden) — confirm against
    the caller). The denominator is clamped to at least 1e-9.
    """
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    masked_total = torch.sum(last_hidden_state * mask, 1)
    return masked_total / token_count

# その他関数

In [None]:
class ContinuousStratifiedKFold(StratifiedKFold):
    """``StratifiedKFold`` for continuous targets, stratifying on binned ``y``."""

    def split(self, X, y, groups=None):
        """Bin ``y`` with ``pd.cut`` and delegate to ``StratifiedKFold.split``.

        The number of bins is ``floor(1 + log2(len(y)))``; the integer bin
        index of each target is what the parent class stratifies on.
        """
        bin_count = int(np.floor(1 + np.log2(len(y))))
        bin_ids = pd.cut(y, bins=bin_count, labels=False)
        return super().split(X, bin_ids, groups)

def root_mean_squared_error(eval_pred):
    """Compute the RMSE metric for a ``(logits, labels)`` pair.

    Implemented with NumPy so it does not depend on the ``squared=False``
    argument of ``sklearn.metrics.mean_squared_error``, which newer
    scikit-learn releases deprecate.

    Args:
        eval_pred: Iterable unpacking to ``(logits, labels)``. Both are
            flattened, so ``(n, 1)`` predictions and ``(n,)`` labels line up,
            including when ``n == 1``.

    Returns:
        ``{"rmse": <float>}``.

    Raises:
        ValueError: If predictions and labels differ in element count.
    """
    logits, labels = eval_pred
    predicted = np.asarray(logits, dtype=np.float64).reshape(-1)
    expected = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predicted.size != expected.size:
        raise ValueError(
            f"predictions and labels differ in size: {predicted.size} vs {expected.size}"
        )
    return {"rmse": float(np.sqrt(np.mean((predicted - expected) ** 2)))}

# 学習

In [None]:
# Experiment configuration, written as an inline YAML document and parsed with
# yaml.safe_load into the nested dict `config`.
#   path                    - CSV locations (train, test, sample submission)
#   model_name              - identifier passed to from_pretrained
#   tokenizer               - keyword arguments forwarded to the tokenizer call
#   trainer                 - keyword arguments expanded into TrainingArguments
#   wandb                   - values exported as WANDB_* environment variables
#   early_stopping_patience - argument of EarlyStoppingCallback
#   seed                    - base seed; the fold index is added per fold
# NOTE(review): wandb.api_key looks like a placeholder — replace before running.
config_str = """
path:
    train_path: "../input/commonlitreadabilityprize/train.csv"
    test_path: "../input/commonlitreadabilityprize/test.csv"
    sample_submission_path: "../input/commonlitreadabilityprize/sample_submission.csv"

model_name: "roberta-base"

tokenizer:
    padding: "do_not_pad"
    truncation: True
    max_length: 256

trainer:
    evaluation_strategy: "steps"
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    learning_rate: 2.0e-5
    weight_decay: 0.01
    num_train_epochs: 5
    lr_scheduler_type: "linear"
    warmup_steps: 10
    log_level: "warning"
    logging_strategy: "steps"
    logging_steps: 10
    save_strategy: "steps"
    save_steps: 10
    save_total_limit: 1
    fp16: False
    eval_steps: 10
    dataloader_num_workers: 2
    load_best_model_at_end: True
    metric_for_best_model: "eval_loss"
    greater_is_better: False

wandb:
    api_key: "your_api_key"
    project: "Kaggle_CommonLit"

early_stopping_patience: 30

seed: 42
"""

config = yaml.safe_load(config_str)

In [None]:
# Collect garbage first so that tensors freed by it can actually be returned
# by the CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

os.environ["WANDB_API_KEY"] = config["wandb"]["api_key"]
os.environ["WANDB_PROJECT"] = config["wandb"]["project"]

# The tokenizer is the same for every fold, so it is loaded once.
tokenizer = RobertaTokenizerFast.from_pretrained(config["model_name"])

results = []  # one metrics dict per fold, as returned by trainer.evaluate
skf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=config["seed"])
train_valid_df = create_df(config["path"]["train_path"])
for fold, (train_index, valid_index) in enumerate(skf.split(train_valid_df, train_valid_df["labels"])):
    print(f"= fold {fold}", "="*80)
    set_seed(config["seed"]+fold)

    # split() yields positional indices, so select rows by position (iloc)
    # rather than by index label.
    train_df = train_valid_df.iloc[train_index].reset_index(drop=True)
    valid_df = train_valid_df.iloc[valid_index].reset_index(drop=True)

    train_dataset = create_dataset(train_df, tokenizer, tokenizer_kwargs=config["tokenizer"])
    valid_dataset = create_dataset(valid_df, tokenizer, tokenizer_kwargs=config["tokenizer"])

    model = RobertaForSequenceRegression.from_pretrained(config["model_name"])

    early_stopping_callback = EarlyStoppingCallback(config["early_stopping_patience"])

    temp_dir = "temp"
    # Only the first fold is reported to Weights & Biases.
    integration = 'wandb' if fold == 0 else 'none'
    training_args = TrainingArguments(
        output_dir=temp_dir,
        seed=config["seed"]+fold,
        run_name=config["model_name"],
        report_to=integration,
        **config["trainer"]
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=root_mean_squared_error,
        callbacks=[early_stopping_callback]
    )

    trainer.train()

    result = trainer.evaluate(valid_dataset)
    results.append(result)

    # Persist the final model before removing the intermediate checkpoints,
    # so a cleanup failure can never cost the trained weights.
    os.makedirs(f"fold_{fold}", exist_ok=True)
    trainer.save_model(f"fold_{fold}")
    shutil.rmtree(temp_dir, ignore_errors=True)

    # Drop this fold's model and trainer so their memory can be reclaimed
    # before the next fold allocates a fresh model.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

# 推論

In [None]:
test_df = create_df(config["path"]["test_path"])
# The config key is "sample_submission_path"; the misspelled key raised KeyError.
sample_submission_df = pd.read_csv(config["path"]["sample_submission_path"])

# Collect one prediction vector per fold; the list must exist before the loop.
predictions = []
for fold in trange(5):
    model_path = f"fold_{fold}"
    tokenizer = RobertaTokenizerFast.from_pretrained(model_path)
    test_dataset = create_dataset(test_df, tokenizer, config["tokenizer"])
    model = RobertaForSequenceRegression.from_pretrained(model_path)
    trainer = Trainer(model=model, tokenizer=tokenizer)
    prediction_output = trainer.predict(test_dataset)
    # Flatten to one value per test row so the stacked array is (folds, rows).
    predictions.append(prediction_output.predictions.reshape(-1))
predictions = np.stack(predictions)

In [None]:
# Ensemble by averaging over the first axis of `predictions`, then write the
# result into the submission template and save it without the index column.
final_prediction = predictions.mean(axis=0)
sample_submission_df["target"] = final_prediction
sample_submission_df.to_csv("submission.csv", index=False)

In [None]:
# Show the submission table as the cell output.
sample_submission_df

In [None]:
# Print the first lines of the written submission file.
!head ./submission.csv