In [None]:
#========================================
# library
# ========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_squared_error
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import transformers
from transformers import RobertaModel,RobertaTokenizer
from transformers import AlbertModel,AlbertTokenizer
from transformers import XLNetModel,XLNetTokenizer,XLNetConfig
from transformers import DebertaModel, DebertaTokenizer
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification
from transformers import BartModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import MPNetModel,MPNetTokenizer
from transformers import FunnelModel,FunnelTokenizer,FunnelBaseModel
from transformers import LongformerModel, LongformerTokenizer,LongformerForSequenceClassification
import logging
import sys
from contextlib import contextmanager
import time
import random
from tqdm import tqdm
import os
import pickle
import gc

In [None]:
# ==================
# Constant
# ==================
# Experiment id; the log file name and the checkpoint name prefix are derived from it.
ex = "216"
# Fixed: the extension was misspelled as ".csvv", which points at a file that does not exist.
TRAIN_PATH = "../input/commonlitreadabilityprize/train.csv"
LOGGER_PATH = f"ex{ex}.txt"
FOLD_PATH = "../input/fe001-step-1-create-folds/fe001_train_folds.csv"
MODEL_PATH_BASE = f"ex{ex}"
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ===============
# Settings
# ===============
BATCH_SIZE = 4  # samples per DataLoader batch
max_len = 256   # every excerpt is padded/truncated to this many tokens
# MODEL_PATH_BASE is already defined in the constants cell from the same
# expression, so it is not redefined here.

# Directory the pretrained backbone and its tokenizer are loaded from.
deberta_v2_xxlarge_MODEL_PATH = "../input/deberta/v2-xxlarge"
deberta_v2_xxlarge_tokenizer = AutoTokenizer.from_pretrained(deberta_v2_xxlarge_MODEL_PATH)

In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per lookup.

    Args:
        excerpt: indexable collection of texts; each entry is passed through ``str``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., ...)`` that
            returns a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: value forwarded as ``max_length``; padding is ``"max_length"`` and
            truncation is enabled.
        target: optional indexable of targets. When given, each sample also carries
            a float32 ``target`` tensor built from ``target[item]``.
    """

    # Fields of the tokenizer output that are converted to int64 tensors,
    # in the order they appear in the returned sample.
    _ENCODED_KEYS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, excerpt, tokenizer, max_len, target=None):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target = target

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        """Return the tensors for sample ``item`` as a dict."""
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in self._ENCODED_KEYS
        }
        # The target entry is only present when targets were supplied.
        if self.target is not None:
            sample["target"] = torch.tensor(self.target[item], dtype=torch.float32)
        return sample

class deberta_v2_xxlarge_model(nn.Module):
    """Transformer backbone followed by a single linear regression head.

    The backbone is loaded from ``deberta_v2_xxlarge_MODEL_PATH`` with both
    ``hidden_dropout_prob`` and ``attention_probs_dropout_prob`` overridden to 0.
    The attribute names ``deberta_model`` and ``out`` are part of the
    state-dict layout and must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.deberta_model = AutoModel.from_pretrained(
            deberta_v2_xxlarge_MODEL_PATH,
            hidden_dropout_prob=0,
            attention_probs_dropout_prob=0,
        )
        # Size the head from the backbone config rather than hard-coding 1536,
        # so the head always matches whatever checkpoint the path points to.
        self.out = nn.Linear(self.deberta_model.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one value per input row.

        Args:
            ids: token ids, forwarded as the first positional argument of the backbone.
            mask: forwarded as ``attention_mask``.
            token_type_ids: forwarded as ``token_type_ids``.

        Returns:
            Output of the linear head applied to the hidden state at sequence
            position 0.
        """
        hidden = self.deberta_model(ids, attention_mask=mask, token_type_ids=token_type_ids)['last_hidden_state']
        # This is the raw hidden state of the first token, not a pooler output.
        first_token = hidden[:, 0, :]
        return self.out(first_token)
    
    
def calc_loss(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
    
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy's legacy global generator and torch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and puts cuDNN
    into deterministic, non-autotuning mode.

    Args:
        seed: value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; turn it off.
    torch.backends.cudnn.benchmark = False



def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """(Re)configure the module-level ``LOGGER`` and return it.

    Any handlers already attached to ``LOGGER`` are removed first, so calling
    this again (e.g. re-running the cell) does not duplicate output.

    Args:
        out_file: path of a log file to write to; no file handler when ``None``.
        stderr: attach a handler writing to ``sys.stderr`` when true.
        stderr_level: level of the stderr handler.
        file_level: level of the file handler.

    Returns:
        The configured ``LOGGER``.
    """
    # Detach AND close the previous handlers. Merely resetting the list would
    # leave earlier FileHandlers open, leaking a file descriptor per re-run.
    for old_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(old_handler)
        old_handler.close()

    # The logger itself must let through whatever the most verbose handler wants.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER


@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    The elapsed wall-clock time is written to ``LOGGER`` at INFO level after
    the block finishes; nothing is logged if the block raises.

    Args:
        name: label placed in square brackets at the start of the log line.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    
    
# Module-level logging objects used by setup_logger() and timer().
# getLogger() without a name returns the root logger.
LOGGER = logging.getLogger()
FORMATTER = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s")
# Attach the stderr handler and a file handler writing to LOGGER_PATH.
setup_logger(LOGGER_PATH)

In [None]:
# ================================
# Main
# ================================
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
y = train["target"]
# Read the fold assignments via the FOLD_PATH constant instead of repeating the literal.
fold_df = pd.read_csv(FOLD_PATH)
# Fold labels are later matched to train rows purely by position, so the two
# files must at least have the same number of rows.
assert len(fold_df) == len(train), "fold file and train.csv must have the same number of rows"

In [None]:
# Per-row fold label; used below as a positional boolean mask over `train`, `y` and `oof`.
fold_array = fold_df["kfold"].values

In [None]:
# ================================
# train
# ================================
# For each fold: load that fold's saved checkpoint, predict its validation rows,
# print the fold RMSE, store the predictions in `oof`, and copy the weights to
# the working directory under MODEL_PATH_BASE.
# NOTE(review): the timer label says "roberta" although the model evaluated
# here is deberta_v2_xxlarge_model; the label is kept so log lines stay
# unchanged - consider renaming.
with timer("roberta_large_model2"):
    oof = np.zeros([len(train)])
    # Only folds 0-2 are evaluated; rows belonging to any other fold keep the
    # initial 0 in `oof`.
    for fold in range(3):
        val_mask = fold_array == fold
        x_val, y_val = train.iloc[val_mask], y.iloc[val_mask]

        # dataset
        val_ = CommonLitDataset(x_val["excerpt"].values, deberta_v2_xxlarge_tokenizer, max_len, y_val.values.reshape(-1, 1))

        # loader (shuffle stays off so predictions line up with y_val / val_mask)
        val_loader = DataLoader(dataset=val_, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

        # model
        model = deberta_v2_xxlarge_model()
        # map_location="cpu": deserialize onto the CPU first. Without it the
        # checkpoint is restored to the device it was saved from, which fails
        # on a CPU-only machine and otherwise holds a second copy of the
        # weights on the GPU while the model is being populated.
        state_dict = torch.load(f"../input/commonlit-ex216/ex216_{fold}.pth", map_location="cpu")
        model.load_state_dict(state_dict)
        del state_dict
        model = model.to(device)
        model.eval()  # switch model to the evaluation mode

        # Collect per-batch outputs and concatenate once after the loop instead
        # of re-copying the growing array every batch. The empty float64 (0, 1)
        # seed keeps the result's dtype and shape the same as before, including
        # when the loader yields no batches.
        pred_chunks = [np.empty((0, 1))]
        with torch.no_grad():
            # Predicting on validation set
            for d in tqdm(val_loader):
                input_ids = d["input_ids"].to(device)
                mask = d["attention_mask"].to(device)
                token_type_ids = d["token_type_ids"].to(device)
                output = model(input_ids, mask, token_type_ids)
                pred_chunks.append(output.detach().cpu().numpy())
        val_preds = np.concatenate(pred_chunks, axis=0)

        val_rmse = calc_loss(y_val, val_preds)
        print(fold, val_rmse)

        oof[val_mask] = val_preds.reshape(-1)
        torch.save(model.state_dict(), MODEL_PATH_BASE + f"_{fold}.pth")
        # Drop the model before the next fold builds a new one.
        del model
        gc.collect()

In [None]:
# Overall out-of-fold RMSE. `oof` starts as all zeros and the loop above only
# fills folds 0-2, so score just those rows; rows from any other fold would
# otherwise be scored as a prediction of 0. Identical to scoring every row
# when the fold file contains only folds 0-2. Keep the 3 in sync with the loop.
evaluated_rows = np.isin(fold_array, np.arange(3))
calc_loss(y[evaluated_rows], oof[evaluated_rows])