### Import

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup
from transformers import logging
logging.set_verbosity_error()
from sklearn.model_selection import KFold, StratifiedKFold

import gc
gc.enable()

In [None]:
from colorama import Fore, Back, Style
# colorama foreground colour shortcuts used to highlight console messages.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
# Append sr_ after a coloured span to reset the terminal style.
sr_ = Style.RESET_ALL
# Usage: print(f"{g_}Validation loss decreased from {best_loss} to {valid_loss}{sr_}")


In [None]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

# Definitions

## Model name in this notebook

In [None]:
# Prefix for every artefact this notebook writes (fold checkpoints, OOF and
# submission CSVs). A plain string: the f-prefix had no placeholders.
model_path = "Roberta_large_model_0731e"

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU plus all CUDA
    devices), exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic algorithms.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True

In [None]:
def create_folds(data, num_splits):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule, floored) and the bin ids are used as the strata for
    ``StratifiedKFold``.

    Args:
        data: DataFrame with a numeric ``target`` column.
        num_splits: number of folds to create.

    Returns:
        A new DataFrame with shuffled rows, a fresh 0..n-1 index and an
        integer ``kfold`` column holding each row's validation fold. The
        input frame is not modified.
    """
    # Shuffle first: ``sample`` returns a new frame, so the helper columns
    # added below land on the copy instead of leaking into the caller's
    # DataFrame. The shuffle uses NumPy's global RNG (no random_state given).
    data = data.sample(frac=1).reset_index(drop=True)
    data["kfold"] = -1

    # Sturges' rule for the number of bins.
    num_bins = int(np.floor(1 + np.log2(len(data))))
    data["bins"] = pd.cut(data["target"], bins=num_bins, labels=False)

    # Stratify on the bins rather than the raw continuous targets.
    kf = StratifiedKFold(n_splits=num_splits)
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=data["bins"].values)):
        data.loc[val_idx, "kfold"] = fold

    # The bins were only needed for stratification.
    return data.drop(columns="bins")

In [None]:
import torch, time, gc

# Timing utilities
start_time = None


def start_timer():
    """Start the wall-clock timer and reset CUDA peak-memory statistics.

    All CUDA calls are skipped when no GPU is available, so the timer also
    works when ``DEVICE`` falls back to "cpu".
    """
    global start_time
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # reset_max_memory_allocated() is deprecated in favour of this call.
        torch.cuda.reset_peak_memory_stats()
        # Wait for queued kernels so earlier work is not attributed to this timing.
        torch.cuda.synchronize()
    start_time = time.time()


def end_timer_and_print(local_msg):
    """Print ``local_msg``, the seconds since ``start_timer()`` and peak CUDA memory.

    Raises:
        RuntimeError: if ``start_timer()`` has not been called yet.
    """
    if start_time is None:
        raise RuntimeError("start_timer() must be called before end_timer_and_print()")

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.synchronize()
    end_time = time.time()
    # Report 0 bytes on CPU-only machines.
    peak_bytes = torch.cuda.max_memory_allocated() if cuda_available else 0

    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(peak_bytes))

## Hyperparameters

In [None]:
# Number of cross-validation folds.
NUM_FOLDS = 5
# Default epoch count for train(); also sets the scheduler's total step count.
NUM_EPOCHS = 5
# Base seed; each fold reseeds with SEED + fold.
SEED = 21
# Batch size of every DataLoader in this notebook.
BATCH_SIZE = 12
# Tokenizer max_length: excerpts are padded/truncated to this many tokens.
MAX_LEN = 248
# If True, the training dataset returns targets drawn around the recorded target.
IS_TARGET_SAMPLING = False
# If True, LitModel disables gradients on the backbone embeddings.
IS_FREEZE_EMBEDDING = True
# Forwarded as AdamW's `correct_bias` for the backbone parameter groups.
# NOTE(review): correct_bias=False is presumably the BERT-style Adam, so the
# flag name may be inverted -- confirm the intent.
IS_BERT_ADAM = False
# Choose between the pre-computed stratified "kfold" column and a plain KFold split.
IS_STRATIFIED_KFOLD = True
# Enables autocast and the GradScaler (mixed precision).
IS_USING_AMP = True
# Multiplier applied to standard_error to get the std used for target sampling.
RANDOM_FACTOR_OF_TARGET_SAMPLING = 0.1
WEIGHT_DECAY = 0.01
# Number of consecutive evaluations without improvement before a fold stops.
EARLY_STOPPING_PATIENCE = 15
# (val_rmse threshold, steps between evaluations): train() uses the period of
# the first entry whose threshold is <= the latest validation RMSE.
EVAL_SCHEDULE = [(0.50, 16), (0.495, 10), (0.49, 8), (0.485, 6),(0.48, 4), (0.47, 2), (-1., 1)]
# Backbone weights and tokenizer are loaded from the same directory.
ROBERTA_PATH = "../input/0712-a-pretrain-large-pipeline/my_roberta_large_pretrained_0711a"
TOKENIZER_PATH = "../input/0712-a-pretrain-large-pipeline/my_roberta_large_pretrained_0711a"
# Falls back to CPU when no CUDA device is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Seed all RNGs before any data handling: create_folds() shuffles with the
# global NumPy RNG, so the fold assignment depends on this seed.
set_random_seed(SEED)

In [None]:
# Training data; later cells read its excerpt, target and standard_error columns.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
# Drop rows where both target and standard_error are exactly 0 (treated as
# incomplete entries), then renumber the index from 0.
incomplete_rows = train_df.index[(train_df.target == 0) & (train_df.standard_error == 0)]
train_df.drop(index=incomplete_rows, inplace=True)
train_df.reset_index(drop=True, inplace=True)

In [None]:
# Pre-assign a stratified fold id ("kfold" column) to every row; the stratified
# branch of the training loop selects its folds from this column.
if IS_STRATIFIED_KFOLD:
    train_df = create_folds(train_df, num_splits=NUM_FOLDS)

In [None]:
# Module-level tokenizer; LitDataset reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

## Dataset

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset
class LitDataset(Dataset):
    """Excerpts tokenised once up front and served as tensors.

    Reads the module-level ``tokenizer``, ``MAX_LEN`` and
    ``RANDOM_FACTOR_OF_TARGET_SAMPLING``.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless
            ``inference_only`` is set, ``target`` and ``standard_error``.
        inference_only: if True, items are ``(input_ids, attention_mask)``.
        is_target_sampling: if True, the returned target is drawn from a
            normal distribution centred on the recorded target.
    """

    def __init__(self, df, inference_only=False, is_target_sampling=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.is_target_sampling = is_target_sampling
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)
            self.err_std = torch.tensor(df.standard_error.values, dtype=torch.float32)

        # Every excerpt is padded/truncated to MAX_LEN; the attention mask is
        # requested explicitly because __getitem__ returns it.
        self.encoded = tokenizer(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])

        if self.inference_only:
            return (input_ids, attention_mask)

        target = self.target[index]
        if not self.is_target_sampling:
            return (input_ids, attention_mask, target)

        # Other mappings from standard_error to the sampling std are possible;
        # for now the std is simply the scaled standard error.
        noise_std = RANDOM_FACTOR_OF_TARGET_SAMPLING * self.err_std[index]
        drawn_value = torch.normal(mean=target, std=noise_std, size=(1, 1)).item()
        sampled_target = torch.tensor(drawn_value, dtype=torch.float32)
        return (input_ids, attention_mask, sampled_target)

## Evaluation

In [None]:
def eval_mse(model, data_loader, return_type='mse'):
    """Evaluates the squared error of the |model| on |data_loader|.

    Puts the model in eval mode (it is NOT switched back to train mode) and
    runs it without gradients on the module-level DEVICE.

    Args:
        model: callable taking (input_ids, attention_mask).
        data_loader: yields (input_ids, attention_mask, target) batches.
        return_type: 'mse' or 'se'.

    Returns:
        'mse': (mean squared error over the dataset, all predictions, all
               targets), the last two as 1-D tensors on DEVICE.
        'se':  (summed squared error, number of samples in the dataset).

    Raises:
        ValueError: if |return_type| is neither 'mse' nor 'se' (previously
            the function silently returned None in that case).
    """
    if return_type not in ('mse', 'se'):
        raise ValueError(f"return_type must be 'mse' or 'se', got {return_type!r}")

    model.eval()
    num_samples = len(data_loader.dataset)
    criterion = nn.MSELoss(reduction="sum")
    squared_error_sum = 0.0

    # Collect per-batch tensors and concatenate once at the end instead of
    # re-allocating a growing tensor on every batch.
    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            pred = model(input_ids, attention_mask).flatten()

            # Sum (not average) per batch so batches of different size weigh correctly.
            squared_error_sum += criterion(pred, target).item()
            pred_chunks.append(pred)
            target_chunks.append(target)

    if return_type == 'se':
        return squared_error_sum, num_samples

    if pred_chunks:
        val_pred = torch.cat(pred_chunks)
        val_target = torch.cat(target_chunks)
    else:
        val_pred = torch.tensor([]).to(DEVICE)
        val_target = torch.tensor([]).to(DEVICE)
    return squared_error_sum / num_samples, val_pred, val_target

## Predict

In [None]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    The loader must yield (input_ids, attention_mask) batches; the model is
    put in eval mode and run without gradients on DEVICE.
    """
    model.eval()

    predictions = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_len = batch_pred.shape[0]
            # Move to CPU before writing into the NumPy buffer.
            predictions[offset:offset + batch_len] = batch_pred.flatten().to("cpu")
            offset += batch_len

    return predictions

## Training

In [None]:

def train(model, model_path, train_loader, val_loader,
          optimizer, fold_idx, scheduler=None, num_epochs=NUM_EPOCHS,
          training_history=None, val_history=None):
    """Fine-tune ``model`` on one fold and keep the best checkpoint.

    The model is evaluated on ``val_loader`` every ``eval_period`` steps; the
    period is re-read from EVAL_SCHEDULE after each evaluation. Whenever the
    validation RMSE improves, the weights are written to
    ``{model_path}_FOLD{fold_idx+1}.pth`` and the validation predictions to
    ``oof_{model_path}_FOLD{fold_idx+1}.csv``. Training stops early after
    EARLY_STOPPING_PATIENCE evaluations in a row without improvement.

    Reads the module-level ``scaler`` (GradScaler), DEVICE, IS_USING_AMP,
    EVAL_SCHEDULE, EARLY_STOPPING_PATIENCE and the colour codes.

    Args:
        model, optimizer, scheduler: training objects; ``scheduler`` is optional.
        model_path: file-name prefix for the checkpoint and OOF CSV.
        train_loader, val_loader: yield (input_ids, attention_mask, target).
        fold_idx: zero-based fold number (file names use fold_idx + 1).
        num_epochs: maximum number of epochs.
        training_history: optional list extended with the last batch MSE at
            every evaluation. A fresh list is used when omitted.
        val_history: optional list extended with every validation RMSE.

    Returns:
        The best validation RMSE, or None if no evaluation ever ran.
    """
    # Fresh lists per call: mutable defaults would be shared across calls.
    if training_history is None:
        training_history = []
    if val_history is None:
        val_history = []

    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]
    stop_early = False
    no_improvement_count = 0
    criterion = nn.MSELoss(reduction="mean")

    start = time.time()
    start_timer()
    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            # eval_mse() leaves the model in eval mode, so re-enable training
            # mode on every step.
            model.train()
            with torch.cuda.amp.autocast(enabled=IS_USING_AMP):
                pred = model(input_ids, attention_mask)
                mse = criterion(pred.flatten(), target)

            optimizer.zero_grad()
            scaler.scale(mse).backward()
            scaler.step(optimizer)

            # GradScaler lowers its scale only when it found inf/NaN gradients
            # and therefore skipped optimizer.step(). A scale *increase* is a
            # normal growth step, so the scheduler must still advance then.
            scale_before = scaler.get_scale()
            scaler.update()
            optimizer_step_skipped = scaler.get_scale() < scale_before
            if scheduler and not optimizer_step_skipped:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                temp_mse, temp_pred, temp_target = eval_mse(model, val_loader, return_type='mse')
                val_rmse = math.sqrt(temp_mse)

                print(f"Epoch: {epoch} batch_num: {batch_num} train_rmse: {math.sqrt(mse.item())}",
                      f"val_rmse: {val_rmse:0.4}")

                # NOTE(review): the training curve stores the batch MSE while
                # the validation curve stores RMSE.
                training_history.append(mse.item())
                val_history.append(val_rmse)

                # Evaluate more often as the validation RMSE gets better.
                for rmse_threshold, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse_threshold:
                        eval_period = period
                        break

                # `is None` rather than truthiness: a best RMSE of 0.0 is valid.
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    # Save the best model so far.
                    torch.save(model.state_dict(), f'{model_path}_FOLD{fold_idx+1}.pth')
                    print(f"{g_}New best_val_rmse: {best_val_rmse:0.4}{sr_}")

                    # Reset the early stopping counter.
                    no_improvement_count = 0

                    # Store the out-of-fold predictions of the best model
                    # (float64, matching the previously written CSV format).
                    result_CV_df = pd.DataFrame({
                        'pred': temp_pred.flatten().to("cpu").numpy().astype(np.float64),
                        'target': temp_target.flatten().to("cpu").numpy().astype(np.float64),
                    })
                    result_CV_df.to_csv(f'oof_{model_path}_FOLD{fold_idx+1}.csv')
                else:
                    print(f"{y_}Still best_val_rmse: {best_val_rmse:0.4}{sr_}",
                          f"(from epoch {best_epoch})")

                    # Update the early stopping counter.
                    no_improvement_count += 1
                    if no_improvement_count >= EARLY_STOPPING_PATIENCE:
                        stop_early = True
                        print(f"{c_}Early stopping triggered here (best epoch: {best_epoch}){sr_}")
                        break

                start = time.time()

            step += 1

        if stop_early:
            break

    end_timer_and_print("Mixed precision:")
    return best_val_rmse



## Optimizer

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with per-parameter learning rates.

    Parameters are split purely by their position in
    ``model.named_parameters()``: the first 391 are treated as the backbone,
    the next 4 as the attention head and the rest as the regressor.
    NOTE(review): these positions presumably match the ordering printed by
    the parameter-inspection cell for this backbone; re-check them whenever
    the model definition changes.

    Backbone parameters get one group each, with learning rates 1e-6, 5e-6
    and 1e-5 for positions below 133, 133-260 and 261 onwards, and no weight
    decay on parameters whose name contains "bias".
    """
    named_parameters = list(model.named_parameters())

    backbone = named_parameters[:391]
    attention_head = [param for _, param in named_parameters[391:395]]
    regressor_head = [param for _, param in named_parameters[395:]]

    param_groups = [
        {"params": attention_head, "lr": 5e-5, "weight_decay": WEIGHT_DECAY},
        {"params": regressor_head, "lr": 3e-4, "weight_decay": WEIGHT_DECAY},
    ]

    for position, (name, param) in enumerate(backbone):
        # Later backbone parameters train with larger learning rates.
        if position >= 261:
            lr = 1e-5
        elif position >= 133:
            lr = 5e-6
        else:
            lr = 1e-6

        param_groups.append({
            "params": param,
            "weight_decay": 0.0 if "bias" in name else WEIGHT_DECAY,
            "lr": lr,
            "correct_bias": IS_BERT_ADAM,
        })

    return AdamW(param_groups)

## Model

In [None]:
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: number of transformer layers of the backbone. The
            default weight count assumes ``num_hidden_layers + 1`` hidden
            states are passed in (embedding output plus one per layer) --
            confirm against the backbone's output.
        layer_start: index of the first hidden state to pool; negative values
            count from the end, as in normal Python slicing.
        layer_weights: optional 1-D tensor/parameter with one weight per
            pooled layer. When omitted, a learnable all-ones parameter of the
            right length is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # Count the entries that the slice [layer_start:] selects. For a
            # negative layer_start this equals abs(layer_start) (the previous
            # behaviour); for a non-negative one abs(layer_start) was wrong.
            num_pooled = len(range(num_hidden_layers + 1)[layer_start:])
            layer_weights = nn.Parameter(torch.ones(num_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_layer_embeddings_input):
        """Pool a sequence of per-layer tensors into one tensor of the same shape."""
        # Slice before stacking so only the pooled layers are copied.
        selected_layers = tuple(all_layer_embeddings_input[self.layer_start:])
        stacked = torch.stack(selected_layers)

        # One weight per layer, broadcast over the three remaining dimensions.
        weights = self.layer_weights.view(-1, 1, 1, 1)
        return (weights * stacked).sum(dim=0) / self.layer_weights.sum()



In [None]:
class ArcFaceClassifier(nn.Module):
    """Cosine similarity between input embeddings and learned class vectors.

    Args:
        emb_size: size of the last (embedding) dimension of the input.
        output_classes: number of class vectors (columns of ``W``).

    ``forward`` accepts ``(..., emb_size)`` and returns ``(..., output_classes)``
    with values in [-1, 1].
    """

    def __init__(self, emb_size, output_classes):
        super().__init__()
        self.W = nn.Parameter(torch.empty(emb_size, output_classes))
        nn.init.kaiming_uniform_(self.W)

    def forward(self, x):
        # Step 1: L2-normalise along the embedding axis. dim=-1 is required:
        # F.normalize defaults to dim=1, which is the sequence axis for the
        # (batch, seq, emb) input this notebook passes in. For 2-D input the
        # result is unchanged.
        x_norm = F.normalize(x, dim=-1)
        W_norm = F.normalize(self.W, dim=0)
        # Step 2: dot products of unit vectors are cosine similarities.
        return x_norm @ W_norm

In [None]:


class LitModel(nn.Module):
    """Pretrained backbone + weighted layer pooling + attention head + MLP regressor.

    NOTE: create_optimizer() slices ``named_parameters()`` by position, so the
    order in which sub-modules are assigned below must not change without
    updating those positions.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        # All hidden states are needed by the layer pooler.
        # ("output_hidden_states" appears twice in this dict; both entries are True.)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7,
                       'output_hidden_states':True})   
        
        freeze_embedding = IS_FREEZE_EMBEDDING
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config, )  
        # Disable gradients on the embeddings when freezing is requested.
        self.roberta.base_model.embeddings.requires_grad_(not freeze_embedding)    
        
        
        # Pool the last 8 hidden states with learnable weights.
        self.layer_start = -8
        self.pooler = WeightedLayerPooling(
            config.num_hidden_layers, 
            layer_start=self.layer_start, layer_weights=None
        )
        # Registered as an attribute AND reused as the last stage of
        # self.attention, so its weight is shared between both entries.
        self.arcface = ArcFaceClassifier(emb_size=512,output_classes=1)
        
        # The 1024 input width is hard-coded; presumably the backbone's
        # hidden size -- TODO confirm against config.hidden_size.
        self.attention = nn.Sequential(            
            nn.Linear(1024, 512),            
            nn.Tanh(),                       
            self.arcface
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(1024, 512),
            nn.PReLU(),
            nn.Linear(512, 256),
            nn.PReLU(),
            nn.Linear(256, 256),
            nn.PReLU(),
            nn.Linear(256, 128),
            nn.PReLU(),
            nn.Linear(128, 64),
            nn.PReLU(),
            nn.Linear(64, 16),
            nn.PReLU(),
            nn.Linear(16, 1)
        )


    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch of token ids and masks."""
        
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)   
        # Weighted average of the selected hidden states.
        sequence_output = self.pooler(roberta_output.hidden_states)
        # One score per position; no softmax is applied to these scores.
        weights = self.attention(sequence_output)
        # Score-weighted sum over dim 1 (presumably the token axis -- confirm).
        context_vector = torch.sum(weights * sequence_output, dim=1)  

        return self.regressor(context_vector)

In [None]:
# Throw-away instance, built only to inspect the parameter ordering below.
model = LitModel().to(DEVICE)

In [None]:
# (name, parameter) pairs in registration order.
named_parameters = list(model.named_parameters())

In [None]:
# Position of every parameter next to its name; create_optimizer() slices
# the parameter list by these positions.
[(position, name) for position, (name, _) in enumerate(named_parameters)]

In [None]:
# Release the inspection model before training starts.
del model
gc.collect()
torch.cuda.empty_cache()

# Training loop

In [None]:
%%time
gc.collect()

list_val_rmse = []
set_random_seed(SEED)
# Plain (unstratified) splitter; also reused by a later inspection cell.
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True)

# Module-level gradient scaler: train() reads it as a global.
scaler = GradScaler(enabled=IS_USING_AMP)

# Filled by train() at every evaluation, across all folds.
training_history = []
val_history = []


def _run_fold(fold, train_indices, val_indices, num_workers):
    """Train one fold and append its best validation RMSE to list_val_rmse."""
    print(f"\nFold {fold+1}/{NUM_FOLDS}")

    train_dataset = LitDataset(train_df.loc[train_indices],
                               is_target_sampling=IS_TARGET_SAMPLING)
    # Validation uses the recorded targets (no target sampling) for now.
    val_dataset = LitDataset(train_df.loc[val_indices])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=num_workers)

    # Reseed right before building the model so weight initialisation and
    # batch order depend only on the fold number.
    set_random_seed(SEED + fold)

    model = LitModel().to(DEVICE)
    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)

    # Both branches now record their curves in the shared history lists;
    # previously the non-stratified branch did not pass them.
    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, fold_idx=fold, scheduler=scheduler,
                               training_history=training_history, val_history=val_history))

    # Drop everything that references GPU tensors before emptying the cache.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print("\nPerformance estimates (simple):")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean(), ";Std:", np.array(list_val_rmse).std())


if IS_STRATIFIED_KFOLD:
    for fold in range(NUM_FOLDS):
        train_indices = train_df.index[train_df['kfold'] != fold].tolist()
        val_indices = train_df.index[train_df['kfold'] == fold].tolist()
        set_random_seed(SEED + fold)
        random.shuffle(train_indices)
        _run_fold(fold, train_indices, val_indices, num_workers=0)
else:
    for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
        set_random_seed(SEED + fold)
        _run_fold(fold, train_indices, val_indices, num_workers=2)
    

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")
plt.rcParams["figure.figsize"] = (16,10)

# Use new names instead of rebinding val_history, so this cell can be re-run
# without corrupting the list the training loop filled.
train_curve = np.asarray(training_history, dtype=float)
val_curve = np.asarray(val_history, dtype=float)

if len(train_curve) == 0 or len(val_curve) == 0:
    print("No training history recorded; nothing to plot.")
else:
    # The training curve holds batch MSE values, the validation curve RMSE.
    ax = sns.lineplot(x=np.arange(len(train_curve)), y=train_curve,
                      label="train loss (MSE of last batch)")
    sns.lineplot(x=np.arange(len(val_curve)), y=val_curve,
                 label="validation RMSE", ax=ax)
    ax.set(xlabel="evaluation number (all folds)", ylabel="loss",
           title="Training vs. validation loss")
    plt.show()

# Predict submission file

In [None]:
# COMPUTE_CV = True

# NOTE(review): these are absolute /kaggle/input paths, while train.csv is read
# from a relative ../input path -- consider using one convention.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
# if len(test_df)>7: COMPUTE_CV = False
# else: print('this submission notebook will compute CV score, but commit notebook will not')


# Template whose `target` column is filled with the averaged fold predictions.
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# NOTE(review): redundant -- the next cell builds the same dataset again.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

# One prediction file per trained fold: sub_{model_path}_FOLD{n}.csv
for index in range(len(list_val_rmse)):
    print(f"\nUsing {model_path}")

    model = LitModel()
    # map_location lets a checkpoint saved on GPU load when DEVICE is "cpu".
    model.load_state_dict(torch.load(f'{model_path}_FOLD{index+1}.pth',
                                     map_location=DEVICE))
    model.to(DEVICE)

    temp_pred = predict(model, test_loader)
    print(temp_pred)

    # to_csv returns None when given a path, so its result is not kept.
    result_sub_df = pd.DataFrame({'pred': temp_pred.flatten()})
    result_sub_df.to_csv(f'sub_{model_path}_FOLD{index+1}.csv', columns=['pred'])

    del model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Average the per-fold test predictions into the submission target.
fold_predictions = []
for index in range(len(list_val_rmse)):
    print(f"\nUsing {model_path}")

    sub_i_df = pd.read_csv(f'sub_{model_path}_FOLD{index+1}.csv')
    fold_predictions.append(sub_i_df.pred.values)

# Assign the mean instead of accumulating with `+=`: re-running this cell no
# longer adds the predictions on top of the previous result.
if fold_predictions:
    submission_df["target"] = np.mean(fold_predictions, axis=0)
submission_df

In [None]:
# Debug check: print the validation-fold sizes of a fresh KFold split.
# NOTE(review): `kfold` shuffles without a fixed random_state, so these splits
# are not necessarily the ones used during training.
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    print(val_indices.shape)

In [None]:
# Write the final submission file without the index column.
submission_df.to_csv("submission.csv", index=False)

# Cross-validation (out-of-fold) score

In [None]:
# Gather the out-of-fold predictions written by train() for every fold.
oof_frames = []
for index in range(len(list_val_rmse)):
    print(f"\nUsing {model_path}")

    oof_frames.append(pd.read_csv(f'oof_{model_path}_FOLD{index+1}.csv'))

# Concatenate once instead of growing a DataFrame inside the loop; fall back
# to an empty frame when no fold was trained.
oof_cat = pd.concat(oof_frames, ignore_index=True) if oof_frames else pd.DataFrame([])
print(oof_cat.shape)

In [None]:
# Overall RMSE across the out-of-fold predictions of all folds.
oof_pred = torch.tensor(oof_cat.pred.values)
oof_target = torch.tensor(oof_cat.target.values)
oof_mse = F.mse_loss(oof_pred, oof_target, reduction="mean").item()
CV_rmse = np.sqrt(oof_mse)
print(f'{y_}CV_rmse: {CV_rmse:0.5}{sr_}')