In [None]:
import numpy as np 
import pandas as pd 
import os, random, sys, time, re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as D
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

from transformers import *
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM

## Directory Settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
# Folder for artefacts produced by this run (the current working directory).
OUTPUT_DIR = './'
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Train and test locations both point at the same competition input folder.
TRAIN_PATH = '../input/commonlitreadabilityprize'
TEST_PATH = '../input/commonlitreadabilityprize'

## CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Run-wide configuration; values are read directly as class attributes."""
    debug=False                 # when True the run is shortened (see the guard below)
    seed=7117                   # passed to seed_torch and to the CV splitter
    n_folds=5                   # number of cross-validation folds
    model_name='roberta-base'
    max_sequence_length=220     # tokenizer truncation length
    batch_size=12
    epochs=15
    lr=2.5e-5
    scheduler='ConstantScheduleWithWarmup' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'ConstantScheduleWithWarmup']
    # Options for the alternative schedulers; enable the ones matching `scheduler`.
    #factor=0.2 # ReduceLROnPlateau
    #patience=4 # ReduceLROnPlateau
    #eps=1e-6 # ReduceLROnPlateau
    #T_max=10 # CosineAnnealingLR
    #T_0=10 # CosineAnnealingWarmRestarts
    #n_epochs=100
    #min_lr=1e-6

if CFG.debug:
    # Only the epoch count can be shortened in this cell: no data frame has been
    # loaded yet, so sampling rows here (from a name that does not exist) would
    # raise NameError. Any row subsampling has to happen after the CSVs are read.
    CFG.epochs = 1

In [None]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# From here on, anything written to stderr goes to err.txt (opened in write
# mode, so an existing file is truncated) instead of the notebook output.
sys.stderr = open('err.txt', mode='w')

## Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports the seed as PYTHONHASHSEED and switches on cuDNN's
    deterministic flag.

    Args:
        seed: integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(seed)
    torch.backends.cudnn.deterministic = True

# Seed every random generator once, up front, with the run-wide seed from CFG.
seed_torch(seed=CFG.seed)

In [None]:
# Competition input folder; train.csv, test.csv and sample_submission.csv are read from here.
DATA_PATH = "../input/commonlitreadabilityprize/"

# Location handed to AutoConfig.from_pretrained and CommonLitModel when each
# fold's model is built. The commented lines are alternative backbones.
# model_path = '../input/distilbertbaseuncased'
# model_path = '../input/pretrained-albert-pytorch/albert-base-v2'
# model_path = '../input/roberta-transformers-pytorch/distilroberta-base'
# model_path = '../input/roberta-transformers-pytorch/roberta-base'
# model_path = '../input/bart-models-hugging-face-model-repository/bart-base'
model_path = '../input/pretrainedrobertabase'

# NOTE(review): VOCAB_PATH is not referenced by any visible cell — the tokenizer
# is loaded from a hard-coded path instead; confirm which location is intended.
# VOCAB_PATH = '../input/roberta-transformers-pytorch/roberta-base'
# VOCAB_PATH = '../input/pretrained-albert-pytorch/albert-base-v2'
# VOCAB_PATH = '../input/pretrained-albert-pytorch/albert-base-v1'
# VOCAB_PATH = '../input/distilbertbaseuncased'
# VOCAB_PATH = '../input/bart-models-hugging-face-model-repository/bart-base'
VOCAB_PATH = '../input/pretrainedrobertabase'

In [None]:
# Map the configured backbone name to its local checkpoint folder.
# NOTE(review): this module-level `model_name` is not read by any visible cell
# (models are built from `model_path`), and it stays undefined for any other
# CFG.model_name value — confirm whether it is still needed.
if CFG.model_name == 'roberta-base':
    model_name = '../input/pretrainedrobertabase'


## Data Loading, CV Split

In [None]:
# Both frames are indexed by the `id` column.
train_csv = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), index_col='id')
# Series.replace('\n', '') only swaps cells whose *entire* value equals '\n', so it
# never touched newlines inside an excerpt. Use the string accessor to replace the
# substring, and substitute a space so the words on either side are not fused.
train_csv['excerpt'] = train_csv['excerpt'].str.replace('\n', ' ', regex=False)

test_csv = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'), index_col='id')
# Apply the same cleaning to the test excerpts so training and inference inputs match.
test_csv['excerpt'] = test_csv['excerpt'].str.replace('\n', ' ', regex=False)

In [None]:
# Submission template indexed by id; the fold loop adds test predictions to its `target` column.
subm = pd.read_csv(
    os.path.join(DATA_PATH, 'sample_submission.csv'),
    index_col='id',
)

# Binary label (target above zero or not) handed to the splitter for stratification.
y = np.greater(train_csv.target.values, 0).astype(int)
cv = StratifiedKFold(
    n_splits=CFG.n_folds,
    shuffle=True,
    random_state=CFG.seed,
)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(
    '../input/roberta-transformers-pytorch/roberta-base',
    model_max_length=CFG.max_sequence_length,
)


def get_tokens(text):
    """Encode one excerpt with the module-level tokenizer.

    The text is truncated to CFG.max_sequence_length; the attention mask and
    token type ids are requested alongside the input ids.
    """
    return tokenizer.encode_plus(
        text,
        max_length=CFG.max_sequence_length,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )


# Pre-tokenize every excerpt once and keep the encodings in a `token` column.
train_csv['token'] = train_csv['excerpt'].apply(get_tokens)
test_csv['token'] = test_csv['excerpt'].apply(get_tokens)

In [None]:
# Peek at the first few encoded training excerpts.
train_csv['token'].head()

In [None]:
# Peek at the first few encoded test excerpts.
test_csv['token'].head()

In [None]:
class LitDataset(D.Dataset):
    """Map-style dataset pairing pre-tokenized excerpts with a per-row value.

    Args:
        token: sequence of encodings exposing `input_ids`, `attention_mask` and
            `token_type_ids` attributes (a pandas Series in this notebook).
        target: per-row values returned untouched as the last element of each
            sample (regression targets for training, row ids for the test set).

    Samples are addressed by *position* (0 .. len-1), which is what DataLoader
    and Subset pass in, regardless of how a pandas container is labelled.
    """

    def __init__(self, token, target):
        self.token = token
        self.target = target

    def __len__(self):
        return len(self.token)

    @staticmethod
    def _at(container, position):
        """Return the element at `position`, positionally even for a pandas Series."""
        # Plain `series[int]` is label-based for integer indexes and only falls back
        # to positions (a deprecated behaviour) for other index types; `.iloc` is
        # always positional. Containers without `.iloc` are indexed directly.
        indexer = getattr(container, 'iloc', None)
        return container[position] if indexer is None else indexer[position]

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, token_type_ids, target) for row `idx`."""
        encoding = self._at(self.token, idx)
        return (
            torch.tensor(encoding.input_ids),
            torch.tensor(encoding.attention_mask),
            torch.tensor(encoding.token_type_ids),
            self._at(self.target, idx),
        )
    
def collate_fn(batch):
    """Collate training/validation samples into padded tensors on DEVICE.

    Args:
        batch: iterable of (input_ids, attention_mask, token_type_ids, target) samples.

    Returns:
        (ids, attns, token_type, targets): the three sequence fields are padded
        batch-first to a common length with pad_sequence's default padding value;
        targets become a float tensor. Everything is moved to DEVICE.
    """
    *sequence_fields, raw_targets = zip(*batch)
    ids, attns, token_type = (
        pad_sequence(field, batch_first=True).to(DEVICE) for field in sequence_fields
    )
    return ids, attns, token_type, torch.tensor(raw_targets).float().to(DEVICE)
def collate_fn_test(batch):
    """Collate test samples, returning the row identifiers first.

    Args:
        batch: iterable of (input_ids, attention_mask, token_type_ids, idx) samples.

    Returns:
        (idxs, ids, attns, token_type): `idxs` is the untouched tuple of row
        identifiers; the sequence fields are padded batch-first and moved to DEVICE.
    """
    *sequence_fields, idxs = zip(*batch)
    padded = [pad_sequence(field, batch_first=True).to(DEVICE) for field in sequence_fields]
    ids, attns, token_type = padded
    return idxs, ids, attns, token_type


In [None]:
class CommonLitModel(nn.Module):
    """Pretrained backbone followed by LayerNorm, dropout and a one-output linear head.

    Args:
        model_name: identifier or path handed to `AutoModel.from_pretrained`.
        config: model configuration; `hidden_size` sizes the head and
            `initializer_range` is the std used when initialising the head weights.
        multisample_dropout: when True, five Dropout(0.5) branches are averaged;
            otherwise a single Dropout(0.3) is used.
        output_hidden_states: forwarded to `AutoModel.from_pretrained`.
    """

    def __init__(self, model_name, config, multisample_dropout=False, output_hidden_states=False):
        super().__init__()
        self.config = config
        self.roberta = AutoModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states,
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        branch_count, drop_prob = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts = nn.ModuleList([nn.Dropout(drop_prob) for _ in range(branch_count)])
        self.regressor = nn.Linear(config.hidden_size, 1)
        # Re-initialise only the freshly created head modules.
        for head_module in (self.layer_norm, self.regressor):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Initialise `module` in place; other module types are left untouched.

        LayerNorm gets unit weight and zero bias. Linear and Embedding weights are
        drawn from N(0, config.initializer_range); a Linear bias and an Embedding's
        padding row are zeroed.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Return one regression value per input row, averaged over the dropout branches."""
        backbone_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the backbone output feeds the head — presumably the pooled
        # sentence representation; confirm against the backbone class in use.
        features = self.layer_norm(backbone_out[1])

        # Multi-sample dropout: run the shared regressor over each dropout branch
        # and average the results.
        branches = iter(self.dropouts)
        logits = self.regressor(next(branches)(features))
        for extra_dropout in branches:
            logits += self.regressor(extra_dropout(features))
        logits /= len(self.dropouts)

        return logits

In [None]:
# Training rows carry the regression target; test rows carry their id so that
# predictions can be written back to the matching submission rows.
ds = LitDataset(token=train_csv.token, target=train_csv.target)
test_ds = LitDataset(token=test_csv.token, target=test_csv.index)

# Test loader: fixed order, batches built in the main process.
tloader = D.DataLoader(
    test_ds,
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn=collate_fn_test,
    num_workers=0,
)

In [None]:
# Templates for the per-epoch results table printed during training.
header = r'''
            Train         Validation
Epoch |  MSE  |  RMSE |  MSE  |  RMSE | Time, m
'''
# One row: epoch number, four metric columns, elapsed minutes — joined by a
# box-drawing vertical bar.
raw_line = '\u2502'.join(['{:6d}'] + ['{:7.3f}'] * 4 + ['{:6.2f}'])

In [None]:
    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        """Build the learning-rate scheduler selected by CFG.scheduler.

        Args:
            optimizer: the optimizer whose learning rate is scheduled.

        Returns:
            The scheduler instance for CFG.scheduler.

        Raises:
            ValueError: if CFG.scheduler is not one of the supported names.

        Scheduler-specific options are read from CFG when present; otherwise the
        values shown in CFG's commented-out option lines are used as defaults.

        NOTE(review): the training loop calls scheduler.step() with no arguments
        after every batch, while the branches marked "epoch" look intended for
        per-epoch stepping — confirm before switching away from the constant
        warmup schedule.
        """
        name = CFG.scheduler
        if name == 'ReduceLROnPlateau': # epoch
            return ReduceLROnPlateau(
                optimizer,
                mode='min',
                factor=getattr(CFG, 'factor', 0.2),
                patience=getattr(CFG, 'patience', 4),
                verbose=True,
                eps=getattr(CFG, 'eps', 1e-6),
            )
        if name == 'CosineAnnealingLR': # epoch
            return CosineAnnealingLR(
                optimizer,
                T_max=getattr(CFG, 'T_max', 10),
                eta_min=getattr(CFG, 'min_lr', 1e-6),
                last_epoch=-1,
            )
        if name == 'CosineAnnealingWarmRestarts': # epoch
            return CosineAnnealingWarmRestarts(
                optimizer,
                T_0=getattr(CFG, 'T_0', 10),
                T_mult=1,
                eta_min=getattr(CFG, 'min_lr', 1e-6),
                last_epoch=-1,
            )
        if name == 'ConstantScheduleWithWarmup':
            # 35 warmup steps, then a constant learning rate.
            return get_constant_schedule_with_warmup(optimizer, 35)
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(
            f"Unknown CFG.scheduler {name!r}; expected one of 'ReduceLROnPlateau', "
            "'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'ConstantScheduleWithWarmup'"
        )

In [None]:
@torch.no_grad()
def validation_fn(model, loader, loss_fn):
    """Return the mean per-batch loss of `model` over `loader`, without gradients.

    Args:
        model: module called as model(input_ids=..., attention_mask=..., token_type_ids=...).
        loader: iterable yielding (input_ids, attention_mask, token_type_ids, target) batches.
        loss_fn: callable applied to (predictions squeezed on the last dim, target).

    Returns:
        The unweighted mean of the per-batch loss values.

    The model is switched to eval mode for the duration of the pass so that
    dropout does not perturb the validation metric, and its previous
    train/eval mode is restored afterwards.
    """
    was_training = model.training
    model.eval()
    try:
        batch_losses = []
        for texts, attns, token_type, target in loader:
            outputs = model(
                input_ids=texts,
                attention_mask=attns,
                token_type_ids=token_type
            )
            loss = loss_fn(outputs.squeeze(-1), target)
            batch_losses.append(loss.item())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    return np.array(batch_losses).mean()

def oof_preds(ds, tloader, cv, y, epochs = CFG.epochs):
    """Train one model per CV fold and add fold-averaged test predictions to `subm`.

    For every fold a fresh CommonLitModel is trained for `epochs` epochs with MSE
    loss; after each epoch the validation RMSE is computed and the best weights
    so far are saved to ``fold{fold}_best.pth``. The best checkpoint is then
    reloaded, the test loader is scored, and ``prediction / number_of_folds`` is
    added to the `target` column of the module-level `subm` frame.

    Args:
        ds: full training dataset; folds are taken from it with D.Subset.
        tloader: test loader yielding (row_ids, input_ids, attention_mask, token_type_ids).
        cv: splitter exposing ``split(X, y)``.
        y: labels handed to ``cv.split`` (used for stratification).
        epochs: number of training epochs per fold.

    Returns:
        None. Results are written to `subm` and to the checkpoint files.
    """
    loss_fn = torch.nn.MSELoss()

    # Materialise the splits so the fold count used for averaging comes from the
    # splitter that was actually passed in.
    splits = list(cv.split(range(len(ds)), y))
    n_folds = len(splits)

    for fold, (train_idx, valid_idx) in enumerate(splits):

        train_ds = D.Subset(ds, train_idx)
        loader = D.DataLoader(train_ds, batch_size=CFG.batch_size,
                              shuffle=True, collate_fn = collate_fn,num_workers=0)

        valid_ds = D.Subset(ds, valid_idx)
        vloader = D.DataLoader(valid_ds, batch_size=CFG.batch_size,
                      shuffle=False, collate_fn = collate_fn,num_workers=0)

        config = AutoConfig.from_pretrained(model_path)
        config.update({'num_labels': 1})
        model = CommonLitModel(model_path, config=config)
        model = model.to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), CFG.lr,
                                betas=(0.9, 0.999), weight_decay=1e-1)
        scheduler = get_scheduler(optimizer)
        b_loss = np.inf  # best validation RMSE seen so far in this fold

        print(header)
        for epoch in range(1, epochs+1):
            start_time = time.time()
            tloss = []
            model.train()

            for texts, attns, token_type, target in loader:
                optimizer.zero_grad()
                outputs = model(
                    input_ids=texts,
                    attention_mask=attns,
                    token_type_ids=token_type
                )
                loss = loss_fn(outputs.squeeze(-1), target)
                tloss.append(loss.item())
                loss.backward()
                optimizer.step()
                # The scheduler is advanced once per optimisation step.
                scheduler.step()
            tloss = np.array(tloss).mean()
            # Validate with dropout disabled; model.train() above re-enables it
            # at the start of the next epoch.
            model.eval()
            vloss = validation_fn(model, vloader, loss_fn)
            tmetric = tloss**.5
            vmetric = vloss**.5
            print(raw_line.format(epoch,tloss,tmetric,vloss,vmetric,(time.time()-start_time)/60))
            del loss, outputs

            if vmetric <= b_loss:
                b_loss = vmetric
                torch.save(model.state_dict(), f"fold{fold}_best.pth")

        # Rebuild the model and restore this fold's best checkpoint.
        # NOTE(review): strict=False silently ignores missing/unexpected keys;
        # confirm whether that leniency is intentional.
        model = CommonLitModel(model_path, config=config)
        model.load_state_dict(torch.load(f"fold{fold}_best.pth"), strict=False)
        model.to(DEVICE)

        model.eval();
        # Get prediction for test set
        ids, preds = [], []
        with torch.no_grad():
            for batch_ids, texts, attn, token_type in tloader:
                outputs = model(
                    input_ids=texts,
                    attention_mask=attn,
                    token_type_ids=token_type
                )
                ids.extend(batch_ids)
                preds.append(outputs.detach().squeeze(-1).cpu().numpy())

        # Add this fold's share of the average to the running submission total.
        preds = np.concatenate(preds)
        subm.loc[ids, 'target']  =  subm.loc[ids, 'target'].values + preds / n_folds

        del model, vloader, loader, train_ds, valid_ds
        torch.cuda.empty_cache()

In [None]:
# Run the cross-validated training. oof_preds returns nothing: it writes its test
# predictions into the module-level `subm` frame.
oof_preds(ds, tloader, cv, y, epochs = CFG.epochs)

In [None]:
# Write the submission frame (with its id index) to submission.csv in the working directory.
subm.to_csv('submission.csv')

In [None]:
# Quick visual check of the first submission rows.
subm.head()