In [None]:
import gc
# Explicitly switch automatic garbage collection on; later cells also call
# gc.collect() by hand right after deleting each demo model.
gc.enable()

In [None]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version 20210331 --apt-packages libomp5 libopenblas-dev
!rm -rf /kaggle/working/*.whl
!rm -rf /kaggle/working/*.py+
!pip install accelerate

# RoBERTa Model

In [None]:
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification
)
from transformers import logging
# Two consecutive verbosity calls; the second one leaves the level at ERROR.
logging.set_verbosity_warning()
logging.set_verbosity_error()

# Hyper-parameters for this optimizer demo.
_pretrained_model = 'roberta-base'
lr = 3e-5
epsilon = 1e-6
weight_decay = 0.001
use_bertadam = False

config = AutoConfig.from_pretrained(_pretrained_model)
model = AutoModelForSequenceClassification.from_pretrained(_pretrained_model, config=config)

# A parameter whose name contains any of these substrings gets weight_decay=0.
no_decay = ["bias", "LayerNorm.weight"]


def _is_decay_exempt(param_name):
    """Return True when ``param_name`` contains one of the ``no_decay`` substrings."""
    return any(marker in param_name for marker in no_decay)


# Group 0: decayed parameters. Group 1: parameters exempt from decay.
# Comprehensions are used on purpose: their loop variables do not stay bound
# at notebook scope, so nothing keeps the weights alive after `del model`.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not _is_decay_exempt(n)],
        "weight_decay": weight_decay,
        "lr": lr,
    },
    {
        "params": [p for n, p in model.named_parameters() if _is_decay_exempt(n)],
        "weight_decay": 0.0,
        "lr": lr,
    },
]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=lr,
    eps=epsilon,
    # Bias correction is switched off only when use_bertadam is True.
    correct_bias=not use_bertadam,
)

# The objects were built for demonstration only; release them again.
del model, optimizer_grouped_parameters, optimizer
gc.collect();

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead

# Attribute name under which Net stores its encoder; it is read back with
# getattr(model, _model_type) further down in this cell.
_model_type = 'roberta'
# Checkpoint name used for both the config and the encoder weights.
_pretrained_model = 'roberta-base'
config = RobertaConfig.from_pretrained(_pretrained_model)
# Forwarded to RobertaModel.from_pretrained as add_pooling_layer.
add_pooler = True
# When True, the pooler's dense layer is re-initialised after the model is built.
reinit_pooler = True

class Net(nn.Module):
    """RoBERTa encoder followed by a ``RobertaClassificationHead``.

    Args:
        config: Configuration object handed to ``RobertaClassificationHead``.
        _pretrained_model: Name or path passed to ``RobertaModel.from_pretrained``.
        add_pooler: Forwarded as ``add_pooling_layer`` when loading the encoder.
    """

    def __init__(self, config, _pretrained_model, add_pooler):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(
            _pretrained_model,
            add_pooling_layer=add_pooler,
        )
        self.classifier = RobertaClassificationHead(config)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return the classification head's logits."""
        encoder_outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Only element 0 of the encoder output is fed to the classification head.
        return self.classifier(encoder_outputs[0])
        
def _reinit_pooler(encoder):
    """Re-initialise the dense layer of ``encoder.pooler`` in place.

    The work happens inside a function so that no reference to the encoder or
    its parameters stays bound at notebook scope; otherwise the later
    ``del model`` / ``gc.collect()`` could not release the weights.

    Args:
        encoder: Module expected to expose ``pooler`` and ``config.initializer_range``.

    Returns:
        True when a pooler was found and re-initialised, False when the encoder
        has no pooler (e.g. it was built with ``add_pooler=False``).
    """
    pooler = getattr(encoder, 'pooler', None)
    if pooler is None:
        return False
    # Weights ~ N(0, initializer_range), bias = 0.
    pooler.dense.weight.data.normal_(mean=0.0, std=encoder.config.initializer_range)
    pooler.dense.bias.data.zero_()
    # Make sure the freshly initialised pooler is trainable.
    for param in pooler.parameters():
        param.requires_grad = True
    return True


model = Net(config, _pretrained_model, add_pooler)

if reinit_pooler:
    print('Reinitializing Pooler Layer ...')
    if _reinit_pooler(getattr(model, _model_type)):
        print('Done.!')
    else:
        print('Skipped: the encoder has no pooler layer.')

# Demo only: drop the model again so its memory can be reclaimed.
del model
gc.collect();

In [None]:
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import logging
# Show only errors from transformers.
logging.set_verbosity_error()

reinit_layers = 2
_model_type = 'roberta'
_pretrained_model = 'roberta-base'
config = AutoConfig.from_pretrained(_pretrained_model)
config.update({'num_labels':1})
# The edited config must be handed to from_pretrained; otherwise the
# num_labels override above never reaches the model.
model = AutoModelForSequenceClassification.from_pretrained(_pretrained_model, config=config)


def _reinit_last_roberta_layers(encoder, num_layers, std):
    """Re-initialise the last ``num_layers`` layers of ``encoder.encoder.layer`` in place.

    Kept in a function so the loop variables do not remain bound at notebook
    scope, where they would keep layers alive after ``del model``.

    Args:
        encoder: Module exposing ``encoder.layer`` (a sliceable sequence of layers).
        num_layers: Number of trailing layers to reset; must be greater than 0,
            because a slice starting at ``-0`` would select every layer.
        std: Standard deviation of the normal distribution used for weights.
    """
    for layer in encoder.encoder.layer[-num_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=std)
                # Keep the padding row at zero, if there is one.
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)


if reinit_layers > 0:
    print(f'Reinitializing Last {reinit_layers} Layers ...')
    _reinit_last_roberta_layers(getattr(model, _model_type), reinit_layers, config.initializer_range)
    print('Done.!')

# Demo only: drop the model again so its memory can be reclaimed.
del model
gc.collect();

In [None]:
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import logging
from transformers.models.xlnet.modeling_xlnet import XLNetRelativeAttention
# Show only errors from transformers.
logging.set_verbosity_error()

reinit_layers = 2
_model_type = 'xlnet'
_pretrained_model = 'xlnet-base-cased'
config = AutoConfig.from_pretrained(_pretrained_model)
config.update({'num_labels':1})
# The edited config must be handed to from_pretrained; otherwise the
# num_labels override above never reaches the model.
model = AutoModelForSequenceClassification.from_pretrained(_pretrained_model, config=config)


def _reinit_last_xlnet_layers(transformer, num_layers):
    """Re-initialise the last ``num_layers`` entries of ``transformer.layer`` in place.

    Kept in a function so the loop variables do not remain bound at notebook
    scope, where they would keep layers alive after ``del model``.

    Args:
        transformer: Module exposing ``layer`` and ``config.initializer_range``.
        num_layers: Number of trailing layers to reset; must be greater than 0,
            because a slice starting at ``-0`` would select every layer.
    """
    std = transformer.config.initializer_range
    for layer in transformer.layer[-num_layers:]:
        for module in layer.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=std)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            elif isinstance(module, XLNetRelativeAttention):
                # These attributes are plain parameters rather than sub-modules,
                # so they are reset one by one.
                for param in (
                    module.q,
                    module.k,
                    module.v,
                    module.o,
                    module.r,
                    module.r_r_bias,
                    module.r_s_bias,
                    module.r_w_bias,
                    module.seg_embed,
                ):
                    param.data.normal_(mean=0.0, std=std)


if reinit_layers > 0:
    print(f'Reinitializing Last {reinit_layers} Layers ...')
    _reinit_last_xlnet_layers(model.transformer, reinit_layers)
    print('Done.!')

# Demo only: drop the model again so its memory can be reclaimed.
del model
gc.collect();

In [None]:
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import logging
from transformers.models.xlnet.modeling_xlnet import XLNetRelativeAttention
# Show only errors from transformers.
logging.set_verbosity_error()

reinit_layers = 2
_model_type = 'bart'
_pretrained_model = 'facebook/bart-base'
config = AutoConfig.from_pretrained(_pretrained_model)
config.update({'num_labels':1})
# The edited config must be handed to from_pretrained; otherwise the
# num_labels override above never reaches the model.
model = AutoModelForSequenceClassification.from_pretrained(_pretrained_model, config=config)


def _reinit_last_bart_decoder_layers(bart, num_layers):
    """Re-initialise the last ``num_layers`` entries of ``bart.decoder.layers`` in place.

    Every sub-module of those layers is passed to ``bart._init_weights``.
    Kept in a function so the loop variables do not remain bound at notebook
    scope, where they would keep layers alive after ``del model``.

    Args:
        bart: Module exposing ``decoder.layers`` and ``_init_weights``.
        num_layers: Number of trailing decoder layers to reset; must be greater
            than 0, because a slice starting at ``-0`` would select every layer.
    """
    for layer in bart.decoder.layers[-num_layers:]:
        for module in layer.modules():
            bart._init_weights(module)


if reinit_layers > 0:
    print(f'Reinitializing Last {reinit_layers} Layers ...')
    _reinit_last_bart_decoder_layers(model.model, reinit_layers)
    print('Done.!')

# Demo only: drop the model again so its memory can be reclaimed.
del model
gc.collect();

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that pairs each excerpt with its target value.

    Args:
        df: Table with an ``'excerpt'`` and a ``'target'`` column; both are
            converted to NumPy arrays on construction.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self,df,tokenizer,max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Return ``(tokenizer_output, target)`` for row ``idx``; target is a float tensor."""
        tokenizer_options = {
            'return_tensors': 'pt',
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
        }
        encode = self.tokenizer(self.excerpt[idx], **tokenizer_options)
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return encode, target

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling that collapses dim 1 of ``(batch, seq_len, in_features)`` input.

    A score per position is computed as ``V(tanh(W(features)))``, normalised
    with a softmax over dim 1, and used to form a weighted sum of ``features``.

    Args:
        in_features: Size of the last dimension of the input features.
        hidden_dim: Width of the intermediate scoring layer.
        num_targets: Accepted for interface compatibility; not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The pooled vector is a weighted sum of the inputs, so its width is
        # in_features (not hidden_dim).
        self.out_features = in_features

    def forward(self, features, attention_mask=None):
        """Pool ``features`` over dim 1.

        Args:
            features: Tensor of shape ``(batch, seq_len, in_features)``.
            attention_mask: Optional ``(batch, seq_len)`` tensor; positions
                where it is 0/False receive zero attention weight. When
                omitted, every position takes part in the softmax.

        Returns:
            Tensor of shape ``(batch, in_features)``.
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        if attention_mask is not None:
            # Push masked scores to the most negative representable value so
            # the softmax assigns them zero weight.
            masked_out = ~attention_mask.unsqueeze(-1).to(torch.bool)
            score = score.masked_fill(masked_out, torch.finfo(score.dtype).min)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * features, dim=1)

        return context_vector

In [None]:
class Model(nn.Module):
    """Pretrained encoder + attention pooling + single-output linear layer.

    Args:
        path: Name or path handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path,output_hidden_states=True)
        # Take the width from the loaded encoder instead of hard-coding 768,
        # so checkpoints with a different hidden size work as well.
        hidden_size = self.roberta.config.hidden_size
        self.head = AttentionHead(hidden_size,hidden_size,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size,1)

    def forward(self,**xb):
        """Run the encoder on ``xb`` and return the output of the final linear layer.

        Args:
            **xb: Keyword arguments forwarded unchanged to the encoder.
        """
        # Element 0 of the encoder output is pooled by the attention head.
        x = self.roberta(**xb)[0]
        x = self.head(x)
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [None]:
def run(fold,verbose=True):
    """Train and validate the model on one cross-validation fold.

    Rows of the global ``train_data`` whose ``Fold`` column equals ``fold`` form
    the validation set; all other rows form the training set. Whenever the mean
    validation loss is less than or equal to the best value seen so far, the
    model weights and the tokenizer are written to ``./model{fold}``.

    Args:
        fold: Fold index held out for validation.
        verbose: When True, print a progress message on each new best validation loss.

    Returns:
        The best (lowest) mean validation loss observed for this fold.
    """
    import os  # function-scope import: only needed to create the checkpoint folder

    def loss_fn(outputs,targets):
        """Square root of the MSE between flattened outputs and targets."""
        outputs = outputs.view(-1)
        targets = targets.view(-1)
        return torch.sqrt(nn.MSELoss()(outputs,targets))

    def train_and_evaluate_loop(train_loader,valid_loader,model, loss_fn,optimizer,epoch,fold,best_loss,valid_step=5,lr_scheduler=None):
        """Run one training epoch with periodic validation and checkpointing.

        Validation runs every ``valid_step`` batches and on the last batch.
        Returns the (possibly updated) best validation loss.
        """
        save_dir = f'./model{fold}'
        train_loss = 0
        for i, (inputs1,targets1) in enumerate(train_loader):
            # Validation below switches to eval mode, so re-enable train mode each step.
            model.train()
            optimizer.zero_grad()
            # Flatten every tensor in the batch to (batch, -1) before the forward pass.
            inputs1 = {key:val.reshape(val.shape[0],-1) for key,val in inputs1.items()}
            outputs1 = model(**inputs1)
            loss1 = loss_fn(outputs1,targets1)
            accelerator.backward(loss1)
            optimizer.step()

            train_loss += loss1.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

            # Evaluate every `valid_step` batches and on the final batch of the epoch.
            if (i % valid_step == 0) or (i == (len(train_loader)-1)):
                model.eval()
                valid_loss = 0
                with torch.no_grad():
                    for inputs2,targets2 in valid_loader:
                        inputs2 = {key:val.reshape(val.shape[0],-1) for key,val in inputs2.items()}
                        outputs2 = model(**inputs2)
                        loss2 = loss_fn(outputs2,targets2)
                        valid_loss += loss2.item()

                    # Mean of the per-batch validation losses.
                    valid_loss /= len(valid_loader)
                    if valid_loss <= best_loss:
                        if verbose:
                            xm.master_print(f"epoch:{epoch} | Train Loss:{train_loss/(i+1)} | Validation loss:{valid_loss}")
                            xm.master_print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")

                        best_loss = valid_loss
                        # Create the target folder first: the weights are written
                        # before tokenizer.save_pretrained gets a chance to create it.
                        os.makedirs(save_dir, exist_ok=True)
                        xm.save(model.state_dict(),f'{save_dir}/model{fold}.bin')
                        tokenizer.save_pretrained(save_dir)

        return best_loss

    accelerator = Accelerator()
    xm.master_print(f"{accelerator.device} is used")
    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    MODEL_PATH = '../input/clrp-roberta-base/clrp_roberta_base'
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    model = Model(MODEL_PATH)

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    optimizer = optim.AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])

    # NOTE(review): the schedule length is fixed at 10 epochs' worth of steps,
    # independent of config["epochs"] — confirm this is intended.
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps= 10 * len(train_dl))

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    xm.master_print(f"Fold: {fold}")
    # Large sentinel so the first validation pass always counts as an improvement.
    best_loss = 9999
    for epoch in range(config["epochs"]):
        xm.master_print(f"Epoch Started:{epoch}")
        best_loss = train_and_evaluate_loop(train_dl,valid_dl,model,loss_fn,optimizer,epoch,fold,best_loss,
                                            valid_step=config['valid_step'],lr_scheduler=lr_scheduler)

    return best_loss

In [None]:
# Train the folds one after another, from fold 0 up to config['nfolds'] - 1.
for fold_idx in range(config['nfolds']):
    run(fold_idx)

In [None]:
# Passed to WeightedLayerPooling as `layer_start`
# (presumably the first hidden layer included in the pooling — TODO confirm
# against the WeightedLayerPooling definition).
layer_start = 9
pooler = WeightedLayerPooling(
    config.num_hidden_layers, 
    layer_start=layer_start, layer_weights=None
)
# Element 2 of `outputs` is stored under the key the pooler is given
# (presumably the per-layer hidden states — TODO confirm where `outputs` is produced).
features.update({'all_layer_embeddings':outputs[2]})
features = pooler(features)
print("Weighted Layer Pooling Embeddings Shape: ", features['token_embeddings'].shape)

In [None]:
# Take index 0 along dim 1 of the pooled token embeddings
# (presumably the first token of each sequence — TODO confirm).
sequence_output = features['token_embeddings'][:, 0]
# Project through a freshly constructed linear layer to a single value per row.
outputs = nn.Linear(config.hidden_size, 1)(sequence_output)
print("Outputs Shape: ", outputs.shape)

# Drop the references to the model and tokenizer, then force a collection pass.
del model, tokenizer
gc.collect();