In [None]:
import numpy as np
import pandas as pd

import torch
import transformers

In [None]:
# Read the competition's train and test splits in one pass; the paths are
# relative to the notebook's working directory.
train, test = map(
    pd.read_csv,
    (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
    ),
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Tokenizer loaded from the "bert-base-cased" checkpoint. It is kept as a
# notebook-level global because TextData reads it by name.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Display the tokenizer's repr.
tokenizer

In [None]:
from pprint import pprint

# Show what the BERT tokenizer produces for the first excerpt. The encoding is
# converted to a plain dict and pretty-printed (compact, so the long id lists
# wrap instead of printing one id per line) rather than dumped on one line.
pprint(dict(tokenizer(train['excerpt'][0])), compact=True)

In [None]:
# BERT with a sequence-classification head, loaded with num_labels=1.
# NOTE: the name `model` is rebound to other models further down the notebook.
model = transformers.BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=1)

In [None]:
# Inspect the configuration attached to the loaded model.
model.config

In [None]:
# Display the model's module tree.
model

In [None]:
# Look up word embeddings for a hand-written sequence of nine token ids
# (presumably the start of a tokenized excerpt -- TODO confirm).
model.bert.embeddings.word_embeddings(torch.tensor([[101, 1332, 1103, 1685, 1234, 1608, 1106, 1103, 20511]]))  # append .shape to show only the size

In [None]:
# Look up the position embeddings for positions 0..511.
model.bert.embeddings.position_embeddings(torch.tensor([list(range(512))]))  # append .shape to show only the size

In [None]:
# Look up the token-type embeddings for type ids 0 and 1.
model.bert.embeddings.token_type_embeddings(torch.tensor([list(range(2))]))  # append .shape to show only the size

In [None]:
# Run the complete embeddings module on the same nine hand-written token ids.
model.bert.embeddings(torch.tensor([[101, 1332, 1103, 1685, 1234, 1608, 1106, 1103, 20511]]))  # append .shape to show only the size

In [None]:
# Display the first encoder layer. To run it, call it on the word embeddings of
# a token-id tensor and take element [0] of the result.
model.bert.encoder.layer[0]

In [None]:
# Display the BERT backbone without the classification head. It can also be
# called directly on a tensor of token ids.
model.bert

In [None]:
# Display the full model (backbone plus head) again for comparison.
model

In [None]:
# Smoke-test a forward pass with three dummy token ids and one label; passing
# `labels` makes the model return a loss alongside the logits.
# NOTE(review): the label tensor is integer-typed -- confirm that is intended
# for a single-output (num_labels=1) head.
model(torch.tensor([[1,23,4]]), labels=torch.tensor([[1]]))

In [None]:
from transformers import AutoTokenizer

# Second tokenizer, loaded from the "xlnet-base-cased" checkpoint. It is kept as
# a notebook-level global because TextData reads it by name.
tokenizer_xlnet = AutoTokenizer.from_pretrained("xlnet-base-cased")

# Display the tokenizer's repr.
tokenizer_xlnet

In [None]:
# Tokenize a batch of two short strings to inspect the XLNet encoding layout.
tokenizer_xlnet(["text", "text2"])

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class TextData(Dataset):
    """Fixed-length BERT and XLNet encodings of each excerpt.

    Every sample holds two encodings of the same text, one per tokenizer, each
    padded or truncated to ``max_len`` tokens. Padding is delegated to the
    tokenizers (``padding='max_length'``) so that each one applies its own pad
    token id and padding side. This matters for the second encoding: a model
    that reads the final position must find a real token there, which manual
    right-padding with zeros does not guarantee.

    Args:
        text: Indexable sequence of excerpt strings.
        labels: Indexable sequence of regression targets aligned with
            ``text``, or ``None`` for unlabeled data (no ``'label'`` key is
            produced in that case).
        max_len: Length every encoding is padded/truncated to.
        bert_tokenizer: Tokenizer for the first encoding. Defaults to the
            notebook-level ``tokenizer``.
        xlnet_tokenizer: Tokenizer for the second encoding. Defaults to the
            notebook-level ``tokenizer_xlnet``.
    """

    def __init__(self, text, labels=None, max_len=250, bert_tokenizer=None, xlnet_tokenizer=None):
        self.text = text
        self.labels = labels
        # The notebook globals are only looked up when no tokenizer is injected.
        self.tokenizer = tokenizer if bert_tokenizer is None else bert_tokenizer
        self.tokenizer_xlnet = tokenizer_xlnet if xlnet_tokenizer is None else xlnet_tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text)

    def _encode(self, tok, excerpt):
        """Encode ``excerpt`` with ``tok``, padded/truncated to ``max_len``."""
        return tok(
            excerpt, max_length=self.max_len, truncation=True, padding='max_length',
            return_attention_mask=True, return_token_type_ids=True)

    def __getitem__(self, item):
        """Return a dict of ``torch.long`` tensors (plus a double ``'label'``).

        Keys without a suffix come from the first tokenizer; keys ending in
        ``2`` come from the second one.
        """
        # Replace newlines with a space: removing them outright would glue the
        # last word of one line to the first word of the next.
        excerpt = self.text[item].replace('\n', ' ')

        bert_enc = self._encode(self.tokenizer, excerpt)
        xlnet_enc = self._encode(self.tokenizer_xlnet, excerpt)

        sample = {
            'input_ids': torch.tensor(bert_enc['input_ids'], dtype=torch.long),
            'token_type_ids': torch.tensor(bert_enc['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(bert_enc['attention_mask'], dtype=torch.long),
            'input_ids2': torch.tensor(xlnet_enc['input_ids'], dtype=torch.long),
            'token_type_ids2': torch.tensor(xlnet_enc['token_type_ids'], dtype=torch.long),
            'attention_mask2': torch.tensor(xlnet_enc['attention_mask'], dtype=torch.long),
        }
        if self.labels is not None:
            sample['label'] = torch.tensor(self.labels[item], dtype=torch.double)
        return sample

In [None]:
# Positional split: the first 2000 rows train, the remainder validate.
# `.iloc` is used because `.loc[:2000]` is label-based and includes its end
# point, which would put row 2000 in BOTH sets and leak it into validation.
train_dataset = TextData(train['excerpt'].iloc[:2000].values, train['target'].iloc[:2000].values)
valid_dataset = TextData(train['excerpt'].iloc[2000:].values, train['target'].iloc[2000:].values)

In [None]:
# Batches of 16 for both splits; only the training loader reshuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=16)

In [None]:
# Pull a single batch from the training loader to inspect its contents.
next(iter(train_dataloader))

In [None]:
from transformers import AutoModel

# Load the bare BERT backbone (no task head) and display its module tree; the
# result is not assigned to a name.
AutoModel.from_pretrained('bert-base-cased', output_hidden_states=False)

In [None]:
from transformers import AutoModel

# Load the bare XLNet backbone for inspection.
# NOTE: this rebinds the notebook-level name `model`, replacing whatever it
# pointed to before.
model = AutoModel.from_pretrained('xlnet-base-cased', output_hidden_states=False)

In [None]:
# Read `d_model` from the loaded XLNet configuration.
model.config.d_model

In [None]:
# Shape of the hidden state at the final sequence position for a dummy
# three-token input.
model(torch.tensor([[1,2,3]])).last_hidden_state[:,-1,:].shape

In [None]:
from transformers import AutoModel

class RegressionModel(torch.nn.Module):
    """Regressor that blends one score from BERT and one from XLNet.

    Each encoder's sequence summary is projected to a scalar by its own linear
    head; a final 2 -> 1 linear layer combines the two scalars.

    * BERT summary: ``outputs[1]`` of the encoder call.
    * XLNet summary: the hidden state of the last *attended* token, located via
      ``attention_mask2``. Reading position ``-1`` unconditionally would pick up
      a padding position whenever the batch is right-padded.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=False)
        self.xlnet = AutoModel.from_pretrained('xlnet-base-cased', output_hidden_states=False)
        self.dropout = torch.nn.Dropout(0.1)
        # One head per encoder, each sized from its own config. A single shared
        # head only fits when both hidden widths happen to be equal, and it
        # forces one weight vector onto two unrelated representation spaces.
        self.regressor = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.regressor_xlnet = torch.nn.Linear(self.xlnet.config.d_model, 1)
        self.predictor = torch.nn.Linear(2, 1)

    @staticmethod
    def _last_real_token(hidden_states, attention_mask):
        """Return the hidden state of each row's last token whose mask is non-zero.

        Works for left- and right-padded batches. Falls back to the final
        position when no mask is given.
        """
        if attention_mask is None:
            return hidden_states[:, -1, :]
        positions = torch.arange(hidden_states.size(1), device=hidden_states.device)
        # mask * position is largest at the right-most attended position.
        last_index = (attention_mask.to(torch.long) * positions).argmax(dim=1)
        rows = torch.arange(hidden_states.size(0), device=hidden_states.device)
        return hidden_states[rows, last_index]

    def forward(self, input_ids, attention_mask, token_type_ids, input_ids2, attention_mask2, token_type_ids2, label=None):
        """Run both encoders and return ``(loss?, logits, *bert_extras)``.

        Args:
            input_ids, attention_mask, token_type_ids: Inputs for BERT.
            input_ids2, attention_mask2, token_type_ids2: Inputs for XLNet.
            label: Optional targets. When given, the batch RMSE is computed and
                prepended to the output, and ``logits`` is flattened and cast to
                the label dtype.

        Returns:
            ``(logits,) + outputs[1:]`` of the BERT call, preceded by the loss
            when ``label`` is provided.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        outputs2 = self.xlnet(
            input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
        )

        bert_score = self.regressor(self.dropout(outputs[1]))
        xlnet_summary = self._last_real_token(outputs2.last_hidden_state, attention_mask2)
        xlnet_score = self.regressor_xlnet(self.dropout(xlnet_summary))

        logits = self.predictor(torch.cat((bert_score, xlnet_score), 1))

        loss = None
        if label is not None:
            loss_fn = torch.nn.MSELoss()
            logits = logits.view(-1).to(label.dtype)
            # Root of the batch MSE, i.e. the RMSE of this batch.
            loss = torch.sqrt(loss_fn(logits, label.view(-1)))

        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: rebinds the notebook-level name `model` to the model that gets trained.
model = RegressionModel().to(device)

In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation
# (and is no longer importable from newer transformers releases).
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the transformers
# implementation so the optimisation behaviour stays the same; torch's own
# defaults are eps=1e-8 and weight_decay=1e-2.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler

# One scheduler step per optimizer step, for every batch of every epoch.
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# The model returns the RMSE of each batch as its loss. Averaging those values
# does not give the RMSE of the epoch: sqrt is non-linear and the last batch is
# usually smaller. Instead, each batch's sum of squared errors is recovered as
# rmse**2 * batch_size, accumulated, and the root is taken once at the end.
for epoch in range(num_epochs):

    model.train()

    squared_error = 0.0
    n_seen = 0

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        batch_size = batch['label'].size(0)
        squared_error += loss.item() ** 2 * batch_size
        n_seen += batch_size

    # Running training RMSE (dropout active, weights changing during the epoch).
    print(f"train RMSE: {np.sqrt(squared_error / max(n_seen, 1))}")

    model.eval()

    squared_error = 0.0
    n_seen = 0

    # No gradients are needed for validation.
    with torch.no_grad():
        for batch in valid_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            batch_size = batch['label'].size(0)
            squared_error += outputs[0].item() ** 2 * batch_size
            n_seen += batch_size

    print(f"valid RMSE: {np.sqrt(squared_error / max(n_seen, 1))}")

In [None]:
# Drop the notebook's reference to the trained model.
del model

In [None]:
# Ask PyTorch to release the cached GPU memory it is no longer using.
torch.cuda.empty_cache()