In [None]:
import numpy as np
import pandas as pd

import torch
import transformers

!pip install textstat

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(42)


In [None]:
# Read the train and test CSV files, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
    )
)
train.head()

In [None]:
import matplotlib.pyplot as plt

# Three looks at the regression target, laid out side by side on explicit axes
# so each panel can carry its own title and axis labels.
fig, (ax_hist, ax_err, ax_order) = plt.subplots(1, 3, figsize=(20, 4))

# Histogram of the target values.
train['target'].hist(ax=ax_hist)
ax_hist.set(title='target distribution', xlabel='target', ylabel='count')

# Target against the standard_error column.
ax_err.scatter(train['target'], train['standard_error'])
ax_err.set(title='standard_error vs target', xlabel='target', ylabel='standard_error')

# Target plotted against the frame's index.
ax_order.plot(train['target'])
ax_order.set(title='target by row', xlabel='row index', ylabel='target')

plt.show()

In [None]:
import textstat

# Score every excerpt with three textstat readability formulas.
excerpts = train['excerpt'].values
ri = [textstat.textstat.automated_readability_index(text) for text in excerpts]
rf = [textstat.textstat.flesch_reading_ease(text) for text in excerpts]
rd = [textstat.textstat.dale_chall_readability_score_v2(text) for text in excerpts]

# One labelled scatter per formula, each plotted against the target.
fig, axes = plt.subplots(1, 3, figsize=(20, 4))
panels = [
    ('automated_readability_index', ri),
    ('flesch_reading_ease', rf),
    ('dale_chall_readability_score_v2', rd),
]
for ax, (score_name, scores) in zip(axes, panels):
    ax.scatter(scores, train['target'])
    ax.set(title=f'target vs {score_name}', xlabel=score_name, ylabel='target')
plt.show()

# Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "roberta-base" checkpoint; the dataset
# class defined below reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextData(Dataset):
    """Map-style dataset that tokenizes texts and pads them to a fixed length.

    Args:
        text: indexable collection of strings.
        labels: indexable collection of numeric targets aligned with ``text``.
        max_len: length every sample is truncated and padded to.
        tok: tokenizer to use; when omitted the notebook-level ``tokenizer``
            is used, which keeps existing ``TextData(text, labels)`` calls working.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (long tensors
    of length ``max_len``) and ``label`` (a double scalar tensor).
    """

    def __init__(self, text, labels, max_len=250, tok=None):
        self.text = text
        self.labels = labels
        # Resolve the tokenizer once so __getitem__ never reaches for a global.
        self.tokenizer = tokenizer if tok is None else tok
        self.max_len = max_len
        # Pad with the tokenizer's own pad id instead of a hard-coded 0: in the
        # RoBERTa vocabulary id 0 is the <s> token, not padding. Fall back to 0
        # only when the tokenizer defines no pad token.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        self.pad_id = 0 if pad_id is None else pad_id

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Newlines become spaces so the words on either side are not fused
        # into a single token sequence.
        encoded = self.tokenizer(
            self.text[item].replace('\n', ' '),
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True,
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']

        # Number of positions still missing to reach max_len.
        padding_length = self.max_len - len(input_ids)

        return {
            'input_ids': torch.tensor(
                input_ids + [self.pad_id] * padding_length, dtype=torch.long),
            # Padded positions are masked out with zeros.
            'attention_mask': torch.tensor(
                attention_mask + [0] * padding_length, dtype=torch.long),
            'label': torch.tensor(self.labels[item], dtype=torch.double),
        }

In [None]:
# Display the initial seed of torch's default random generator.
torch.initial_seed()

In [None]:
from torch.utils.data.dataset import random_split

# Re-seed right before splitting so the split is reproducible.
torch.manual_seed(42)

dataset = TextData(train['excerpt'].values, train['target'].values)

# Keep (up to) 2000 samples for training and hold out the rest for validation.
# Deriving the validation size from len(dataset) avoids the ValueError that
# random_split raises when hard-coded lengths do not sum to the dataset size.
n_train = min(2000, len(dataset))
train_dataset, valid_dataset = random_split(dataset, [n_train, len(dataset) - n_train])


In [None]:
# Loaders keyed by stage name; only the training loader shuffles its samples.
loaders = dict(
    train=DataLoader(train_dataset, batch_size=16, shuffle=True),
    valid=DataLoader(valid_dataset, batch_size=16),
)

# Model

In [None]:
from transformers import AutoModel

class ReadModel(torch.nn.Module):
    """Pretrained transformer encoder followed by a small MLP regression head.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
            Defaults to ``'roberta-base'``.
    """

    def __init__(self, model_name='roberta-base'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name, output_hidden_states=False)
        self.dropout = torch.nn.Dropout(0.2)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so the head matches whichever checkpoint is loaded.
        encoder_width = self.bert.config.hidden_size
        self.hidden = torch.nn.Sequential(
            torch.nn.Linear(encoder_width, 384),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(384, 128),
            torch.nn.LeakyReLU(),
        )
        self.regressor = torch.nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: mask forwarded to the encoder alongside the ids.

        Returns:
            Output of the final ``Linear(128, 1)`` layer.
        """
        # Keyword arguments keep the mask from being bound to a wrong
        # positional slot of the encoder's forward().
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at the first sequence position as the summary.
        first_token = encoded.last_hidden_state[:, 0]
        features = self.hidden(self.dropout(first_token))
        return self.regressor(features)

In [None]:
model = ReadModel()

# Modules whose weights stay fixed during training: the embeddings and
# encoder layers 0 through 9. Everything else remains trainable.
frozen_modules = [model.bert.embeddings]
frozen_modules += [model.bert.encoder.layer[layer_idx] for layer_idx in range(10)]

for frozen_module in frozen_modules:
    for weight in frozen_module.parameters():
        weight.requires_grad = False


# Runner

In [None]:
import catalyst
from catalyst import dl, metrics, utils
# Display the installed catalyst version.
catalyst.__version__

In [None]:
import torch
from torch.nn import functional as F

class CustomRunner(dl.Runner):
    """Runner that optimizes MSE and tracks per-loader "loss" (RMSE) and "mae"."""

    def predict_batch(self, batch):
        """Run the model on one batch and return its raw output.

        The tensors are passed in the same layout that ``handle_batch`` uses
        (no transpose), so inference sees the same input shape as training.
        """
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        return self.model(input_ids, attention_mask)

    def on_loader_start(self, runner):
        """Reset the running meters at the start of every loader."""
        super().on_loader_start(runner)
        self.meters = {
            key: metrics.AdditiveValueMetric(compute_on_call=False)
            for key in ["loss", "mae"]
        }

    def handle_batch(self, batch):
        """Compute predictions and metrics for one batch; step when training."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Targets and predictions are both shaped (-1, 1) and cast to float32.
        y = batch['label'].view(-1, 1).float()
        y_pred = self.model(input_ids, attention_mask).view(-1, 1).float()

        self.batch = {'logits': y_pred, 'target': y}

        loss = F.mse_loss(y_pred.view(-1), y.view(-1))

        # The reported "loss" is the square root of the batch MSE, while the
        # quantity that is backpropagated below is the MSE itself.
        self.batch_metrics.update({"loss": loss**0.5, "mae": F.l1_loss(y_pred, y)})
        for key in ["loss", "mae"]:
            self.meters[key].update(self.batch_metrics[key].item(), self.batch_size)

        if self.is_train_loader:
            # A single backward pass per batch: there is no second backward
            # through this graph, so it does not need to be retained.
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

    def on_loader_end(self, runner):
        """Publish the accumulated meter values as loader-level metrics."""
        for key in ["loss", "mae"]:
            # First element of compute() — presumably the running mean; confirm
            # against the installed catalyst version.
            self.loader_metrics[key] = self.meters[key].compute()[0]
        super().on_loader_end(runner)

# Train

In [None]:
# Not passed to runner.train below; the runner computes its loss itself.
criterion = torch.nn.MSELoss()

# One parameter group per sub-module: the encoder gets a learning rate ten
# times smaller than the two head modules.
param_groups = [
    {'params': module.parameters(), 'lr': learning_rate}
    for module, learning_rate in (
        (model.bert, 1e-5),
        (model.hidden, 1e-4),
        (model.regressor, 1e-4),
    )
]
optimizer = torch.optim.AdamW(param_groups)

runner = CustomRunner()

# Ten epochs; the "loss" metric on the "valid" loader is the one minimized.
train_kwargs = dict(
    model=model,
    optimizer=optimizer,
    loaders=loaders,
    logdir="logs",
    valid_loader="valid",
    valid_metric="loss",
    num_epochs=10,
    minimize_valid_metric=True,
    verbose=True,
    timeit=False,
)
runner.train(**train_kwargs)

In [None]:
# Write the runner's model weights (state dict only) to "n2_model.pth".
torch.save(runner.model.state_dict(), "n2_model.pth")
#qmodel = utils.quantize_model(model=runner.model)
#torch.save(qmodel.state_dict(), "q1_model.pth")

# Evaluate

In [None]:
# Loader settings for inference: fixed order, no dropped final batch.
test_params = dict(batch_size=128*4, shuffle=False, drop_last=False)

# The test frame has no targets here, so zeros stand in as placeholder labels.
f = TextData(test['excerpt'].values, np.zeros(len(test)))
f_generator = DataLoader(f, **test_params)

In [None]:
def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect its outputs.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        data_loader: loader yielding dicts with ``'input_ids'`` and
            ``'attention_mask'`` tensors; ``data_loader.dataset`` must
            support ``len()``.

    Returns:
        1-D numpy array with one value per dataset sample, in loader order.
    """
    model.eval()

    # Feed inputs to whichever device the model already lives on rather than
    # assuming CUDA, so the function also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for item in data_loader:
            input_ids = item['input_ids'].to(target_device)
            attention_mask = item['attention_mask'].to(target_device)

            pred = model(input_ids, attention_mask)

            # Copy this batch's outputs into the next free slice of the result.
            result[index : index + pred.shape[0]] = pred.flatten().cpu().numpy()
            index += pred.shape[0]

    return result

In [None]:
# Display the predictions for the test loader.
predict(runner.model,f_generator)

In [None]:
# Pair every test id with the model's prediction for that row.
submission_ids = test['id'].values
submission_targets = predict(runner.model, f_generator)
pre = pd.DataFrame({'id': submission_ids, 'target': submission_targets})


In [None]:
# Write the predictions to 'submission.csv' without the index column.
pre.to_csv('submission.csv', index=False)

