In [None]:
!pip install huggingface-hub
!pip install transformers

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.preprocessing import KBinsDiscretizer
import transformers
from transformers import AutoModel, BertTokenizerFast
import random
from torch.utils.tensorboard import SummaryWriter

# Prefer the GPU, but fall back to the CPU so the notebook still runs where CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the torch, Python and NumPy random number generators for repeatable runs.
torch.manual_seed(1)
random.seed(24)
np.random.seed(42)


class regressor_stratified:
    """Stratified shuffle-split for a continuous (regression) target.

    The target is first discretized into ``group_count`` ordinal bins with
    ``KBinsDiscretizer``; the bin ids are then used as the class labels for a
    ``StratifiedShuffleSplit``.

    Args:
        n_splits: number of train/validation splits to generate.
        group_count: number of bins the target is cut into.
        random_state: seed forwarded to ``StratifiedShuffleSplit``.
        strategy: binning strategy forwarded to ``KBinsDiscretizer``.
        val_size: fraction (or count) of samples placed in the validation split.
    """

    def __init__(self, n_splits=1, group_count=10, random_state=42, strategy='quantile', val_size=0.2):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits=n_splits, test_size=val_size, random_state=random_state)
        self.cv = StratifiedShuffleSplit(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins=self.group_count, encode='ordinal', strategy=self.strategy)

    def split(self, X, y, groups=None):
        """Yield ``(train_idx, val_idx)`` arrays of positional indices.

        ``y`` may be a pandas Series, a list or a NumPy array.
        """
        # Convert before reshaping: multi-dimensional indexing such as `series[:, None]`
        # is rejected by current pandas, and lists do not support it at all.
        target_column = np.asarray(y).reshape(-1, 1)
        kgroups = self.discretizer.fit_transform(target_column)[:, 0]
        return self.cv.split(X, kgroups, groups)

    def get_n_splits(self, X, y, groups=None):
        """Return the number of splits the underlying splitter will produce."""
        return self.cv.get_n_splits(X, y, groups)

In [None]:
# Debug aid: list the parent of the working directory.
!ls ..
# Load the competition training CSV from the Kaggle input mount.
df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
# Kernel-density estimate of the `target` column.
df.loc[:, ['target']].plot(kind='kde')
# Print the excerpt stored under index label 0 as a sample.
print(df.excerpt[0])

In [None]:
# Histogram of excerpt lengths, measured in whitespace-separated words
seq_len = df.excerpt.map(lambda text: len(text.split())).tolist()
pd.Series(seq_len).hist(bins=30)

In [None]:
# Hold out a validation set stratified on the target, binned into 10 equal-width groups.
splitter = regressor_stratified(group_count=10, random_state=42, strategy='uniform')
# Hand the target over as a plain NumPy array so the splitter never has to apply
# NumPy-style indexing to a pandas Series.
generator = splitter.split(df[['excerpt']], df['target'].to_numpy())

for train, val in generator:
    # The splitter yields *positional* row numbers, so select with iloc; matching them
    # against index labels is only correct when the index is a default RangeIndex.
    # Sorting keeps the rows in their original file order.
    train_df = df.iloc[np.sort(train)]
    train_df.target.plot(kind='kde')

    val_df = df.iloc[np.sort(val)]
    val_df.target.plot(kind='kde')

print(len(train_df), len(val_df))
val_df.head(1)


In [None]:
from torch.utils.data import Dataset
import random


def convert_examples_to_features(text, tokenizer, max_len):
    """Encode one text into a fixed-length tokenizer output.

    Args:
        text: the string to encode.
        tokenizer: object exposing ``encode_plus``.
        max_len: length every encoding is truncated and padded to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns; callers in this notebook read
        its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    encode_options = dict(
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    return tokenizer.encode_plus(text, **encode_options)


class CLRDataset(Dataset):
    """Map-style dataset that tokenizes excerpts lazily, one item per access.

    Args:
        data: frame-like object with an ``excerpt`` column and, unless
            ``is_test`` is set, a ``target`` column.
        tokenizer: tokenizer passed through to ``convert_examples_to_features``.
        max_len: fixed sequence length of every encoded item.
        is_test: when True no targets are read and items carry no ``'label'``.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.excerpts = data.excerpt.tolist()
        if not is_test:
            self.targets = data.target.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Return a dict of tensors: ``input_ids``, ``attention_mask`` and, for labelled data, ``label``."""
        encoded = convert_examples_to_features(
            self.excerpts[item], self.tokenizer, self.max_len
        )
        sample = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

In [None]:
from transformers import AutoModel, AutoTokenizer
model_name = "bert-large-cased"

# Fetch the tokenizer that belongs to the chosen checkpoint and pickle it to disk.
tokenizer = AutoTokenizer.from_pretrained(model_name)
torch.save(tokenizer, 'tokenizer.pt')

# Both splits share the tokenizer and the fixed sequence length.
train_ds, val_ds = (
    CLRDataset(data=split_df, tokenizer=tokenizer, max_len=256)
    for split_df in (train_df, val_df)
)

In [None]:
# Load the pretrained encoder named by `model_name`; BERT_Arch wraps it with a regression head.
bert_model = AutoModel.from_pretrained(model_name)  

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 8

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_ds)
train_dataloader = DataLoader(dataset=train_ds, batch_size=batch_size, sampler=train_sampler)

# Validation batches follow dataset order.
val_sampler = SequentialSampler(val_ds)
val_dataloader = DataLoader(dataset=val_ds, batch_size=batch_size, sampler=val_sampler)



In [None]:
class BERT_Arch(nn.Module):
    """Encoder followed by a three-layer MLP head that outputs one value per input.

    Args:
        bert: encoder module. It is called as ``bert(sent_id, attention_mask=mask)``
            and the element at position 1 of its output is fed to the head.
            The head's input width is taken from ``bert.config.hidden_size``;
            when the encoder exposes no such attribute, 1024 is used.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        # Size the head from the encoder instead of assuming one specific checkpoint.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 1024)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, sent_id, mask):
        """Return a ``(batch, 1)`` tensor of predictions."""
        output = self.bert(sent_id, attention_mask=mask)
        # Index 1 works for both tuple outputs and indexable model-output objects.
        x = self.relu(self.fc1(output[1]))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
# transformers.AdamW is deprecated and missing from recent transformers releases;
# the PyTorch implementation is the supported replacement. Importing it under the
# same name keeps `AdamW` available to any later cell.
from torch.optim import AdamW

model = BERT_Arch(bert_model)
model = model.to(device)
# eps and weight_decay are pinned to the defaults of the former transformers.AdamW
# (1e-6 and 0.0) so the optimisation setup closely matches the previous one.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 15

# Mean-squared error; the training and evaluation code takes a square root to report RMSE.
criterion = nn.MSELoss()


In [None]:
# Show the GPU status (device, memory in use, processes) before training starts.
!nvidia-smi

In [None]:
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Works on the notebook-level ``model``, ``optimizer``, ``criterion`` and ``device``.

    Returns:
        float: mean of the per-batch RMSE losses.
    """
    model.train()
    batch_losses = []

    for batch in train_dataloader:
        sent_id = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        model.zero_grad()
        preds = model(sent_id, mask)
        # Give the labels a trailing axis so they line up with the model's one-column output.
        loss = torch.sqrt(criterion(preds, labels.unsqueeze(1)))
        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(train_dataloader)


def evaluate():
    """Compute the RMSE of ``model`` over all of ``val_dataloader``.

    Squared errors are pooled across batches before the square root is taken, so
    the result is the RMSE of the validation set itself and does not depend on the
    batch size or on a shorter final batch. (Averaging per-batch RMSE values, as
    was done before, systematically under-estimates it.)

    Works on the notebook-level ``model``, ``criterion`` and ``device``.

    Returns:
        float: validation RMSE.
    """
    model.eval()
    squared_error_sum = 0.0
    sample_count = 0

    with torch.no_grad():
        for batch in val_dataloader:
            sent_id = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            preds = model(sent_id, mask)
            batch_len = labels.size(0)
            # criterion is nn.MSELoss() with its default mean reduction, so
            # multiplying by the batch length recovers the summed squared error.
            squared_error_sum += criterion(preds, labels.unsqueeze(1)).item() * batch_len
            sample_count += batch_len

    return (squared_error_sum / sample_count) ** 0.5

In [None]:
best_valid_loss = float('inf')

train_losses, valid_losses = [], []
tb = SummaryWriter()

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()
    valid_loss = evaluate()

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Move the model to the CPU so the file holds CPU tensors, then put it back.
        torch.save(model.cpu(), 'bert_reg_best_model.pt')
        model.to(device)

        print('saving model with loss', valid_loss)

    # Log to TensorBoard and keep an in-memory history of both curves.
    for tag, value, history in (
        ("train loss", train_loss, train_losses),
        ("valid loss", valid_loss, valid_losses),
    ):
        tb.add_scalar(tag, value, epoch)
        history.append(value)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

tb.flush()
tb.close()

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage

gpu_usage()                             

import gc
del model
gc.collect()

torch.cuda.empty_cache()
gpu_usage()


In [None]:
# Reload the best checkpoint written during training so this cell does not rely on a
# `model` variable still being alive in the kernel.
# The file is a whole pickled module rather than a state_dict, which is why
# weights_only=False is required on current PyTorch; only load checkpoints you
# created yourself this way.
model = torch.load('bert_reg_best_model.pt', map_location=device, weights_only=False)
model.eval()

all_preds = []
with torch.no_grad():
    for batch in val_dataloader:
        # Each batch is a dict of tensors keyed by name, not a tuple.
        sent_id = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        preds = model(sent_id, mask)
        all_preds += preds.flatten().cpu().tolist()

print(len(all_preds))

In [None]:
result_df = pd.DataFrame(data={
    'id': val_df.id,
    'target': val_df.target,
    'preds': S
}).set_index('id')

result_df.to_csv('bert_preds.csv', index_label='id')
!head bert_preds.csv


In [None]:
# from torch.optim import Adam

# for name, param in model.bert.named_parameters():
#     param.requires_grad = False
    
# optimizer = AdamW(model.parameters(),
#                   lr = 2e-6)   
# epochs = 10


In [None]:
# model = torch.load('bert_reg_528.pt')
# model.cuda()
# evaluate()

In [None]:
# test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
# test_df.head()

# tokenizer = torch.load('tokenizer.pt')
# device = 'cpu'
# test_X = test_df['excerpt']
# tokens_test = tokenizer.batch_encode_plus(
#     test_X.tolist(),
#     max_length = 180,
#     pad_to_max_length=True,
#     truncation=True
# )

# test_seq = torch.tensor(tokens_test['input_ids']).to(device)
# test_mask = torch.tensor(tokens_test['attention_mask']).to(device)

# model = torch.load('bert_reg_539.pt')

# model.eval()
# with torch.no_grad():
#   preds = model(test_seq, test_mask)

# result_df = pd.DataFrame({
#     'id': test_df.id,
#     'target': preds.squeeze().tolist()})



# result_df.to_csv('submission.csv')
# result_df.head(10)