In [None]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
import matplotlib.pyplot as plt

import transformers
import random

import warnings
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Gradient scaler used by the mixed-precision training loop.
scaler = torch.cuda.amp.GradScaler()

# Echo the selected device as the cell output.
device

In [None]:
SEED = 20210520


def random_seed(seed=None):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed to apply. When omitted, the module-level
            ``SEED`` is used.
    """
    if seed is None:
        seed = SEED
    random.seed(seed)
    # Read by Python at interpreter start-up, so this only influences
    # processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


# The helper was defined but never invoked, so nothing was actually seeded.
random_seed()

# EDA

In [None]:
# Load the competition's train/test CSVs from the Kaggle input directory.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
# Peek at the first seven training rows.
train.head(7)

In [None]:
# Show the test frame.
test

In [None]:
# BERT tokenizer loaded from a local copy of the bert-base-uncased files.
tokenizer = transformers.BertTokenizer.from_pretrained('../input/huggingface-bert/bert-base-uncased/')

In [None]:
# First test excerpt, as raw text.
test['excerpt'].iloc[0]

In [None]:
# Number of token ids the tokenizer produces for every training excerpt.
sentence_length = [
    len(tokenizer.encode_plus(excerpt)['input_ids'])
    for excerpt in tqdm(train['excerpt'])
]

# Report the longest tokenised excerpt.
print('maxlength:', max(sentence_length))

In [None]:
# Tokenise the first training excerpt with the tokenizer's default settings.
sample_tokenize = tokenizer.encode_plus(train['excerpt'][0])
sample_tokenize

In [None]:
# Turn the ids back into text to inspect what the tokenizer produced.
tokenizer.decode(sample_tokenize['input_ids'])

In [None]:
# Encode the first excerpt padded/truncated to a fixed 314-token window.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
pad_sentence = tokenizer.encode_plus(
    train['excerpt'][0],
    add_special_tokens=True,
    max_length=314,
    padding='max_length',
    truncation=True,
)

In [None]:
# Decode the padded encoding back to text for inspection.
tokenizer.decode(pad_sentence['input_ids'])

# preprocess

In [None]:
# Order rows by target so that neighbouring rows have similar scores.
train_data = train.sort_values('target').reset_index(drop=True)
train_data

In [None]:
# Assign folds round-robin over the target-sorted rows (index modulo 5),
# so each of the five folds covers the whole target range.
train_data['kfold'] = train_data.index % 5
train_data

In [None]:
# Fold 0 is held out for validation; the other four folds form the training split.
p_train_data = train_data[train_data['kfold']!=0].reset_index(drop=True)
p_val_data = train_data[train_data['kfold']==0].reset_index(drop=True)

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset yielding tokenised excerpts with regression targets.

    Args:
        sentences: Sequence (list, array or pandas Series) of raw texts.
        targets: Sequence of numeric targets aligned with ``sentences``.
        max_length: Fixed token length every sample is padded/truncated to.
        tokenizer: Object exposing ``encode_plus``. When omitted, the
            notebook-level ``tokenizer`` is looked up on each access.
    """

    def __init__(self, sentences, targets, max_length=314, tokenizer=None):
        # Materialise as lists so lookups are positional even when a pandas
        # Series with a non-default index is passed in.
        self.sentences = list(sentences)
        self.targets = list(targets)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return a dict of ``ids``, ``mask``, ``token_type_ids`` and ``targets`` tensors."""
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        encoded = tok.encode_plus(
            self.sentences[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }

In [None]:
# Wrap the fold-0 train/validation splits as torch datasets.
train_dataset = BERTDataset(p_train_data['excerpt'], p_train_data['target'])
val_dataset = BERTDataset(p_val_data['excerpt'], p_val_data['target'])

In [None]:
# Inspect one encoded sample.
train_dataset[0]

In [None]:
# Batch sizes for this first pair of loaders (both are redefined before the real training run).
train_batch = 8
val_batch = 32

In [None]:
# Shuffle only the training loader; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True, num_workers=8, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=val_batch, shuffle=False, num_workers=8, pin_memory=True)

# Print a single collated batch and stop.
for a in train_dataloader:
    print(a)
    break

In [None]:
# BERT with a single-output head (num_labels=1); the notebook trains it with an RMSE loss.
model = transformers.BertForSequenceClassification.from_pretrained('../input/huggingface-bert/bert-base-uncased/', num_labels=1)
model.to(device)

In [None]:
# Smoke test: push one training batch through the model and keep its output.
for train_encode in train_dataloader:
    ids = train_encode['ids'].to(device)
    mask = train_encode['mask'].to(device)
    
    output = model(ids, mask)
    break
output

In [None]:
# Drop the trailing dimension of the logits and show the resulting values and shape.
print(output['logits'].squeeze(-1))
print(output['logits'].squeeze(-1).shape)

# Training

In [None]:
# NOTE(review): recent transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW — confirm against the installed version.
from transformers import AdamW
LR = 2e-5
# AdamW over all model parameters with weight decay 1e-2.
optimizer = AdamW(model.parameters(), LR, betas=(0.9, 0.999), weight_decay=1e-2)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 20

# The scheduler is stepped once per batch, so the schedule length is the number
# of batches per epoch times the number of epochs. len(dataloader) counts a
# partial final batch as a step, which int(len(data) / batch * epochs) did not.
train_steps = len(train_dataloader) * epochs
print(train_steps)

# Warm up linearly over the first 10% of the steps.
num_steps = int(train_steps*0.1)

scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

In [None]:
# Learning-rate schedule over the training steps (disabled)
# le = []
# train_dataloader = DataLoader(train_dataset,
#                               batch_size = train_batch,
#                               shuffle = True,
#                               num_workers = 8,
#                               pin_memory = True
#                              )

# for epoch in tqdm(range(epochs)):
#     for train_dl in train_dataloader:
#         le.append(scheduler.get_last_lr())
#         scheduler.step()
# x = np.arange(len(le))
# plt.plot(x, le)

In [None]:
def loss_fn(output, target):
    """Root-mean-squared error between ``output`` and ``target``."""
    mse = nn.functional.mse_loss(output, target)
    return mse.sqrt()

In [None]:
def training(train_dataloader, model, optimizer, scheluder):
    """Run one mixed-precision training epoch.

    Args:
        train_dataloader: Iterable of batches holding ``ids``, ``mask`` and
            ``targets`` tensors.
        model: Called as ``model(ids, mask)``; its output must expose
            ``['logits']``.
        optimizer: Optimizer stepped once per batch through the global
            ``scaler``.
        scheluder: Learning-rate scheduler stepped once per batch. The
            misspelled parameter name is kept so existing callers keep working.

    Returns:
        Tuple ``(mean of the per-batch RMSE losses, RMSE over every sample
        seen in the epoch)``.
    """
    model.train()
    torch.backends.cudnn.benchmark = True

    all_preds = []
    all_targets = []
    losses = []

    for train_dl in train_dataloader:
        optimizer.zero_grad()

        ids = train_dl['ids'].to(device, non_blocking=True)
        mask = train_dl['mask'].to(device, non_blocking=True)
        target = train_dl['targets'].to(device, non_blocking=True)

        with torch.cuda.amp.autocast():
            output = model(ids, mask)['logits'].squeeze(-1)
            loss = loss_fn(output, target)

        losses.append(loss.item())
        all_preds.append(output.detach().cpu().numpy())
        # Flatten instead of squeeze(-1): squeezing a batch that holds a single
        # sample yields a 0-d array, which np.concatenate rejects.
        all_targets.append(target.detach().reshape(-1).cpu().numpy())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Step the scheduler that was passed in; the previous code silently
        # stepped the global `scheduler` and ignored this argument.
        scheluder.step()

        del loss, ids, mask

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    losses = np.mean(losses)
    train_rme_loss = np.sqrt(mean_squared_error(all_targets, all_preds))

    return losses, train_rme_loss
    

In [None]:
def validating(val_dataloader, model):
    """Evaluate ``model`` on a validation loader without tracking gradients.

    Args:
        val_dataloader: Iterable of batches holding ``ids``, ``mask`` and
            ``targets`` tensors.
        model: Called as ``model(ids, mask)``; its output must expose
            ``['logits']``.

    Returns:
        Tuple ``(predictions as one 1-D array in loader order, mean of the
        per-batch RMSE losses, RMSE over every validation sample)``.
    """
    model.eval()

    all_preds = []
    all_targets = []
    losses = []

    with torch.no_grad():
        for val_dl in val_dataloader:
            ids = val_dl['ids'].to(device)
            mask = val_dl['mask'].to(device)
            target = val_dl['targets'].to(device)

            output = model(ids, mask)['logits'].squeeze(-1)
            loss = loss_fn(output, target)

            losses.append(loss.item())
            all_preds.append(output.detach().cpu().numpy())
            # Flatten instead of squeeze(-1): squeezing a batch that holds a
            # single sample yields a 0-d array, which np.concatenate rejects.
            all_targets.append(target.detach().reshape(-1).cpu().numpy())

            del loss, ids, mask

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    losses = np.mean(losses)

    val_rme_loss = np.sqrt(mean_squared_error(all_targets, all_preds))

    return all_preds, losses, val_rme_loss

In [None]:
# Show the raw training frame.
train

In [None]:
# Fold 0: hold out kfold == 0 for validation, train on the remaining folds.
p_train = train_data[train_data['kfold'] != 0].reset_index(drop=True)
p_val = train_data[train_data['kfold'] == 0].reset_index(drop=True)

train_batch = 16
val_batch = 32

train_dataset = BERTDataset(p_train['excerpt'], p_train['target'])
val_dataset = BERTDataset(p_val['excerpt'], p_val['target'])

train_dataloader = DataLoader(train_dataset,
                              batch_size=train_batch,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)

val_dataloader = DataLoader(val_dataset,
                            batch_size=val_batch,
                            shuffle=False,
                            num_workers=4,
                            pin_memory=True)

# Start from the pretrained weights with a single-output head.
model = transformers.BertForSequenceClassification.from_pretrained('../input/huggingface-bert/bert-base-uncased/', num_labels=1)
model.to(device)

learning_rate = 2e-5
optimizer = AdamW(model.parameters(), learning_rate, betas=(0.9, 0.999), weight_decay=1e-2)

# The scheduler is stepped once per batch, so size the schedule by the number of
# batches per epoch times the epochs. len(dataloader) counts a partial final
# batch as a step; int(len(data) / batch * epochs) ended the schedule early.
train_steps = len(train_dataloader) * epochs
# Warm up linearly over the first 10% of the steps.
num_steps = int(train_steps*0.1)

scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

In [None]:
# Per-epoch history for the fold-0 run.
train_losses, val_losses = [], []
train_scores, val_scores = [], []
best_score = None

for epoch in range(epochs):
    print(f'Epochs: {epoch+1}/{epochs}')

    train_loss, train_score = training(train_dataloader, model, optimizer, scheduler)
    train_losses.append(train_loss)
    train_scores.append(train_score)

    preds, val_loss, val_score = validating(val_dataloader, model)
    val_losses.append(val_loss)
    val_scores.append(val_score)

    print(f'train loss {train_loss}, train RME {train_score}')
    print(f'validation loss {val_loss}, validation RME {val_score}')

    # Checkpoint on the first epoch and whenever validation RMSE improves.
    is_first = best_score is None
    if is_first or best_score > val_score:
        best_score = val_score
        print('Save first model' if is_first else 'found better point')

        state = {
            'state_dict': model.state_dict(),
            'optimizer_dict': optimizer.state_dict(),
            'best_score': best_score
        }
        torch.save(state, 'model0.pth')

In [None]:
# Validation targets against `preds`, which holds the predictions of the most
# recent validating() call (the last epoch, not necessarily the saved checkpoint).
plt.scatter(p_val['target'], preds)

In [None]:
# Mean per-batch loss for each epoch: training, then validation.
x = np.arange(epochs)
plt.plot(x, train_losses)
plt.plot(x, val_losses)

In [None]:
# Epoch-level RMSE for each epoch: training, then validation.
x = np.arange(epochs)
plt.plot(x, train_scores)
plt.plot(x, val_scores)

In [None]:
# Per-fold best validation scores, starting with the fold-0 result.
best_scores = [best_score]

In [None]:
# Train and checkpoint folds 1-4 (fold 0 is trained by the preceding cells).
for k in range(1, 5):
    p_train = train_data[train_data['kfold'] != k].reset_index(drop=True)
    p_val = train_data[train_data['kfold'] == k].reset_index(drop=True)

    train_dataset = BERTDataset(p_train['excerpt'], p_train['target'])
    val_dataset = BERTDataset(p_val['excerpt'], p_val['target'])

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_batch,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)

    val_dataloader = DataLoader(val_dataset,
                                batch_size=val_batch,
                                shuffle=False,
                                num_workers=4,
                                pin_memory=True)

    # Every fold starts again from the pretrained weights.
    model = transformers.BertForSequenceClassification.from_pretrained('../input/huggingface-bert/bert-base-uncased/', num_labels=1)
    model.to(device)

    learning_rate = 2e-5
    optimizer = AdamW(model.parameters(), learning_rate, betas=(0.9, 0.999), weight_decay=1e-2)

    # The scheduler is stepped once per batch, so size the schedule by the number
    # of batches per epoch times the epochs. len(dataloader) counts a partial
    # final batch as a step; int(len(data) / batch * epochs) ended it early.
    train_steps = len(train_dataloader) * epochs
    # Warm up linearly over the first 10% of the steps.
    num_steps = int(train_steps*0.1)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

    train_losses = []
    val_losses = []
    best_score = None

    train_scores = []
    val_scores = []

    for epoch in range(epochs):
        print(f'Epochs: {epoch+1}/{epochs}')

        train_loss, train_score = training(train_dataloader, model, optimizer, scheduler)
        train_losses.append(train_loss)
        train_scores.append(train_score)

        preds, val_loss, val_score = validating(val_dataloader, model)
        val_losses.append(val_loss)
        val_scores.append(val_score)

        print(f'train loss {train_loss}, train RME {train_score}')
        print(f'validation loss {val_loss}, validation RME {val_score}')

        # Checkpoint on the first epoch and whenever validation RMSE improves;
        # the two cases previously duplicated the whole save block.
        is_first = best_score is None
        if is_first or best_score > val_score:
            best_score = val_score
            print('Save first model' if is_first else 'found better point')

            state = {
                'state_dict': model.state_dict(),
                'optimizer_dict': optimizer.state_dict(),
                'best_score': best_score
            }
            torch.save(state, f'model{k}.pth')

    best_scores.append(best_score)

In [None]:
# Best validation RMSE recorded for each fold.
best_scores

In [None]:
# Cross-validation score: mean of the per-fold best validation RMSEs.
print(f'score: {np.mean(best_scores)}')

# inference

In [None]:
import gc
# Drop the references to the training-time objects, then force a garbage-collection pass.
del train_dataset, val_dataset, train_dataloader, val_dataloader, model, optimizer, scheduler
_ = gc.collect()

In [None]:
class BERTinfDataset(Dataset):
    """Map-style dataset yielding tokenised excerpts for inference (no targets).

    Args:
        sentences: Sequence (list, array or pandas Series) of raw texts.
        max_length: Fixed token length every sample is padded/truncated to.
        tokenizer: Object exposing ``encode_plus``. When omitted, the
            notebook-level ``tokenizer`` is looked up on each access.
    """

    def __init__(self, sentences, max_length=314, tokenizer=None):
        # Materialise as a list so lookups are positional even when a pandas
        # Series with a non-default index is passed in.
        self.sentences = list(sentences)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return a dict of ``ids``, ``mask`` and ``token_type_ids`` tensors."""
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        encoded = tok.encode_plus(
            self.sentences[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }

In [None]:
# Inference dataset over the test excerpts (no targets).
test_dataset = BERTinfDataset(test['excerpt'])
test_batch = 32

# Keep dataset order so predictions line up with the rows of `test`.
test_dataloader = DataLoader(test_dataset,
                            batch_size = test_batch,
                            shuffle = False,
                            num_workers = 4,
                            pin_memory = True)

# Model skeleton; predicting() overwrites its weights from each checkpoint.
model = transformers.BertForSequenceClassification.from_pretrained('../input/huggingface-bert/bert-base-uncased/', num_labels=1)

In [None]:
# Checkpoints in the working directory. Match the extension exactly (a substring
# test also picks up names such as 'x.pth.bak') and sort so the ensemble order
# does not depend on the arbitrary order returned by os.listdir.
pthes = sorted(os.path.join('./', s) for s in os.listdir('./') if s.endswith('.pth'))
pthes

In [None]:
def predicting(test_dataloader, model, pthes):
    """Predict with every checkpoint in ``pthes`` and collect the results.

    Args:
        test_dataloader: Iterable of batches holding ``ids`` and ``mask`` tensors.
        model: Model whose weights are replaced by each checkpoint's
            ``'state_dict'`` entry before it is run.
        pthes: Iterable of checkpoint file paths readable by ``torch.load``.

    Returns:
        List with one 1-D numpy array of predictions per checkpoint, in the
        order of ``pthes``.
    """
    all_preds = []

    for pth in pthes:
        # Load onto the CPU first so checkpoints saved from a GPU run can also
        # be restored on a CPU-only machine; the model is moved to `device` below.
        state = torch.load(pth, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        model.to(device)
        model.eval()

        preds = []

        with torch.no_grad():
            for test_dl in test_dataloader:
                ids = test_dl['ids'].to(device)
                mask = test_dl['mask'].to(device)

                output = model(ids, mask)['logits'].squeeze(-1)
                preds.append(output.cpu().numpy())

        all_preds.append(np.concatenate(preds))

    return all_preds

In [None]:
# One prediction array per checkpoint; transpose so rows are test samples and
# columns are checkpoints.
all_preds = predicting(test_dataloader, model, pthes)
df = pd.DataFrame(all_preds).T
df

In [None]:
# Ensemble by averaging the checkpoints' predictions for each sample.
mean_submission = df.mean(axis=1)
mean_submission

In [None]:
# Fill the sample submission's target column with the averaged predictions.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
submit_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submit_df['target'] = mean_submission
submit_df

In [None]:
# Write the submission file without the index column.
submit_df.to_csv('submission.csv', index=False)