This is a baseline BERT model for the CommonLit Readability Prize competition. The implementation is based on [BERT beginner](https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room/notebook).

# Table of Contents
1. [Preparation](#preparation)
2. [K-fold](#k-fold)
3. [Model and Training](#model-and-training)
4. [Testing](#testing)
5. [Scoring](#scoring)

# 1. Preparation
**Import all dependencies**

We import all dependencies here and define the model path.

In [None]:
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import transformers
from transformers import AdamW

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Path of the pretrained BERT checkpoint; it is passed to every
# `from_pretrained` call in this notebook (tokenizer and models).
MODEL_PATH = '../input/huggingface-bert/bert-base-uncased'

**Loading training data**

In [None]:
# Read the competition training table into a DataFrame.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
# Bare expression: the notebook renders the DataFrame as the cell output.
train

We only use the "excerpt" and "target" columns to train the model, so we extract just these two columns from the raw data.

In [None]:
# Select the two columns by name rather than by position, so the selection
# keeps working if the column order of the CSV ever differs.
train_data = train["excerpt"]
train_target = train["target"]
# Side-by-side view of the text and its target, for visualization.
train_set = pd.concat([train_data, train_target], axis=1)
train_set

We can partition the training data into a train set and a held-out test set, and use the held-out set to check performance while developing the model. Note that this held-out set is not the data in "test.csv".

In [None]:
num_data = len(train_set)
# Row mask: True rows go to the train split, False rows to the held-out split.
# Uniform draws lie in [0, 1), so a threshold of 1 keeps every row for
# training; lower the threshold to hold some rows out.
msk = np.random.rand(num_data) < 1
held_out = ~msk
training, training_target = train_data[msk], train_target[msk]
testing, testing_target = train_data[held_out], train_target[held_out]
train_sample = pd.concat([training, training_target], axis=1)
test_sample = pd.concat([testing, testing_target], axis=1)
train_sample, test_sample

# 2. K-fold

We implement k-fold cross-validation with 5 folds. In each iteration, one fold is used as the validation data and the remaining folds as the training data.

In [None]:
# Replace the DataFrame with its underlying array so rows can be selected
# with the integer index arrays produced by KFold.split (column 0 is the
# excerpt, column 1 the target, following the concat order above).
train_sample = train_sample.values

In [None]:
# Partition the data into 5 folds. The fixed integer seed makes every call to
# kf.split() return the same partition, so the sizes checked below describe
# exactly the folds used later for training, and runs are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Check the data partition: print train/validation sizes for each fold.
for train_index, valid_index in kf.split(train_sample):
    print(len(train_index))
    print(len(valid_index))

Then we will tokenize the data and make a dataloader for mini-batch training.

In [None]:
# BERT tokenizer loaded from the same checkpoint directory as the model.
# It is a module-level global that BERTDataSet.__getitem__ reads.
tokenizer = transformers.BertTokenizer.from_pretrained(MODEL_PATH)

Define a dataset

In [None]:
class BERTDataSet(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        excerpts: Indexable sequence of raw text passages.
        targets: Indexable sequence of numeric targets, aligned with
            ``excerpts`` by position.
        max_length: Length every encoded item is padded or truncated to.
            Defaults to 314, the value that used to be hard-coded.
        tokenizer: Optional object exposing ``encode_plus``. When ``None``,
            the module-level ``tokenizer`` is looked up at item-access time,
            which is the original behavior.

    Raises:
        ValueError: If ``excerpts`` and ``targets`` differ in length.
    """

    def __init__(self, excerpts, targets, max_length=314, tokenizer=None):
        if len(excerpts) != len(targets):
            raise ValueError(
                "excerpts and targets must have the same length, got "
                f"{len(excerpts)} and {len(targets)}"
            )
        self.excerpts = excerpts
        self.targets = targets
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.excerpts)

    def __getitem__(self, idx):
        """Return the encoded excerpt and its target as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (long tensors built from
        the tokenizer output) and ``targets`` (a float scalar tensor).
        """
        # Fall back to the notebook-level tokenizer when none was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        encoded = active_tokenizer.encode_plus(
            self.excerpts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # fixed length so default collation can stack items
            return_attention_mask=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }

# 3. Model and Training

Define the optimizer and the model

In [None]:
# Learning rate handed to AdamW when each fold's optimizer is created.
LR=2e-5
# The lines below are an earlier single-model setup kept for reference; the
# model and optimizer are now created inside the k-fold training loop.
# model = transformers.BertForSequenceClassification.from_pretrained(MODEL_PATH,num_labels=1)
# model.to(device)
# optimizer = AdamW(model.parameters(), LR,betas=(0.9, 0.999), weight_decay=1e-2)
# model_original_stat_dict = model.state_dict() # store the original weight matrix

In [None]:
# Mini-batch size used by the train/validation DataLoaders, and the number of
# training epochs run for every fold.
batchSize = 16
epochs = 20

Now, we can implement the training phase.

In [None]:
# Per-fold results: All_train_losses[i] holds one training RMSE per epoch for
# fold i; validate_losses[i] is the best validation RMSE reached in fold i.
All_train_losses = []
validate_losses = []
# NOTE(review): the scaler is used without torch.cuda.amp.autocast, so it only
# scales/unscales the loss; confirm whether mixed precision was intended.
scaler = torch.cuda.amp.GradScaler()
criterion = nn.MSELoss()
fold = 0
for train_index, valid_index in kf.split(train_sample):
    # Every fold starts from the pretrained weights with a fresh optimizer.
    model = transformers.BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=1)
    optimizer = AdamW(model.parameters(), LR, betas=(0.9, 0.999), weight_decay=1e-2)
    model.to(device)

    # Column 0 holds the excerpt text, column 1 the target.
    train_rows = train_sample[train_index]
    valid_rows = train_sample[valid_index]
    train_input = BERTDataSet(train_rows[:, 0], train_rows[:, 1])
    valid_input = BERTDataSet(valid_rows[:, 0], valid_rows[:, 1])
    train_dataloader = DataLoader(train_input, batch_size=batchSize, shuffle=True,
                                  num_workers=4, pin_memory=True)
    # Validation only computes an RMSE over the whole fold, so order is irrelevant.
    valid_dataloader = DataLoader(valid_input, batch_size=batchSize, shuffle=False,
                                  num_workers=4, pin_memory=True)

    train_losses = []
    bestScore = None
    for epoch in tqdm(range(epochs)):
        # ---- train phase ----
        model.train()
        batch_pred = []
        batch_target = []
        for batch in train_dataloader:
            optimizer.zero_grad()
            ids = batch["ids"].to(device, non_blocking=True)
            mask = batch["mask"].to(device, non_blocking=True)
            target = batch["targets"].to(device, non_blocking=True)

            # num_labels=1 gives one logit per sample; drop the trailing axis
            # so predictions line up with the 1-D target tensor.
            output = model(ids, attention_mask=mask)["logits"].squeeze(-1)
            loss = criterion(output, target)

            batch_pred.extend(output.detach().cpu().numpy())
            batch_target.extend(target.detach().cpu().numpy())

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        # Epoch-level training RMSE over all predictions made this epoch.
        train_losses.append(np.sqrt(mean_squared_error(batch_pred, batch_target)))

        # ---- validate phase ----
        model.eval()
        valid_pred = []
        valid_targets = []
        with torch.no_grad():
            for valid_batch in valid_dataloader:
                valid_ids = valid_batch["ids"].to(device, non_blocking=True)
                valid_mask = valid_batch["mask"].to(device, non_blocking=True)
                valid_target = valid_batch["targets"].to(device, non_blocking=True)

                valid_output = model(valid_ids, attention_mask=valid_mask)["logits"].squeeze(-1)

                valid_pred.extend(valid_output.detach().cpu().numpy())
                valid_targets.extend(valid_target.detach().cpu().numpy())

        valid_score = np.sqrt(mean_squared_error(valid_pred, valid_targets))
        # Checkpoint only when this epoch is the best so far for the fold.
        # bestScore must be updated here; otherwise every later epoch is
        # compared against the first epoch and a worse model can overwrite a
        # better checkpoint.
        if bestScore is None or valid_score < bestScore:
            bestScore = valid_score
            state = {
                'state_dict': model.state_dict(),
                'optimizer_dict': optimizer.state_dict(),
                # Plain float keeps the checkpoint free of numpy objects.
                "bestscore": float(valid_score),
            }
            torch.save(state, "model" + str(fold) + ".pth")

    validate_losses.append(bestScore)
    All_train_losses.append(train_losses)

    # Report this fold's own numbers (last-epoch train RMSE, best validation
    # RMSE) with a 1-based fold counter.
    print('Fold [%d/%d] Train Loss: %.4f  Validate Loss: %.4f'
          % (fold + 1, kf.get_n_splits(), train_losses[-1], bestScore))
    fold += 1


We can show the average validation score

In [None]:
# Average of the per-fold validation scores collected by the training loop.
print(np.mean(validate_losses))

# 4. Testing
If we partition train data into train set and test set, we can use the test set to check the performance.

In [None]:
# Build a model instance from the pretrained checkpoint; the scoring cell
# below loads each saved fold's state_dict into this object.
model = transformers.BertForSequenceClassification.from_pretrained(MODEL_PATH,num_labels=1)

In [None]:
# test_sample = test_sample.values
# test_input = BERTDataSet(test_sample[:,0], test_sample[:,1])
# test_dataloader = DataLoader(test_input,batch_size=int(batchSize),shuffle=True,num_workers=4,pin_memory=True)

# test_scores = []

# for i in range(5):
#     model_weight = model_matrix[i]
#     fold = i + 1
#     model.load_state_dict(model_weight)
#     model.to(device)
#     model.eval()
#     test_pred = []
#     test_targets = []
#     with torch.no_grad():
#         for test_step, test in enumerate(test_dataloader):
#             test_ids = test["ids"].to(device,non_blocking=True)
#             test_mask = test["mask"].to(device,non_blocking=True)
#             test_output = model(test_ids, test_mask)
#             test_output = test_output["logits"].squeeze(-1)
#             test_target = test["targets"].to(device,non_blocking=True)
#             test_pred += list(test_output.detach().cpu().numpy())
#             test_targets += list(test_target.detach().cpu().numpy())

#     test_scores.append(np.sqrt(mean_squared_error(test_pred,test_targets)))
#     print(np.sqrt(mean_squared_error(test_pred,test_targets)))
# print(np.mean(test_scores))

# 5. Scoring
Use the test data in "test.csv" to get the submission file.

In [None]:
# One prediction array per saved checkpoint, in sorted checkpoint order.
All_test_preds = []
# Match only real checkpoint files and sort them so the order is deterministic.
pathes = sorted(os.path.join("./", s) for s in os.listdir("./") if s.endswith(".pth"))
if not pathes:
    raise FileNotFoundError("no .pth checkpoint found in ./ - run the training cell first")

test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test_input = test_df["excerpt"]
test_input = test_input.values
# BERTDataSet requires targets; zeros are placeholders and are never read here.
test_target = np.zeros(len(test_input))
test_input = BERTDataSet(test_input, test_target)
# The loader does not depend on the checkpoint, so build it once.
# shuffle=False keeps predictions in the row order of test.csv.
testloader = DataLoader(test_input, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

for path in pathes:
    # map_location lets checkpoints saved on a GPU load on a CPU-only session.
    state = torch.load(path, map_location=device)
    model.load_state_dict(state["state_dict"])
    model.to(device)
    model.eval()

    test_preds = []
    with torch.no_grad():
        for test_batch in testloader:
            test_ids = test_batch["ids"].to(device)
            test_mask = test_batch["mask"].to(device)
            test_output = model(test_ids, attention_mask=test_mask)["logits"].squeeze(-1)
            test_preds.append(test_output.detach().cpu().numpy())
    # Join the per-batch arrays into one prediction vector for this checkpoint.
    test_preds = np.concatenate(test_preds)
    All_test_preds.append(test_preds)

In [None]:
# Display the collected predictions (one array per loaded checkpoint).
All_test_preds

In [None]:
# Build a table with one row per checkpoint, then transpose it so each row is
# a test sample and each column holds one checkpoint's predictions.
scoring = pd.DataFrame(All_test_preds).T

In [None]:
# Ensemble by averaging across the columns (one column per checkpoint),
# giving a single prediction per row.
aver_score = scoring.mean(axis=1)
aver_score

In [None]:
# Load the submission template; its "target" column is overwritten below.
sample = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
sample

In [None]:
# Write the averaged predictions into the template. pandas aligns the
# assignment on the index.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv - confirm, or match on the id column instead.
sample["target"] = aver_score
sample

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("submission.csv",index = False)