```
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
```

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm

## Define DataLoader

In [None]:
def load_data(data_filename, mode='train', type='numpy'):
    """Read a competition CSV file and return its columns.

    Args:
        data_filename: Path of the CSV file to read.
        mode: When equal to 'train', the `target` and `standard_error`
            columns are returned as well; for any other value both slots
            are `None`.
        type: Output container. 'pandas' returns the columns as they come out
            of the DataFrame; 'numpy' converts every present column with
            `np.array`. (The name shadows the builtin but is kept so existing
            keyword callers keep working.)

    Returns:
        A 6-tuple `(ids, urls, licenses, excerpts, targets, standard_errors)`.

    Raises:
        AssertionError: If `type` is neither 'pandas' nor 'numpy'.
    """
    # Validate the cheap argument first so a typo fails before any file I/O.
    if type not in ('pandas', 'numpy'):
        raise AssertionError('Unknown type [%s]' % type)

    pd_table = pd.read_csv(data_filename)

    column_names = ['id', 'url_legal', 'license', 'excerpt']
    if mode == 'train':
        column_names += ['target', 'standard_error']
    columns = [pd_table[name] for name in column_names]
    # Outside train mode the two label slots are filled with None.
    columns += [None] * (6 - len(columns))

    if type == 'pandas':
        return tuple(columns)

    # Return a real tuple (not a one-shot `map` iterator) so the result can be
    # indexed or unpacked more than once, and leave absent columns as None
    # instead of wrapping None in a 0-d object array.
    return tuple(None if column is None else np.array(column) for column in columns)

In [None]:
class CRPDataset(torch.utils.data.Dataset):
    """Dataset over one split of the competition CSV files.

    Each item is a dict. In 'train' mode it holds the tokenized excerpt
    (padded and truncated to `max_length`), the target and its standard error.
    In any other mode it holds the row id and the tokenized excerpt
    (truncated to `max_length`, left unpadded).

    Args:
        mode: Split name; the file read is `<DATA_ROOT_DIR>/<mode>.csv`.
        tokenizer: Callable invoked as `tokenizer(text, **kwargs)` whose
            result is subscripted with `'input_ids'`. The default identity
            function does not accept those keyword arguments, so a real
            tokenizer has to be supplied before items can be fetched.
        max_length: Maximum number of token ids kept per excerpt.
    """

    def __init__(self, mode, tokenizer=lambda x: x, max_length=512):
        ids, _urls, _licenses, excerpts, targets, standard_errors = load_data(
            os.path.join(DATA_ROOT_DIR, "%s.csv" % mode), mode=mode)
        self.ids = ids
        self.excerpts, self.targets, self.standard_errors = excerpts, targets, standard_errors
        self.mode = mode
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return the sample dict for row `index`."""
        if self.mode == 'train':
            return {'excerpt': self.tokenizer(self.excerpts[index],
                                              padding="max_length",
                                              truncation=True,
                                              max_length=self.max_length)['input_ids'],  # keep only 'input_ids'
                    'target': self.targets[index],
                    'standard_error': self.standard_errors[index]}

        # Truncate here as well, matching the train branch: an excerpt that
        # tokenizes to more ids than the encoder accepts would otherwise fail
        # inside the model. Padding is left to the collate function.
        return {'id': self.ids[index],
                'excerpt': self.tokenizer(self.excerpts[index],
                                          truncation=True,
                                          max_length=self.max_length)['input_ids']}

    def __len__(self):
        """Number of rows in the split."""
        return len(self.ids)

In [None]:
def collate_fn(batch):
    """Merge a list of sample dicts into `(ids, tokens)`.

    `ids` is the list of each sample's 'id' value. `tokens` is a long tensor
    with one row per sample, built from each sample's 'excerpt' token ids and
    right-padded by `pad_sequence` to the longest sequence in the batch.
    """
    ids = [sample['id'] for sample in batch]

    sequences = []
    for sample in batch:
        sequences.append(torch.tensor(sample['excerpt'], dtype=torch.long))

    tokens = pad_sequence(sequences, batch_first=True)
    return ids, tokens

## Define Network

In [None]:
class Baseline_BERT(nn.Module):
    """BERT encoder followed by a single linear regression head.

    Args:
        freeze_bert: When true, gradients are disabled for every encoder
            parameter so only the regression head is trainable.
    """

    def __init__(self, freeze_bert=True):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_DIR)
        if freeze_bert:
            self.bert.requires_grad_(False)
        # Size the head from the loaded encoder instead of hard-coding 768, so
        # the class also works with checkpoints of a different hidden width.
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, tokens, attention_mask=None):
        """Predict one scalar per input sequence.

        Args:
            tokens: Token-id tensor passed to the encoder as `input_ids`.
            attention_mask: Optional mask forwarded to the encoder unchanged.
                The default `None` keeps the previous behaviour, where no
                mask is supplied.

        Returns:
            The regression head applied to the encoder's `pooler_output`.
        """
        bert_output = self.bert(input_ids=tokens,
                                attention_mask=attention_mask,
                                return_dict=True)
        pooled = bert_output['pooler_output']
        return self.regressor(pooled)

## Pipeline

In [None]:
# Directory holding the competition CSV files (`<mode>.csv` is read from here).
DATA_ROOT_DIR = '/kaggle/input/commonlitreadabilityprize'
# Local directory the tokenizer and encoder weights are loaded from.
# NOTE(review): the path names a Chinese BERT checkpoint while the data
# directory is the CommonLit readability set — presumably English text;
# confirm this pairing is intended.
BERT_DIR = '/kaggle/input/huggingface-bert/bert-base-chinese'
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of samples per dataloader batch.
BATCH_SIZE = 1

In [None]:
# Input: tokenizer from the local checkpoint, the test split wrapped as a
# dataset, and a dataloader that walks it in file order.
tokenizer = BertTokenizer.from_pretrained(BERT_DIR)
test_dataset = CRPDataset(mode='test', tokenizer=tokenizer)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Build the network and restore the trained weights.
model = Baseline_BERT(freeze_bert=True)
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
# from a GPU run still loads when this session falls back to the CPU.
model.load_state_dict(torch.load('/kaggle/input/models/baseline_bert.pt',
                                 map_location=DEVICE))
model.to(DEVICE)
# Inference only: switch off dropout and other train-time behaviour.
model.eval()
print('model loaded.')

In [None]:
# Run the model over the test set, collecting one id and one prediction per
# excerpt as flat lists. The submission file is written by the next section.
total_ids, total_outs = [], []
with torch.no_grad():
    # Wrap the dataloader itself (not an `enumerate` of it) so tqdm can read
    # its length and show a proper total.
    for ids, inputs in tqdm(test_dataloader):
        inputs = inputs.to(DEVICE)
        output = model(tokens=inputs)

        # Drop only the trailing size-1 regression dimension. A bare
        # `squeeze()` would also collapse the batch dimension when the batch
        # holds a single sample, turning the result into a plain float.
        outs = output.squeeze(-1).cpu().numpy().tolist()

        # `extend` keeps both lists flat for any batch size, including a
        # smaller final batch.
        total_ids.extend(ids)
        total_outs.extend(outs)

## Dump prediction

In [None]:
import numpy as np
import pandas as pd
# Read the same file the dataset was built from, so rows line up with the
# predictions regardless of the notebook's working directory.
test = pd.read_csv(os.path.join(DATA_ROOT_DIR, "test.csv"))
# test = np.array(test)

# Flatten the collected outputs into one float per row. Entries may be bare
# floats or per-batch lists, so normalise each to 1-D before joining.
if total_outs:
    predictions = np.concatenate(
        [np.atleast_1d(np.asarray(out, dtype=np.float64)) for out in total_outs])
else:
    predictions = np.empty(0, dtype=np.float64)

# Fail loudly instead of writing a misaligned submission.
if len(predictions) != len(test):
    raise ValueError('Got %d predictions for %d test rows'
                     % (len(predictions), len(test)))

test['target'] = predictions
s = pd.DataFrame({'id': test['id'], 'target': test['target']})
s.to_csv("./submission.csv", encoding='utf-8', index=False)

# total_ids, total_outs = np.array(total_ids), np.array(total_outs)
# total_ids = total_ids[:, np.newaxis]
# total_outs = total_outs[:, np.newaxis]
# prediction = np.concatenate((total_ids, total_outs), axis = 1)
# colname = ['id', 'target']
# prediction = pd.DataFrame(data=prediction, columns = colname)
# prediction.to_csv("./submission.csv", encoding='utf-8', index=False, float_format='%.20f')