# About this notebook
这个notebook更多的是对比赛和Bert模型的**熟悉、学习**，在目前阶段更多的是参考别人写的代码，熟悉整个比赛和建模流程，后续可以参考其他paper或者其他技巧提升模型的精度和性能

## Idea:
The main idea: use a pretrained Hugging Face BERT model both as the tokenizer and as the regression model.
## Goal: 
| 此阶段只追求完成，start small, step by step建立模型
- [x] 简易EDA
- [x] 建立baseline model
- [x] Submit

# Reference:
- [BERT Beginner](https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room/notebook)
- [Lightweight RoBERTa](https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch/notebook#Dataset)

# 0.Setup

In [None]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import random

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import warnings

warnings.simplefilter('ignore')

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

In [None]:
SEED = 824


def random_seed(SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus the CUDA seeding
    calls), stores the seed in the ``PYTHONHASHSEED`` environment variable and
    sets the cuDNN ``deterministic`` flag.

    Args:
        SEED: integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # All of these take the seed as their only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

    torch.backends.cudnn.deterministic = True


random_seed(SEED)

In [None]:
# Hyper-parameters
NUM_FOLDS = 5    # number of cross-validation folds
NUM_EPOCHS = 20  # passes over the training split per fold
MAX_LEN = 314    # token length every excerpt is padded / truncated to
BATCH_SIZE = 16
LR = 1e-5

# Competition data files and the local copy of the pretrained BERT checkpoint.
FILE_PATH = {
    'train': '../input/commonlitreadabilityprize/train.csv',
    'test': '../input/commonlitreadabilityprize/test.csv',
    'submit': '../input/commonlitreadabilityprize/sample_submission.csv'
}
TOKEN_PATH = '../input/huggingface-bert/bert-base-cased'

# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only machine.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using ',DEVICE)

# 1.Simple EDA of data

In [None]:
# Columns -- train: id, url_legal, license, excerpt, target, std_error
#            test:  id, url_legal, license, excerpt
train_df, test_df = (pd.read_csv(FILE_PATH[split]) for split in ('train', 'test'))

# Number of distinct values per training column, then a preview of the rows.
print(train_df.nunique(), '\n')
train_df.head()

In [None]:
tokenizer = transformers.BertTokenizer.from_pretrained(TOKEN_PATH)
sample_excerpt = train_df['excerpt'].iloc[1]

# Encode one excerpt to inspect the tokenizer output; every sequence is padded
# or truncated to exactly MAX_LEN tokens.
sample_token = tokenizer.encode_plus(
    sample_excerpt,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,
)  # decode with tokenizer.decode(sample_token['input_ids'])
#sample_token

The output of tokenizer:
- input_ids: token ids. Special tokens: 101 = [CLS] (start of the sequence), 102 = [SEP] (end of a sentence)
    - 102 [SEP]: separates sentences, so it also marks where the next sentence starts
- token_type_ids: segment ids that mark which sentence each token belongs to. Here every sample is a single excerpt fed to a regression model, so they are all 0. For sentence-pair inputs, the two sentences are joined with [SEP] in the middle and the tokens of the second sentence get id 1.
    - token_type_ids 可选。就是 token 对应的句子id，值为0或1（0表示对应的token属于第一句，1表示属于第二句）。形状为(batch_size, sequence_length)。
- attention_mask: 0 where the token is [PAD], 1 otherwise, so that the model ignores padding positions

# 3. Design model
## 3.1 Divide data into KFold
### Reference:
- [Create Folds](https://www.kaggle.com/abhishek/step-1-create-folds)
- [Lightweight Roberta](https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch/notebook#Model)
- [BERT beginner](https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room/notebook#2.-BERT:-Deepen-your-understanding-of-Tokenizer)

In [None]:
def create_folds(df, num_folds, random_state=None):
    """Assign every row to a stratified cross-validation fold.

    The continuous ``target`` column is binned so that ``StratifiedKFold``
    can balance the target distribution across folds.

    Args:
        df: DataFrame with a numeric ``target`` column. It is not modified.
        num_folds: number of folds to create.
        random_state: optional seed for both the row shuffle and the fold
            split. With ``None`` (the default) NumPy's global random state is
            used.

    Returns:
        A shuffled copy of ``df`` with a fresh 0..n-1 index and an extra
        integer ``fold`` column holding the validation fold of each row.
    """
    # sample() returns a new frame, so the caller's DataFrame stays untouched;
    # drop=True avoids keeping the old index as an extra column.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    df['fold'] = -1

    # Sturges' rule: 1 + log2(n) bins.
    num_bins = int(1 + np.log2(len(df)))
    bins = pd.cut(df['target'], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=bins.values)):
        df.loc[val_idx, 'fold'] = fold

    return df


train_df = create_folds(train_df, NUM_FOLDS, random_state=SEED)
print(train_df.fold.value_counts())
train_df.head(8)

## 3.2 setup dataset and dataLoader

In [None]:
class MyDataset(Dataset):
    """Dataset that tokenizes excerpts on the fly for BERT.

    Relies on the notebook-level ``tokenizer`` and ``MAX_LEN``.

    Args:
        sentences: sequence of raw text excerpts.
        targets: sequence of regression targets aligned with ``sentences``;
            ignored when ``is_train`` is False.
        is_train: when True each item also carries its target.
    """

    def __init__(self, sentences, targets, is_train=True):
        # Plain lists give positional indexing; a pandas Series would look
        # ``index`` up by label and fail on any non-default index.
        self.sentences = list(sentences)
        self.targets = list(targets) if is_train else None
        self.is_train = is_train

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        """Return the tensors of sample ``index``.

        Returns:
            dict with ``'ids'`` and ``'mask'`` (``torch.long``) and, when
            ``is_train`` is set, ``'targets'`` (``torch.float``).
        """
        token_s = tokenizer.encode_plus(
            self.sentences[index],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            truncation=True,
        )
        item = {
            'ids': torch.tensor(token_s['input_ids'], dtype=torch.long),
            'mask': torch.tensor(token_s['attention_mask'], dtype=torch.long),
        }
        if self.is_train:
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

# 4. Build BERT Model

In [None]:
import copy

# BERT with a single-output head, used here as a regressor.
model = transformers.BertForSequenceClassification.from_pretrained(TOKEN_PATH, num_labels=1)
model.to(DEVICE)
# state_dict() hands back references to the live parameter tensors, so the
# snapshot must be deep-copied; otherwise it changes as the model trains and
# reloading it later restores nothing.
model_original_stat_dict = copy.deepcopy(model.state_dict())

# 5. Train Function
**Load dataset**
> Note: currently only using one fold for testing
- val: fold 0, train: remaining folds

**Input Variables of train_function**
- dataLoader
- model
- optimizer
- scheduler

In [None]:
import copy
from collections import defaultdict


def train(model, dataloaders, optimizer, scheduler):
    """Train ``model`` for ``NUM_EPOCHS`` and keep the best validation weights.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase. The
    loss is the RMSE between the model's logits and the targets.

    Args:
        model: model whose output exposes ``['logits']``.
        dataloaders: dict with ``'train'`` and ``'val'`` loaders that yield
            batches holding ``'ids'``, ``'mask'`` and ``'targets'``.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler, advanced once per training batch.

    Returns:
        ``(best_model_wts, best_score, [scores, losses])`` where
        ``best_model_wts`` is a copy of the state dict from the epoch with the
        lowest validation RMSE, and ``scores`` / ``losses`` map each phase to
        its list of per-epoch values.
    """
    scores = defaultdict(list)
    losses = defaultdict(list)
    best_model_wts = None
    best_score = float('inf')
    criterion = nn.MSELoss()

    for _ in tqdm(range(NUM_EPOCHS)):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)

            preds, targets, epoch_losses = [], [], []  # store info at each epoch

            for data in dataloaders[phase]:
                ids = data['ids'].to(DEVICE)
                mask = data['mask'].to(DEVICE)
                target = data['targets'].to(DEVICE)

                # No autograd graph is needed while validating.
                with torch.set_grad_enabled(is_train):
                    output = model(ids, mask)
                    output = output['logits'].squeeze(-1)
                    loss = torch.sqrt(criterion(output, target))

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # Advance the schedule once per optimizer step, after the
                    # step itself.
                    scheduler.step()

                epoch_losses.append(loss.item())
                preds.append(output.detach().cpu().numpy())
                targets.append(target.detach().cpu().numpy())

            preds = np.concatenate(preds)
            targets = np.concatenate(targets)

            losses[phase].append(np.mean(epoch_losses))

            score = np.sqrt(mean_squared_error(targets, preds))
            scores[phase].append(score)

            if phase == 'val' and score < best_score:
                best_score = score
                # state_dict() returns live references; copy them so later
                # epochs cannot overwrite the best weights.
                best_model_wts = copy.deepcopy(model.state_dict())

    print('Best score:',best_score)
    return best_model_wts, best_score, [scores, losses]

In [None]:
infos = {}  # per-fold training history: fold -> [scores, losses]
for cur_fold in range(NUM_FOLDS):
    print('-'*5, 'start {}'.format(cur_fold), '-'*5)

    # Reload the initial-weights snapshot before training this fold.
    model.load_state_dict(model_original_stat_dict)

    # Hold out the current fold for validation and train on the others.
    p_train = train_df[train_df['fold'] != cur_fold].reset_index(drop=True)
    p_valid = train_df[train_df['fold'] == cur_fold].reset_index(drop=True)

    train_dataset = MyDataset(p_train['excerpt'], p_train['target'])
    val_dataset = MyDataset(p_valid['excerpt'], p_valid['target'])

    train_dataLoader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_dataLoader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    dataLoaders = {'train': train_dataLoader, 'val': val_dataLoader}

    # setup optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=LR, betas=(0.9, 0.99), weight_decay=1e-2)
    # Schedule length in optimizer steps: one per batch, which includes the
    # final partial batch that floor division would drop.
    train_steps = len(train_dataLoader) * NUM_EPOCHS
    warmup_steps = train_steps // 10  # linear warm-up over the first 10% of steps
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, train_steps)

    model_stat_dict, score, info = train(model, dataLoaders, optimizer, scheduler)
    infos[cur_fold] = info

    # Store this fold's best weights for the inference ensemble.
    save_path = f'model_{cur_fold}.pth'
    torch.save(model_stat_dict, save_path)

# 6. Submission

In [None]:
def get_preds(dataloader, model):
    """Run ``model`` over ``dataloader`` and collect its predictions.

    Args:
        dataloader: yields batches holding ``'ids'`` and ``'mask'`` tensors.
        model: model whose output exposes ``['logits']``.

    Returns:
        1-D NumPy array with one prediction per sample, in loader order.
    """
    # Inference must run in eval mode so dropout is disabled regardless of
    # the mode the model was left in.
    model.eval()
    preds = []
    with torch.no_grad():
        for data in dataloader:
            ids = data["ids"].to(DEVICE)
            mask = data["mask"].to(DEVICE)
            output = model(ids, mask)
            output = output['logits'].squeeze(-1)
            preds.append(output.detach().cpu().numpy())
    return np.concatenate(preds)

In [None]:
test_df = pd.read_csv(FILE_PATH['test'])
test_dataset = MyDataset(test_df['excerpt'], None, False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Predict the test set with each fold's checkpoint; the fold count comes from
# NUM_FOLDS so it always matches the checkpoints written during training.
all_preds = []
for fold in range(NUM_FOLDS):
    model_path = f'model_{fold}.pth'
    # map_location lets GPU-saved checkpoints load on whichever device is in use
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    all_preds.append(get_preds(test_loader, model))
all_preds

In [None]:
# One row per fold model, one column per test sample; after transposing, the
# row-wise mean is the ensemble prediction for each sample.
fold_predictions = pd.DataFrame(all_preds)
score = fold_predictions.T.mean(axis=1)
score

In [None]:
# Load the submission template and fill its target column with the averaged
# predictions.
sample = pd.read_csv(FILE_PATH['submit']).assign(target=score)
sample

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv(
    "submission.csv",
    index=False,
)