In [None]:
#default libraries
#https://docs.python.org/ja/
import os
import math
import random
import pprint
import time
import typing
import json
import glob
import warnings
import gc

import numpy as np #https://numpy.org/
import pandas as pd #https://pandas.pydata.org/
import sklearn #https://scikit-learn.org/stable/

import matplotlib.pyplot as plt #https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.html
%matplotlib inline
from tqdm import tqdm #https://tqdm.github.io/

import torch #https://pytorch.org/
import transformers #https://huggingface.co/transformers/

In [None]:
#https://www.kaggle.com/getting-started/140636

import torch
from numba import cuda

# !pip install GPUtil
# from GPUtil import showUtilization as gpu_usage

def free_gpu_cache():
    """Call ``torch.cuda.empty_cache()`` and return ``None``.

    The optional GPUtil usage report and the numba device reset that used to
    surround this call are intentionally left out (they were disabled).
    """
    torch.cuda.empty_cache()

#free_gpu_cache()

In [None]:
class CFG:
    """Notebook-wide settings, read everywhere as class attributes (``CFG.x``)."""

    # --- data location / run mode ---
    input_path = "../input/commonlitreadabilityprize"
    debug = False
    seed = 3

    # --- training hyper-parameters ---
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    batch_size = 16
    epochs = 10
    learning_rate = 2e-5
    kFold = 5

    # --- speed-related cuDNN switches ---
    # https://qiita.com/sugulu_Ogawa_ISID/items/62f5f7adee083d96a587

    # If the forward pass and the loss computation stay roughly the same on every
    # iteration, torch.backends.cudnn.benchmark = True speeds up GPU computation.
    # It is kept off here.
    torch.backends.cudnn.benchmark = False

    # Reportedly makes the GPU slower.
    torch.backends.cudnn.deterministic = True


def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    The value is also written to ``os.environ['PYTHONHASHSEED']`` as a string.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python's and NumPy's global generators.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # PyTorch: default generator, current CUDA device, then every CUDA device.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Seed every random number generator once, using the configured seed.
set_seed(CFG.seed)

In [None]:
def color(string,fg='DEFAULT',bg='DEFAULT',fg_rgb=None,bg_rgb=None,style='END'):
    """Wrap ``str(string)`` in ANSI escape sequences for terminal colouring.

    Args:
        string: value to render; converted with ``str``.
        fg, bg: colour names looked up in the palette below (position = ANSI digit).
        fg_rgb, bg_rgb: optional 3-item RGB sequences; when truthy they replace
            the named foreground / background colour.
        style: style name looked up in the style table below.

    Returns:
        ``style + foreground + background + text`` followed by the reset code.

    Raises:
        ValueError: if ``fg``, ``bg`` or ``style`` is not a known name.
    """
    palette = ['BLACK','RED','GREEN','YELLOW','BLUE','PURPLE','CYAN','WHITE','8','DEFAULT']
    effects = ['END','BOLD','2','3','UNDERLINE','5','6','REVERSE','INVISIBLE','9']

    # Named codes are always resolved first, so an unknown name raises even
    # when an RGB override is supplied.
    fg_code = f'\033[3{palette.index(fg)}m'
    bg_code = f'\033[4{palette.index(bg)}m'
    style_code = f'\033[0{effects.index(style)}m'

    if fg_rgb:
        fg_code = f"\033[38;2;{fg_rgb[0]};{fg_rgb[1]};{fg_rgb[2]}m"
    if bg_rgb:
        bg_code = f"\033[48;2;{bg_rgb[0]};{bg_rgb[1]};{bg_rgb[2]}m"

    return ''.join((style_code, fg_code, bg_code, str(string), '\033[0m'))

In [None]:
# Read the competition files; every frame is indexed by its `id` column.
train = pd.read_csv(
    os.path.join(CFG.input_path, "train.csv"),
    usecols=['id', 'excerpt', 'target'],
    index_col='id',
)
test = pd.read_csv(
    os.path.join(CFG.input_path, "test.csv"),
    usecols=['id', 'excerpt'],
    index_col='id',
)
sample_submission = pd.read_csv(
    os.path.join(CFG.input_path, "sample_submission.csv"),
    index_col='id',
)

# Debug mode: keep only the first 1/30 of the training rows.
if CFG.debug:
    train = train.iloc[:len(train) // 30]

# The test file has no labels; add a placeholder so both frames share the same columns.
test['target'] = 0

# Stack train on top of test (train rows come first).
df = pd.concat([train, test])

In [None]:
# Display the combined train + test frame.
df

In [None]:
# Load the RoBERTa tokenizer from the local pretrained-model directory.
tokenizer=transformers.RobertaTokenizer.from_pretrained("../input/clrp-roberta-base/clrp_roberta_base")

In [None]:
# Number of token ids `tokenizer.encode` produces for every excerpt (train + test).
df['token_len'] = [len(tokenizer.encode(text)) for text in df.excerpt]
#df['token_len_']=df.excerpt.str.split().apply(len)

# Longest encoded excerpt; used later as the padding length.
max_len = int(df['token_len'].max())
max_len

In [None]:
# Split the combined frame back apart: the first len(train) rows are the
# training rows, everything after them is the test set.
train, test = df.iloc[:len(train)].copy(), df.iloc[len(train):].copy()

In [None]:
# Display the training frame (now including the token_len column).
train

In [None]:
# Display the test frame (placeholder target plus token_len).
test

In [None]:
class CommonLit_DataSet(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        sentences: sequence of excerpt strings (list, array or pandas Series).
        targets: sequence of numeric targets aligned with ``sentences``.
        tokenizer: optional tokenizer callable; when ``None`` the module-level
            ``tokenizer`` is used (the original behaviour).
        max_length: optional padding/truncation length; when ``None`` the
            module-level ``max_len`` is used (the original behaviour).

    Each item is a dict with ``'ids'`` and ``'mask'`` (long tensors) and
    ``'targets'`` (float tensor).
    """
    def __init__(self,sentences,targets,tokenizer=None,max_length=None):
        self.sentences=sentences
        self.targets=targets
        self.tokenizer=tokenizer
        self.max_length=max_length

    def __len__(self):
        return len(self.sentences)

    @staticmethod
    def _at(values,idx):
        """Return the ``idx``-th element by position.

        ``series[idx]`` is label-based on a pandas Series: it raises KeyError on
        a non-default integer index and relies on a deprecated positional
        fallback on a string index, so ``.iloc`` is used whenever it exists.
        """
        return values.iloc[idx] if hasattr(values,'iloc') else values[idx]

    def __getitem__(self,idx):
        # Fall back to the notebook globals when no explicit tokenizer/length was given.
        tok=self.tokenizer if self.tokenizer is not None else tokenizer
        length=self.max_length if self.max_length is not None else max_len

        enc=tok(
            self._at(self.sentences,idx).replace('\n', ''),
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            # Guarantees every item has exactly `length` tokens so batches can be
            # stacked even if an excerpt encodes longer than expected.
            truncation=True,
            max_length=length,
        )
        return {
            'ids':torch.tensor(enc['input_ids'],dtype=torch.long),
            'mask':torch.tensor(enc['attention_mask'],dtype=torch.long),
            #'token_type_ids':torch.tensor(enc['token_type_ids'],dtype=torch.long),
            'targets':torch.tensor(self._at(self.targets,idx),dtype=torch.float)
        }

In [None]:
class RMSELoss(torch.nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(yhat, y) + eps)``.

    ``eps`` is added to the mean squared error before the square root is taken.
    """
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = torch.nn.MSELoss()

    def forward(self,yhat,y):
        mean_sq_err = self.mse(yhat,y)
        return (mean_sq_err + self.eps).sqrt()

# Shared loss instance used by the training/validation loop.
loss_fun=RMSELoss()

In [None]:
# First goal: just get training to run.
def train_val_test(model,dataloader,phase,optimizer=None,scheduler=None,scaler=None):
    """Run one pass over ``dataloader[phase]`` and return the predictions.

    Args:
        model: called as ``model(ids, mask)``; its result is indexed with ``"logits"``.
        dataloader: mapping from phase name to an iterable of batches. Each batch
            is a dict with ``'ids'`` and ``'mask'`` and, unless ``phase == 'test'``,
            ``'targets'``.
        phase: ``'train'`` enables gradients and parameter updates; any other value
            runs the model in eval mode; ``'test'`` additionally skips the loss.
        optimizer: required when ``phase == 'train'``.
        scheduler: optional; stepped once per batch while training.
        scaler: optional ``GradScaler``; when ``None`` a plain
            ``backward()`` / ``optimizer.step()`` is used.

    Returns:
        list of per-sample predictions, in the order the batches were yielded
        (i.e. shuffled order if the loader shuffles).

    Raises:
        ValueError: if ``phase == 'train'`` and no optimizer is given.
    """
    is_train = phase=='train'
    if is_train and optimizer is None:
        raise ValueError("optimizer is required when phase=='train'")

    # Model mode and device.
    if is_train:
        model.train()
    else:
        model.eval()
    model.to(CFG.device)

    # Mixed precision is only meaningful on CUDA; disabling it elsewhere avoids a warning per batch.
    use_amp = CFG.device.type=='cuda'

    preds=[]

    # Loop over the mini-batches of the requested phase.
    for enc in dataloader[phase]:

        # Reset accumulated gradients.
        if is_train:optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):

            # non_blocking=True reportedly lets the CPU keep working while data is
            # copied from pinned memory to the GPU.
            ids = enc["ids"].to(CFG.device,non_blocking=True)
            mask = enc["mask"].to(CFG.device,non_blocking=True)

            # Only the forward pass and the loss run under autocast; the AMP
            # documentation advises against running backward() inside it.
            loss_val = None
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(ids,mask)["logits"].squeeze(-1)

                if phase!='test':
                    loss_val = loss_fun(outputs, enc["targets"].to(CFG.device,non_blocking=True))

            # Back-propagation and parameter update (training only).
            if is_train:
                if scaler is not None:
                    scaler.scale(loss_val).backward()  # backward on the scaled loss
                    scaler.step(optimizer)             # optimizer update
                    scaler.update()                    # scaler update
                else:
                    loss_val.backward()
                    optimizer.step()

                # Learning-rate update, once per batch.
                if scheduler is not None:
                    scheduler.step()

        # Cast to float32 so half-precision autocast outputs do not leak into the metrics.
        preds.extend(outputs.detach().float().cpu().numpy())

    # One collection per pass instead of one per batch.
    gc.collect()

    return preds

In [None]:
#https://www.kaggle.com/chumajin/bert-v-s-roberta-english

from sklearn.model_selection import KFold
from transformers import get_linear_schedule_with_warmup


def initialize(seed,fold):
    """Build everything needed to train one cross-validation fold.

    Args:
        seed: value passed to ``set_seed`` before anything is constructed.
        fold: index (``0 .. CFG.kFold-1``) of the KFold split to use.

    Returns:
        ``(dataset, dataloader, model, optimizer, scheduler, scaler)`` where
        ``dataset`` and ``dataloader`` are dicts keyed by ``'train'``,
        ``'valid'`` and ``'test'``.
    """
    set_seed(seed)

    # NOTE: the fold assignment is driven by CFG.seed, not by the `seed` argument.
    kf=KFold(n_splits=CFG.kFold,shuffle=True,random_state=CFG.seed)
    train_index, valid_index = list(kf.split(train))[fold]

    dataset={
        'train':CommonLit_DataSet(train.iloc[train_index].excerpt, train.iloc[train_index].target),
        'valid':CommonLit_DataSet(train.iloc[valid_index].excerpt, train.iloc[valid_index].target),
        'test':CommonLit_DataSet(test.excerpt,test.target),
    }

    def _make_loader(ds,shuffle):
        """DataLoader with the shared batch size / worker settings."""
        return torch.utils.data.DataLoader(
            ds,
            batch_size=CFG.batch_size,
            shuffle=shuffle,
            num_workers=2,#os.cpu_count(),
            pin_memory=True
        )

    # Only the training loader shuffles.
    dataloader={
        'train':_make_loader(dataset['train'],True),
        'valid':_make_loader(dataset['valid'],False),
        'test':_make_loader(dataset['test'],False),
    }

    model = transformers.RobertaForSequenceClassification.from_pretrained("../input/clrp-roberta-base/clrp_roberta_base",num_labels=1)

    # transformers.AdamW is deprecated in favour of torch.optim.AdamW;
    # eps=1e-6 keeps the default the transformers implementation used.
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate, betas=(0.9, 0.999), eps=1e-6, weight_decay=1e-2)

    # The scheduler is stepped once per training batch, so the schedule length must be
    # the number of batches in THIS fold's training loader times the epochs
    # (not the size of the whole training frame).
    train_steps = len(dataloader['train'])*CFG.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, int(train_steps*0.1), train_steps)

    # Gradient scaler for mixed-precision training; disabled when not running on CUDA.
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.device.type=='cuda')

    return dataset,dataloader,model,optimizer,scheduler,scaler
    

In [None]:
class _TargetRecorder:
    """Iterable wrapper that remembers each batch's targets in the order they were yielded."""
    def __init__(self,loader):
        self.loader=loader
        self.targets=[]

    def __iter__(self):
        self.targets=[]
        for batch in self.loader:
            self.targets.extend(batch["targets"].tolist())
            yield batch


def _rmse(preds,targets):
    """Root mean squared error between two equally long sequences, as a plain float."""
    diff=np.asarray(preds,dtype=np.float64)-np.asarray(targets,dtype=np.float64)
    return float(np.sqrt(np.mean(diff**2)))


def cross_validation():
    """Train and validate every fold, saving the best checkpoint of each fold.

    For each fold: train for ``CFG.epochs`` epochs, print and plot train/valid RMSE
    per epoch, and write ``Roberta_fold<fold>.pth`` whenever the validation RMSE
    improves. A train/valid loss curve is plotted at the end of each fold.
    """
    for fold in range(CFG.kFold):
        print('fold',fold)
        losses={
            'train':[],
            'valid':[]
        }

        dataset,dataloader,model,optimizer,scheduler,scaler=initialize(CFG.seed,fold)

        # The training loader shuffles, so predictions come back in shuffled order.
        # Record the targets in that same order; comparing against
        # dataset['train'].targets would pair each prediction with the wrong row.
        train_recorder=_TargetRecorder(dataloader['train'])
        train_loader={'train':train_recorder}

        # Start from +inf so the first epoch always produces a checkpoint.
        bestscore=float('inf')

        for epoch in range(CFG.epochs):
            print("epoch=",epoch)

            preds=train_val_test(model,train_loader,'train',optimizer,scheduler,scaler)
            train_targets=train_recorder.targets
            plt.scatter(train_targets,preds,color='blue',s=5)

            loss=_rmse(preds,train_targets)
            print(color("train",bg="BLUE",style='BOLD')+':'+color(f"{loss}","BLUE"))
            losses['train'].append(loss)

            # The validation loader does not shuffle, so dataset order matches.
            preds=train_val_test(model,dataloader,'valid')
            plt.scatter(dataset['valid'].targets,preds,color='red',s=5)

            loss=_rmse(preds,dataset['valid'].targets)
            print(color("val  ",bg="RED",style='BOLD')+':'+color(f"{loss}","RED"))
            losses['valid'].append(loss)

            if bestscore > loss:
                bestscore = loss
                print(color("BEST SCORE",bg='YELLOW')+' :',color(bestscore,'YELLOW'))

                torch.save(
                    {
                        'state_dict': model.state_dict(),
                        'optimizer_dict': optimizer.state_dict(),
                        # Plain float, so the checkpoint holds only tensors and builtin types.
                        'bestscore':float(bestscore),
                        'seed':CFG.seed
                    },
                    "Roberta_fold"+str(fold)+".pth"
                )

            # Dashed y = x reference line for the prediction-vs-target scatter.
            plt.plot([i/10 for i in range(-40,20,1)],[i/10 for i in range(-40,20,1)],linestyle='dashed',color='green')
            plt.show()

        plt.plot(losses['train'],color='blue')
        plt.plot(losses['valid'],color='red')
        plt.show()

        # Release this fold's model/optimizer before the next fold allocates its own.
        del dataset,dataloader,train_loader,train_recorder,model,optimizer,scheduler,scaler
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
%%time
# Train and validate every fold; the best checkpoint of each fold is written to disk.
cross_validation()

In [None]:
# One checkpoint per fold, as written by cross_validation().
best_models=[f"Roberta_fold{fold}.pth" for fold in range(CFG.kFold)]

# One column of test predictions per checkpoint, indexed by the test ids.
preds=pd.DataFrame(index=pd.Index(test.index,name='id'),columns=best_models)

# The test loader does not depend on the checkpoint, so build it once.
dataloader={'test':torch.utils.data.DataLoader(
        CommonLit_DataSet(test.excerpt,test.target),
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=2,#os.cpu_count(),
        pin_memory=True
)}

for pth in best_models:

    # Read the checkpoint once, onto the CPU, so it also loads on a machine without
    # the GPU it was saved from; train_val_test moves the model to CFG.device.
    checkpoint=torch.load(pth,map_location='cpu')

    model = transformers.RobertaForSequenceClassification.from_pretrained("../input/clrp-roberta-base/clrp_roberta_base",num_labels=1)
    model.load_state_dict(checkpoint["state_dict"])

    score=checkpoint["bestscore"]
    print(score)

    # float32 keeps the later row-wise averaging out of half precision.
    preds[pth]=np.asarray(train_val_test(model,dataloader,'test'),dtype=np.float32)

    # Free this fold's weights before loading the next checkpoint.
    del model,checkpoint
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Display the per-checkpoint test predictions.
preds

In [None]:
# Average the per-checkpoint predictions row-wise into a single `target`
# column, indexed by `id`.
submission = preds.mean(axis=1).to_frame(name='target').rename_axis('id')
submission

In [None]:
# `index=id` passed the builtin function `id`, which only wrote the index because
# any function object is truthy; request the index column explicitly instead.
submission.to_csv("submission.csv",index=True)