In [None]:
!pip install transformers["ja"]
!pip install accelerate
!pip install colorama

# for TPU
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator

from transformers import (AutoModel,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          AutoConfig,
                          get_cosine_schedule_with_warmup
                         )

# for TPU
import torch_xla
import torch_xla.core.xla_model as xm

from colorama import Fore, Back, Style
# Short aliases for colorama colour codes, used to colourise console output.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
# Appended after a coloured message to switch styling off again.
sr_ = Style.RESET_ALL


In [None]:
# Hyper-parameters and runtime settings read by the cells below.
config = dict(
    lr=2e-5,                  # learning rate handed to AdamW
    wd=1e-5,                  # weight decay handed to AdamW (alternative tried: 0.01)
    batch_size=16,
    valid_step=50,            # run validation every `valid_step` training batches
    max_len=512,              # tokenizer max_length (inputs are padded / truncated to it)
    epochs=8,
    nfolds=5,                 # could probably be a little smaller
    seed=42,
    # GPU / CPU alternative:
    # device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    device=xm.xla_device(),
    # Available checkpoints: https://huggingface.co/cl-tohoku
    model_name='cl-tohoku/bert-base-japanese',
    # Other candidates:
    # model_name='cl-tohoku/bert-base-japanese-v2'
    # model_name='cl-tohoku/bert-large-japanese'
)

# Fix the random seeds
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices), exports
    ``PYTHONHASHSEED`` and switches cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling 'PYTHONASSEED' had no effect).
    # Note it is only read at interpreter start-up, so it affects child processes launched
    # afterwards, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms non-deterministically,
    # which defeats the flag above; it must be off for reproducible runs.
    torch.backends.cudnn.benchmark = False

# Apply the configured seed once, before any data splitting or model construction happens.
seed_everything(seed=config['seed'])

In [None]:
# # make me
# # https://www.ai-shift.co.jp/techblog/2138

# def cut_head_and_tail(tokenizer,text):
#     # First tokenize without imposing any length limit
#     input_ids = tokenizer.encode(text)
#     n_token = len(input_ids)

#     # Case: the token count equals the maximum length
#     if n_token == config["max_len"]:
#         input_ids = input_ids
#         attention_mask = [1 for _ in range(config["max_len"])]
#         token_type_ids = [1 for _ in range(config["max_len"])]
#     # Case: the token count is below the maximum length
#     elif n_token < config["max_len"]:
#         pad = [1 for _ in range(config["max_len"]-n_token)]
#         input_ids = input_ids + pad
#         attention_mask = [1 if n_token > i else 0 for i in range(config["max_len"])]
#         token_type_ids = [1 if n_token > i else 0 for i in range(config["max_len"])]
#     # Case: the token count exceeds the maximum length
#     else:
#         harf_len = (config["max_len"]-2)//2
#         _input_ids = input_ids[1:-1]
#         input_ids = [0]+ _input_ids[:harf_len] + _input_ids[-harf_len:] + [2]
#         attention_mask = [1 for _ in range(config["max_len"])]
#         token_type_ids = [1 for _ in range(config["max_len"])]

#     d = {
#         "input_ids": torch.tensor(input_ids, dtype=torch.long),
#         "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
#         "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
#     }   
#     return d

# def make_max_len_context(text):
#     tokenizer=AutoTokenizer.from_pretrained(config['model_name'])
#     decode = cut_head_and_tail(tokenizer, text)
#     context = tokenizer.decode(decode['input_ids'], skip_special_tokens=True)
#     return context

# # make_512_context(AutoTokenizer.from_pretrained(config['model_name']), train_all_df['context'][0])

In [None]:
# Load the data
# The train/test inputs come from the 'truncation-512-tokens' dataset; the raw competition
# files are kept here for reference:
# train_all_df = pd.read_csv('../input/fakenews-nlp/train.csv')
# test_df = pd.read_csv('../input/fakenews-nlp/test.csv')
train_all_df = pd.read_csv('../input/truncation-512-tokens/512_train.csv')
test_df = pd.read_csv('../input/truncation-512-tokens/512_test.csv')
sample_sub = pd.read_csv('../input/fakenews-nlp/sample_submission.csv')

# Character length of every context string
for frame in (train_all_df, test_df):
    frame['context_simple_len'] = frame['context'].map(len)

# Preview the first rows of each table, then summary statistics of train and test
for frame in (train_all_df, test_df, sample_sub):
    display(frame.head())
print('='*20)
for frame in (train_all_df, test_df):
    display(frame.describe())

In [None]:
# train_all_df['context'] = train_all_df['context'].map(lambda x: make_max_len_context(x))
# test_df['context'] = test_df['context'].map(lambda x: make_max_len_context(x))

# train_all_df['context_simple_len'] = train_all_df['context'].map(lambda x: len(x))
# test_df['context_simple_len'] = test_df['context'].map(lambda x: len(x))

# display(train_all_df.head())
# display(test_df.head())
# display(sample_sub.head())
# print('='*20)
# display(train_all_df.describe())
# display(test_df.describe())

In [None]:
# train_all_df.to_csv('512_train.csv', index=False)
# test_df.to_csv('512_test.csv', index=False)

In [None]:
# Assign folds
# Each row receives the number of the stratified fold in which it is used for validation.
train_all_df['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
fold_col = train_all_df.columns.get_loc('Fold')
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_all_df,y=train_all_df['isfake'])):
    # split() yields *positional* indices, so write through .iloc. Label-based .loc only
    # gives the same result while the frame happens to carry a default RangeIndex.
    train_all_df.iloc[valid_idx, fold_col] = k

# Every row must have landed in exactly one validation fold.
assert (train_all_df['Fold'] >= 0).all(), "some rows were not assigned to a fold"

In [None]:
# Dataset definition
class SeqDataset(Dataset):
    """Labelled dataset that tokenizes one ``context`` string per item.

    Args:
        df: frame providing the ``context`` (text) and ``isfake`` (label) columns.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt', ...)``.
        max_len: value forwarded as ``max_length``; padding is set to ``'max_length'``
            and truncation is enabled.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.targets = df['isfake'].to_numpy()
        self.context = df['context'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.context)

    def __getitem__(self, idx):
        """Return ``(encoding, target)`` where target is a float tensor built from the label."""
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encoded = self.tokenizer(self.context[idx], **tokenizer_kwargs)
        label = torch.tensor(self.targets[idx], dtype=torch.float)
        return encoded, label

In [None]:
# Model definition

class AttentionHead(nn.Module):
    """Attention pooling head placed on top of the BERT encoder output.

    The library's default head could be used instead; this is a custom one that is
    trained together with the rest of the model.

    Each position is scored with ``V(tanh(W(features)))``, the scores are softmax-normalised
    over ``dim=1`` and the input is summed over ``dim=1`` using those weights.
    Presumably ``features`` is (batch, seq_len, in_features) - TODO confirm against caller.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # W is created before V so parameter order and seeded initialisation are unchanged.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        # Weighted sum over dim=1 collapses that dimension.
        return (weights * features).sum(dim=1)
    

class Model(nn.Module):
    """The full model: pretrained encoder -> AttentionHead pooling -> single-output linear layer.

    Args:
        path: model name or path passed to ``AutoModel.from_pretrained`` and
            ``AutoConfig.from_pretrained``.
    """

    def __init__(self, path):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they must stay as they are
        # for saved checkpoints to remain loadable.
        self.roberta = AutoModel.from_pretrained(path)
        self.config = AutoConfig.from_pretrained(path)

        hidden_size = self.config.hidden_size
        self.head = AttentionHead(hidden_size, hidden_size)
        # Optional dropout in front of the output layer (currently disabled):
        # self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, **xb):
        """Forward the tokenizer outputs ``xb`` and return the linear layer's output."""
        encoder_out = self.roberta(**xb)[0]  # first element of the encoder's output
        pooled = self.head(encoder_out)
        # pooled = self.dropout(pooled)
        return self.linear(pooled)

In [None]:
# For inference
class TestSeqDataset(Dataset):
    """Unlabelled dataset that tokenizes one ``context`` string per item.

    Args:
        df: frame providing the ``context`` column.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt', ...)``.
        max_len: value forwarded as ``max_length``. Defaults to ``config['max_len']``,
            which keeps existing two-argument callers working unchanged.
    """
    def __init__(self,df,tokenizer,max_len=None):
        self.excerpt = df['context'].to_numpy()
        self.tokenizer = tokenizer
        # The global config is only consulted when no explicit length is given, so the
        # class can be reused (and tested) independently of the notebook's settings.
        self.max_len = config['max_len'] if max_len is None else max_len

    def __getitem__(self,idx):
        """Return the tokenizer output for item ``idx``."""
        encode = self.tokenizer(self.excerpt[idx],return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',truncation=True)
        return encode

    def __len__(self):
        return len(self.excerpt)

def get_prediction(df,path,device=config['device']):
    """Predict with a saved checkpoint for every row of ``df``.

    Args:
        df: frame with a ``context`` column.
        path: file holding a ``Model`` state_dict.
        device: device the model and batches are moved to (defaults to ``config['device']``,
            evaluated once when this function is defined).

    Returns:
        1-D ``np.ndarray`` with one prediction per row, in row order.
    """
    model = Model(config['model_name'])
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    # Deserialise onto the CPU first and move afterwards; this works regardless of the
    # device the checkpoint was written from.
    model.load_state_dict(torch.load(path,map_location='cpu'))
    model.to(device)
    model.eval()

    test_ds = TestSeqDataset(df,tokenizer)
    test_dl = DataLoader(test_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        # pinned memory only helps host->CUDA copies
                        pin_memory=torch.cuda.is_available())

    predictions = list()
    # Inference needs no autograd graph; without no_grad() every batch keeps its
    # activations alive for a backward pass that never happens.
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the total number of batches.
        for inputs in tqdm(test_dl):
            # The tokenizer returns (1, max_len) per item; collapse the extra dimension
            # that batching introduces so each tensor is (batch, max_len).
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            predictions.extend(outputs.cpu().numpy().ravel().tolist())

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return np.array(predictions)

# Evaluation metric
def eval_fn(outputs,targets):
    """Return the root-mean-squared error between ``outputs`` and ``targets`` as a Python float.

    Both arguments are converted to float tensors and flattened before comparison.
    """
    flat_pred = torch.tensor(outputs, dtype=torch.float).view(-1)
    flat_true = torch.tensor(targets, dtype=torch.float).view(-1)
    mse = nn.MSELoss()(flat_pred, flat_true)
    return torch.sqrt(mse).item()


In [None]:
# Training
def run(fold,verbose=True):
    """Train one cross-validation fold and return its out-of-fold predictions.

    Args:
        fold: value of the ``Fold`` column used as the validation split; all other rows
            of ``train_all_df`` are used for training.
        verbose: when True, print a message each time the validation loss improves.

    Returns:
        A new DataFrame holding the validation rows plus a ``pred`` column with the
        predictions of the best checkpoint.

    Side effects:
        Writes ``./model{fold}/model{fold}.bin`` (best weights), the tokenizer files and
        ``./model{fold}/oof_{fold}.bin`` (the out-of-fold predictions).
    """

    accelerator = Accelerator()
    print(f"{accelerator.device} is used")

    def loss_fn(outputs,targets):
        """Root-mean-squared error over the flattened batch."""
        outputs = outputs.view(-1)
        targets = targets.view(-1)
        return torch.sqrt(nn.MSELoss()(outputs,targets))

    def validation_loss(valid_loader, model, loss_fn):
        """Mean of the per-batch losses over ``valid_loader``, computed without gradients."""
        model.eval()
        total = 0
        with torch.no_grad():
            for inputs2, targets2 in valid_loader:
                inputs2 = {key:val.reshape(val.shape[0],-1) for key,val in inputs2.items()}
                outputs2 = model(**inputs2)
                total += loss_fn(outputs2,targets2).item()
        return total / len(valid_loader)

    def train_and_evaluate_loop(train_loader,valid_loader, model, loss_fn,optimizer,epoch,fold,best_loss,valid_step=10,lr_scheduler=None):
        """Train for one epoch, validating periodically, and return the best validation loss so far."""
        train_loss = 0

        for i, (inputs1,targets1) in enumerate(tqdm(train_loader)):
            # Validation below switches to eval mode, so re-enable train mode every step.
            model.train()
            optimizer.zero_grad()
            inputs1 = {key:val.reshape(val.shape[0],-1) for key,val in inputs1.items()}
            outputs1 = model(**inputs1)
            loss1 = loss_fn(outputs1,targets1)
            # Accelerate requires its own backward() in place of loss.backward().
            accelerator.backward(loss1)
            optimizer.step()

            train_loss += loss1.item()

            if lr_scheduler:
                lr_scheduler.step()

            # Validate every `valid_step` batches and after the last batch of the epoch,
            # keeping only the checkpoint with the lowest validation loss seen so far.
            # No parameter update happens during validation, so the validation fold never
            # influences the weights directly. It does decide which checkpoint is kept,
            # so the OOF score computed from that checkpoint is somewhat optimistic.
            if (i % valid_step == 0) or ((i + 1) == len(train_loader)):
                valid_loss = validation_loss(valid_loader, model, loss_fn)
                if valid_loss <= best_loss:
                    if verbose:
                        print(f"epoch:{epoch} | Train Loss:{train_loss/(i+1)} | Validation loss:{valid_loss}")
                        print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")

                    best_loss = valid_loss
                    # Save the unwrapped model through Accelerate: the state_dict keys then
                    # match a plain `Model`, and the write is handled correctly on TPU /
                    # multi-process setups.
                    accelerator.save(accelerator.unwrap_model(model).state_dict(),
                                     f'./model{fold}/model{fold}.bin')
                    tokenizer.save_pretrained(f'./model{fold}')

        return best_loss

    train_df, valid_df= train_all_df.query(f"Fold != {fold}"), train_all_df.query(f"Fold == {fold}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    model = Model(config['model_name'])

    train_ds = SeqDataset(train_df, tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    valid_ds = SeqDataset(valid_df,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    optimizer = optim.AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])
    # NOTE(review): the schedule spans 10 epochs' worth of steps while training runs for
    # config['epochs'] epochs, so with fewer than 10 epochs the cosine decay never reaches
    # zero. Left unchanged in case it is intentional - confirm.
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps= 10 * len(train_dl))

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    print(f"Fold: {fold}")
    os.makedirs(f'model{fold}',exist_ok=True)
    best_loss = 9999
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_loss = train_and_evaluate_loop(train_dl,valid_dl,model,loss_fn,
                                            optimizer,epoch,fold,best_loss,
                                            valid_step=config['valid_step'],lr_scheduler=lr_scheduler)

    # Out-of-fold predictions from the best checkpoint of this fold
    pred = get_prediction(valid_df, f'./model{fold}/model{fold}.bin')
    # Persist the predictions as an artifact; the in-memory array is used directly, so
    # there is no need to read the file back.
    torch.save({'pred': pred}, f'./model{fold}/oof_{fold}.bin')
    # assign() returns a new frame, avoiding a write into a slice of train_all_df
    # (which raises SettingWithCopyWarning).
    return valid_df.assign(pred=pred)


    

In [None]:
# Training
# Train every fold, collect its out-of-fold (OOF) predictions and report the score per fold
# and over all folds. Predictions are clipped to [0, 2] before scoring, the same range
# applied to the submission.
fold_frames = []
for f in range(config['nfolds']):
    _oof_df = run(f)
    fold_frames.append(_oof_df)
    _oof_score = eval_fn(_oof_df['pred'].clip(0,2).values, _oof_df['isfake'].values)
    print(f"========== oof_{f}: {_oof_score} ==========")

# Concatenate once at the end: growing a DataFrame inside the loop re-copies all previous
# rows on every iteration and starts from an empty frame, which pandas warns about.
oof_df = pd.concat(fold_frames)

oof_score = eval_fn(oof_df['pred'].clip(0,2).values, oof_df['isfake'].values)
print(f"========== oof: {oof_score} ==========")

In [None]:
# Prediction
# One test-set prediction per fold checkpoint.
# NOTE: the count of 5 is hard-coded and has to match config['nfolds'].
fold_checkpoints = [f'./model{k}/model{k}.bin' for k in range(5)]
pred1, pred2, pred3, pred4, pred5 = (
    get_prediction(test_df, checkpoint) for checkpoint in fold_checkpoints
)

In [None]:
# Idea: since the OOF predictions exist anyway, they could be used for the ensemble as well.
# For now the submission is the plain average of the five fold models, clipped to [0, 2].
fold_predictions = (pred1, pred2, pred3, pred4, pred5)
# fold_predictions = (pred1, pred2, pred3)
blended = fold_predictions[0]
for extra in fold_predictions[1:]:
    blended = blended + extra
sample_sub['isfake'] = blended / len(fold_predictions)
sample_sub['isfake'] = sample_sub['isfake'].clip(0,2)
sample_sub.to_csv('submission.csv',index=False)

In [None]:
# Show summary statistics of the submission table as a last sanity check.
sample_sub.describe()