# 準備

In [None]:
# Check which GPU is attached to this runtime
!nvidia-smi

In [None]:
# Install the transformers package (the import itself happens in the next cell)
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import copy
import seaborn as sns
import os
import glob
import re
import nltk
nltk.download('punkt')
import collections
import scipy
import math
import gc
import random
import time
import datetime
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix, fbeta_score
from tqdm.notebook import tqdm


In [None]:
# Device handle used to move the model and tensors to the GPU; falls back to CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [None]:
# Input and output folders, relative to the current working directory
DATA_DIR, OUTPUT_DIR = "./", "Output/"

# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive (only available inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change the working directory to the project folder on the mounted Drive
%cd /content/drive/MyDrive/Colab Notebooks/Signate論文コンペ

# データの読み込み

In [None]:
# Load the train and test CSV files from DATA_DIR (train first, then test)
train_data, test_data = (
    pd.read_csv(DATA_DIR + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Preview the first 10 rows of the training data
train_data.head(10)

In [None]:
# Preview the first 10 rows of the test data
test_data.head(10)

## titleとabstractを結合

In [None]:
# Merge title and abstract into a single "text" column, then drop the two
# source columns. Missing values on either side are replaced by an empty
# string: in pandas NaN + str stays NaN, and a NaN text cannot be tokenized.
for _df in (train_data, test_data):
    _df["text"] = _df["title"].fillna("") + " " + _df["abstract"].fillna("")

train_data = train_data.drop(["title", "abstract"], axis=1)
test_data = test_data.drop(["title", "abstract"], axis=1)

In [None]:
# Preview the training data after title and abstract were merged into "text"
train_data.head(10)

In [None]:
# Preview the test data after title and abstract were merged into "text"
test_data.head(10)

# データの前理解

## 事前学習モデルのトークンを調べる

In [None]:
# Load the tokenizer that belongs to the PubMedBERT checkpoint
PRETRAINED_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

In [None]:
# Inspect the vocabulary of the pretrained tokenizer
tokenizer.vocab

## 512を超える文章の数を調べる

In [None]:
# Token length of every document, and how many documents exceed the
# 512-token limit that the messages below report.
# Each text is tokenized exactly once; the per-document lengths are kept in
# max_lens_train / max_lens_test.
TOKEN_LIMIT = 512

max_lens_train = [len(tokenizer.tokenize(x)) for x in train_data["text"]]
max_lens_test = [len(tokenizer.tokenize(x)) for x in test_data["text"]]

# Number of documents that are longer than the limit
i = sum(1 for n_tokens in max_lens_train if n_tokens > TOKEN_LIMIT)
j = sum(1 for n_tokens in max_lens_test if n_tokens > TOKEN_LIMIT)

print("最大値：",max(max_lens_train),",trainの文字数512オーバーは",i)
print("最大値：",max(max_lens_test),",testの文字数512オーバーは",j)

## 文書の長さの分布

In [None]:
# Plot the token-length distribution: train on the upper axis, test on the lower one
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 7))

for _ax, _lens, _name in ((ax1, max_lens_train, "train"), (ax2, max_lens_test, "test")):
    sns.histplot(_lens, ax=_ax, label=_name)
    _ax.set_xlabel(xlabel="lens")
    _ax.grid()
    # Each panel is titled after its own split
    _ax.set_title(_name, fontsize=10)
    _ax.set_xlim(1, 800)
    # A single legend call per axis; a second call would replace the first one
    _ax.legend(loc='upper right', fontsize=10)

# BERTのモデル学習

## input_idsとAttentionの取得
- 文章をトークンに分ける  
- スペシャルトークン[CLS],[SEP]を追加  
- トークンにIDを付与(input_ids)
- Attentionを付与(attention_masks)
- 文章を同じ長さ(max_length)に統一するために、切り捨てor[PAD]を追加

In [None]:
text_list = train_data['text'].tolist()

# Tokenize, add special tokens, map to ids and pad/truncate to a fixed length
encoded_dict = tokenizer.batch_encode_plus(
    text_list,
    add_special_tokens=True,        # add [CLS] and [SEP]
    # NOTE(review): 10 tokens looks like a debugging leftover (the notebook
    # analyses a 512-token limit) - confirm the intended value. It must match
    # the max_length used when encoding the test data.
    max_length=10,                  # maximum number of tokens per document
    truncation=True,                # cut documents longer than max_length
    padding='max_length',           # pad shorter documents with [PAD] up to max_length
    return_attention_mask=True,     # 1 for real tokens, 0 for padding
    return_tensors='pt',            # return PyTorch tensors
)

input_ids = encoded_dict.input_ids
attention_masks = encoded_dict.attention_mask
# Float labels, as required by the binary cross-entropy loss
labels = torch.tensor(train_data['judgement'].to_numpy(), dtype=torch.float)

## 学習用データと検証用データに分割

In [None]:
# Build the dataset of (input_ids, attention_mask, label) triples
dataset = TensorDataset(input_ids, attention_masks, labels)

# 70/30 split. random_state makes the split reproducible (the global seeds are
# only set later, right before training) and stratify keeps the label ratio
# identical in both parts.
train_dataset, val_dataset = train_test_split(
    dataset,
    test_size=0.3,
    random_state=42,
    stratify=dataset.tensors[2].numpy(),
)

print("train_dataset Size :", len(train_dataset))
print("val_dataset Size :", len(val_dataset))

## DataLoaderの作成<br>
batch_size毎にデータをモデルに学習させる

In [None]:
batch_size = 16

# Training loader: samples are drawn in random order without replacement and
# an incomplete final batch is dropped
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=batch_size,
    drop_last=True,
)

# Validation loader: samples are read in their stored order
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(
    val_dataset,
    sampler=val_sampler,
    batch_size=batch_size,
)

In [None]:
# Report the number of batches of each loader and show the first sample behind it
print("train_dataLoader Size:", len(train_dataloader))
first_train_sample = train_dataloader.dataset[0]
print(first_train_sample)
print()
print("validation_dataloader Size:", len(validation_dataloader))
first_val_sample = validation_dataloader.dataset[0]
print(first_val_sample)

## モデルの定義

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    num_labels=1,                   # a single output logit
    output_attentions=False,        # do not return attention weights
    output_hidden_states=True,      # return the hidden states
)
model.resize_token_embeddings(len(tokenizer))   # match the embedding size to the tokenizer
# Move the model to the selected device; unlike model.cuda() this also works
# on a CPU-only runtime
model.to(device)

model   # display the model structure

## 最適化関数の定義

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 matches the default of the deprecated transformers.AdamW
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),     # parameters to optimize
    lr=2e-5,                # learning rate
    eps=1e-8,               # term added to the denominator for numerical stability
    weight_decay=0.0,
)

## スケジューラーの定義
総ステップごとに学習率を変化させていく

In [None]:
epochs = 5

# Total number of training steps = mini-batches per epoch x epochs
batches_per_epoch = len(train_dataloader)
total_steps = batches_per_epoch * epochs

# Linear schedule: the learning rate rises from 0 to the optimizer's initial
# value during warm-up, then falls linearly back to 0
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,                 # no warm-up phase
    num_training_steps=total_steps,     # length of the whole schedule
)

## その他関数

**時間計測**

In [None]:
def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

**評価関数の定義**

In [None]:
   def BestScore(labels, prediction, n_steps=1000, step_size=0.0001, beta=7.0):
       """Search the decision threshold that maximizes the F-beta score.

       The thresholds ``0, step_size, 2*step_size, ...`` (``n_steps`` values) are
       tried in ascending order. A sample is labelled 0 when its probability is
       below the threshold and 1 otherwise.

       Args:
           labels: true binary labels.
           prediction: predicted probabilities, same length as ``labels``.
           n_steps: number of thresholds to try.
           step_size: distance between two consecutive thresholds.
           beta: beta of the F-beta score.

       Returns:
           Tuple ``(threshold, fbeta_score, confusion_matrix, classification_report)``
           for the first (smallest) threshold that reaches the best score.

       Raises:
           ValueError: if ``n_steps`` is smaller than 1.
       """
       if n_steps < 1:
           raise ValueError("n_steps must be at least 1")

       Threshold_list = [i * step_size for i in range(n_steps)]
       fbeta_score_list = [
           fbeta_score(labels, np.where(prediction < Threshold, 0, 1), beta=beta)
           for Threshold in Threshold_list
       ]

       # index() returns the first occurrence, i.e. the smallest best threshold
       best_index = fbeta_score_list.index(max(fbeta_score_list))
       best_threshold = Threshold_list[best_index]

       # The matrix and the report are only needed for the winning threshold
       best_predictions = np.where(prediction < best_threshold, 0, 1)
       return (
           best_threshold,
           fbeta_score_list[best_index],
           confusion_matrix(labels, best_predictions),
           classification_report(labels, best_predictions),
       )

## 学習本番

In [None]:
# Fix the random seeds so that a run can be repeated
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


Sigmoid = torch.nn.Sigmoid()
Criterion = torch.nn.BCELoss()

training_stats = []
best_stats = []

best_val_loss = np.inf

# Start time of the whole run
total_t0 = time.time()

epochs = 5

for epoch in range(0, epochs):

    print()
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    t0 = time.time()
    train_loss_sum = 0.0        # sum of the per-sample losses seen so far
    train_sample_count = 0      # number of samples seen so far
    total_train_loss = 0        # running mean training loss

    model.train()

    for step, (batch_input_ids, batch_attention_masks, batch_labels) in enumerate(train_dataloader):

        batch_input_ids = batch_input_ids.to(device)                # token ids
        batch_attention_masks = batch_attention_masks.to(device)    # attention masks
        batch_labels = batch_labels.to(device)                      # labels

        optimizer.zero_grad()   # reset the gradients of all optimized parameters

        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_masks)

        # Drop only the trailing logit dimension so a batch of one stays 1-D
        y_proba = Sigmoid(outputs.logits).squeeze(-1)

        loss = Criterion(y_proba, batch_labels)
        loss.backward()         # compute the gradients

        optimizer.step()        # update the parameters
        # The schedule length is (batches x epochs), so it advances once per batch
        scheduler.step()

        # The criterion returns the batch mean; weight it by the batch size so
        # that the running value is a true per-sample mean
        n_in_batch = batch_labels.size(0)
        train_loss_sum += loss.item() * n_in_batch
        train_sample_count += n_in_batch

        if (step % 100 == 0 and not step==0) or step==(len(train_dataloader)-1):
            elapsed = format_time(time.time() - t0)
            total_train_loss = train_loss_sum / train_sample_count
            print('  【Batch {:>5,}  of  {:>5,}    Elapsed: {:}】       Train Loss {:.5f}'.format(step, len(train_dataloader)-1, elapsed, total_train_loss))

    # Duration of the training part of this epoch
    training_time = format_time(time.time() - t0)

    print()
    print("  Average training loss: {0:.5f}".format(total_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()  # evaluation mode (switches dropout / batch norm behaviour)

    val_loss_sum = 0.0
    val_sample_count = 0
    total_val_loss = 0

    y_proba_list = []
    val_labels_list = []

    for step, (batch_input_ids, batch_attention_masks, batch_labels) in enumerate(validation_dataloader):

        batch_input_ids = batch_input_ids.to(device)
        batch_attention_masks = batch_attention_masks.to(device)
        batch_labels = batch_labels.to(device)

        # No gradients are needed for the forward pass or the loss
        with torch.no_grad():
            outputs = model(batch_input_ids,
                            attention_mask=batch_attention_masks)
            y_proba = Sigmoid(outputs.logits).squeeze(-1)
            loss = Criterion(y_proba, batch_labels)

        # The last validation batch may be smaller, hence the per-sample weighting
        n_in_batch = batch_labels.size(0)
        val_loss_sum += loss.item() * n_in_batch
        val_sample_count += n_in_batch

        if  (step % 100 == 0 and not step==0) or step==(len(validation_dataloader)-1):
            elapsed = format_time(time.time() - t0)
            total_val_loss = val_loss_sum / val_sample_count
            print('  【Batch {:>5,}  of  {:>5,}    Elapsed: {:}】       Validation Loss {:.5f}'.format(step, len(validation_dataloader)-1, elapsed, total_val_loss))

        y_proba_list.append(y_proba.to("cpu").numpy())
        val_labels_list.append(batch_labels.to("cpu").numpy())

    # Probabilities and labels in the same (sequential) order
    prediction = np.concatenate(y_proba_list)
    labels = np.concatenate(val_labels_list)

    Threshold, Fbeta_Score, Confusion_Matrix, Classification_Report = BestScore(labels, prediction)


    validation_time = format_time(time.time() - t0)
    print()
    print("  <Finished>")
    print("  Validation Loss: {0:.5f}".format(total_val_loss))
    print("  Validation took: {:}".format(validation_time))
    print()
    print(   "<Result>")
    print("  Threshold: {0:.5f}".format(Threshold))
    print("  f-bata-score: {0:.5f}".format(Fbeta_Score))
    print("  confusion_matrix:")
    print(Confusion_Matrix)
    print("  classification_report:")
    print(Classification_Report)

    # Keep the statistics of the epoch with the lowest validation loss
    if total_val_loss < best_val_loss:
        best_val_loss = total_val_loss   # remember the best value for later epochs
        best_epoch = epoch+1
        best_val_score = total_val_loss
        best_score = Fbeta_Score
        best_Threshold = Threshold

        best_stats = [
            {
                "epoch":best_epoch,
                "best_val_score":best_val_score,
                "best_score":best_score,
                "best_Threshold":best_Threshold
            }
        ]

    training_stats.append(
                        {
                        'epoch': epoch + 1,
                        'Training Loss': total_train_loss,
                        'Valid. Loss': total_val_loss,
                        'Valid. FbetaScore.': Fbeta_Score,
                        'Training Time': training_time,
                        'Validation Time': validation_time
                        }
                        )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

## 学習結果

**スタッツ**

In [None]:
# Show the per-epoch statistics collected during training
training_stats

**Bestスタッツ**

In [None]:
# Show the statistics recorded for the best epoch
best_stats

# 作成したBERTモデルでtestデータを予測

## input_idsとAttentionの取得

In [None]:
text_list = test_data['text'].tolist()

# Encode the test texts with the same settings as the training texts
encoded_dict = tokenizer.batch_encode_plus(
    text_list,
    add_special_tokens=True,        # add [CLS] and [SEP]
    # NOTE(review): 10 tokens looks like a debugging leftover - confirm the
    # intended value. It must match the max_length used for the training data.
    max_length=10,                  # maximum number of tokens per document
    truncation=True,                # cut documents longer than max_length (needed for equal-length tensors)
    padding='max_length',           # pad shorter documents with [PAD] up to max_length
    return_attention_mask=True,     # 1 for real tokens, 0 for padding
    return_tensors='pt',            # return PyTorch tensors
)

input_ids = encoded_dict.input_ids
attention_masks = encoded_dict.attention_mask

## Datasetの作成

In [None]:
# Wrap the encoded test tensors (ids and masks only, no labels)
test_dataset = TensorDataset(input_ids, attention_masks)

n_test_samples = len(test_dataset)
print(f'{n_test_samples:>5,} test dataset')

## DataLoaderの作成

In [None]:
# Original note: "does not work with 8 or more".
# NOTE(review): that note contradicts the value 16 used here - confirm which is intended.
batch_size = 16

test_dataloader = DataLoader(
                            test_dataset,                               # The test samples.
                            batch_size = batch_size,                    # Predict with this batch size
                            shuffle=False                               # keep the original row order
                            )

## testデータを予測

In [None]:
# Fix the random seeds
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

Sigmoid = torch.nn.Sigmoid()

proba_list = []

# ========================================
#               Prediction
# ========================================
print("")
print("Running test...")

t0 = time.time()

model.eval()  # evaluation mode (switches dropout / batch norm behaviour)

# One pass over the test data, in its original order
for step, (batch_input_ids, batch_attention_masks) in enumerate(test_dataloader):

    batch_input_ids = batch_input_ids.to(device)
    batch_attention_masks = batch_attention_masks.to(device)

    with torch.no_grad():
        outputs = model(batch_input_ids,
                        attention_mask=batch_attention_masks)
        # Drop only the trailing logit dimension: a final batch holding a
        # single sample must stay 1-D, otherwise np.concatenate below fails
        batch_proba = Sigmoid(outputs.logits).squeeze(-1)

    proba_list.append(batch_proba.to("cpu").numpy())

    if  (step % 100 == 0 and not step==0) or step==(len(test_dataloader)-1):
        elapsed = format_time(time.time() - t0)
        print('  【Batch {:>5,}  of  {:>5,}    Elapsed: {:}】'.format(step, len(test_dataloader)-1, elapsed))

# Probabilities of all test samples as one 1-D array
proba = np.concatenate(proba_list)
print("Finished!!")

**確率**

In [None]:
# Show the predicted probabilities for the test data
proba

**ラベル付け**

In [None]:
# Label a sample 0 when its probability is below 0.0004, otherwise 1.
# NOTE(review): the threshold is hard-coded; presumably copied from a validation
# run (cf. best_Threshold) - confirm it matches the model that is actually used.
prediction = np.where(proba<0.0004,0,1)

## 提出

In [None]:
# Build the submission Series from the predicted labels
submission = pd.Series(prediction)
# NOTE(review): hard-coded index range; presumably the ids of the test rows -
# verify that it matches the test data (its length must equal the number of predictions)
submission.index=range(27145,67979)
# Uncomment to write the submission file (without header) into OUTPUT_DIR
#submission.to_csv(OUTPUT_DIR + 'submission.csv',header=None)

In [None]:
# Show the submission Series
submission