# 準備

In [1]:
!nvidia-smi

'nvidia-smi' は、内部コマンドまたは外部コマンド、
操作可能なプログラムまたはバッチ ファイルとして認識されていません。


In [86]:
#GPUに渡すための変数device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
#transformersをインポート
!pip install transformers



In [88]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import copy
import seaborn as sns
import os
import glob
import re
import nltk
nltk.download('punkt')
import collections
import scipy
import math
import gc
import random
import time
import datetime

import torch

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stsuk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [89]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

In [None]:
#出力フォルダの指定
DATA_DIR = "./"
OUTPUT_DIR = "Output/"
LOG_DIR = "Log/"
MODEL_DIR = "Model/"
PROBA_DIR = "Proba/"
TOKEN_DIR = "Token/"
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#作業ディレクトリの移動
%cd /content/drive/MyDrive/Colab Notebooks/Signate論文コンペ

# データの読み込み

In [90]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [7]:
train_data["text"] = train_data["title"] + " " + train_data["abstract"].fillna("")
test_data["text"] = test_data["title"] + " " + test_data["abstract"].fillna("")
train_data = train_data.drop(["title","abstract"],axis=1)
test_data = test_data.drop(["title","abstract"],axis=1)

## BERT

In [95]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

In [96]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

## トークン数チェック

In [97]:
max_lens_train = []
i = 0
for x in train_data["text"]:
    max_lens_train.append(len(tokenizer.tokenize(x)))
    max_lens = (len(tokenizer.tokenize(x)))
    if max_lens >= 256:
        i += 1     
        
max_lens_test = []
j = 0
for x in test_data["text"]:
    max_lens_test.append(len(tokenizer.tokenize(x)))
    max_lens = (len(tokenizer.tokenize(x)))
    if max_lens >= 256:
        j += 1

print("最大値：",max(max_lens_train),",trainの文字数512オーバーは",i)
print("最大値：",max(max_lens_test),",testの文字数512オーバーは",j)

最大値： 33 ,trainの文字数512オーバーは 0


# BERTのモデル学習

**ここでやってること**  
・文章をトークンに分ける  
・スペシャルトークン[CLS],[SEP]を追加  
・トークンに番号を付与  
・文章を同じ長さに統一  
・実際のトークンと[PAD]トークンを分けるアテンションマスクを作成

In [98]:
text_list = train_data['text'].tolist()

encoded_dict = tokenizer.batch_encode_plus(
                                            text_list,
                                            add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                                            max_length = 10,               # トークンの最大の長さ
                                            pad_to_max_length = True,       # paddingの方法(512に満たない場合は[PAD]を追加）
                                            return_attention_mask = True,   # attention_mask : 入力トークン(1)とパディングトークン(0)を区別
                                            return_tensors = 'pt'          # Return pytorch tensors.
                                            )

input_ids = encoded_dict.input_ids
attention_masks = encoded_dict.attention_mask
labels = torch.Tensor(train_data['judgement'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


**データの分割を行っている**

In [113]:
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split

# データセットの作成
dataset = TensorDataset(input_ids, attention_masks, labels)

train_dataset, val_dataset = train_test_split(dataset, test_size=0.3)

print("train_dataset Size :", len(train_dataset))
print("val_dataset Size :", len(val_dataset))

train_dataset Size : 7
val_dataset Size : 3


**DataLoaderの作成**

In [121]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
            train_dataset,                              # The training samples.
            sampler = RandomSampler(train_dataset),     # 重複無しでランダムな順番で読み込む
            batch_size = batch_size,                     # Trains with this batch size.
            drop_last=True
        )

validation_dataloader = DataLoader(
            val_dataset,                                # The validation samples.
            sampler = SequentialSampler(val_dataset),   # 重複無しでランダムな順番で読み込む
            batch_size = batch_size)                    # Evaluate with this batch size.

**分類モデルを作成**

In [122]:
from transformers import AutoModelForSequenceClassification, AdamW, BertConfig, BertForSequenceClassification,BertModel

model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    output_attentions = False,          # Whether the model returns attentions weights.
    output_hidden_states = True       # Whether the model returns all hidden-states.
)

model.resize_token_embeddings(len(tokenizer))   #入力トークンのサイズに変更する

model.cuda()    # Tell pytorch to run this model on the GPU.

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSeque

Embedding(28895, 768)

**最適化関数**

In [123]:
optimizer = AdamW(model.parameters(),   # パラメータ
                  lr = 1e-5,            # 学習率(デフォルトは1e-3)
                  eps = 1e-8          # 数値を安定させるため分母に追加する項(デフォルトは1e-8)
                )

**スケジュラーの定義**  
総ステップごとに学習率を変化させていく

In [124]:
#スケーリングの定義
from transformers import get_linear_schedule_with_warmup

epochs = 2

total_steps = len(train_dataloader) * epochs # [トレーニングステップ数] = [ミニバッチ数] × [エポック数]

#ウォームアップ期間が0からオプティマイザーに設定された初期lrまで直線的に増加した後、
# オプティマイザーに設定された初期lrから0に直線的に減少する学習率でスケジュールを作成
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,               # ウォームアップフェーズのステップ数
                                            num_training_steps = total_steps)   #トレーニングステップの総数

**評価関数の定義**

In [125]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import fbeta_score

# Function to calculate the accuracy of our predictions vs labels
Threshold = 0.023

def flat_accuracy(logits, labels):
    pred_flat = np.where(torch.nn.Sigmoid(logits).flatten() < Threshold, 0, 1)
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def F_Beta_Score(logits, labels):
    pred_flat = np.where(torch.nn.Sigmoid(logits).flatten() < Threshold, 0, 1)
    labels_flat = labels.flatten()
    return fbeta_score(pred_flat, labels_flat, beta=7)

**時間計測**

In [126]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

**本番**

In [127]:
#生成される乱数をあらかじめ指定
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


logits_list = []
training_stats = []

# トレーニング時間計測するために現在の時間情報を取得
total_t0 = time.time()

epochs = 2

# For each epoch...
for epoch in range(0, epochs):
    
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0
    
    # モデルをトレーニングモードに
    model.train()

    # For each batch of training data...
    for step, (input_ids, attention_masks, labels) in enumerate(train_dataloader):

        # 100 step 毎に経過を見る
        if step % 100 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        input_ids = input_ids.to(device)               # トークンIDの行列
        attention_masks = attention_masks.to(device)   # attention_maskの行列
        labels = labels.to(device)                     # ラベルデータ
        

        optimizer.zero_grad()   #最適化対象の全ての勾配を初期化（model.zero_grad():モデルの全てのパラメータの勾配を0にする)

        #モデルに当てはめ、出力
        outputs = model(input_ids=input_ids, 
                        attention_mask = attention_masks,
                        token_type_ids=None,
                        labels=labels)
        
        y_preds = torch.nn.Sigmoid(outputs.logits).squeeze()
        
        loss = torch.nn.BCELoss(y_preds, labels)   #損失を出力

        total_train_loss += loss.item() #損失を合計する

        loss.backward() # 勾配(微分)を計算する

        optimizer.step()    # パラメータを更新し、計算された勾配を使ってステップを踏む。オプティマイザは、学習率などに基づいてパラメータをどのように変更するか、「更新ルール」を決定する。　
                            # 勾配や学習率などに基づいて、パラメータをどのように変更するかを決定します。

    scheduler.step()    # 学習率の更新

    avg_train_loss = total_train_loss / len(train_dataloader)  # 全バッチの損失関数の平均を計算
    
    # トレーニング時間計測
    training_time = format_time(time.time() - t0)

    print()
    print("  Average training loss: {0:.5f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               評価
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()  # モデルを評価モードに変更（dropout & batch norm の切り替え）

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_fbeta_score = 0

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for step, (input_ids, attention_masks, labels) in validation_dataloader:
        
        input_ids = input_ids.to(device)               # トークンIDの行列
        attention_masks = attention_masks.to(device)   # attention_maskの行列
        labels = labels.to(device)
        
        with torch.no_grad():        

            outputs = model(input_ids, 
                            attention_mask=attention_masks)
        
        y_preds = torch.nn.Sigmoid(outputs.logits).squeeze()
        
        loss = torch.nn.BCELoss(y_preds, labels)   #損失を出力

        total_eval_loss += loss.item() #損失を合計する
        

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()  # detach:tensor型から勾配情報を抜く cpu():CPUに切り替える
        labels = labels.to('cpu').numpy()


        if epoch_i == epochs-1:
            logits_list.extend(logits)

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, labels)

        total_eval_fbeta_score += F_Beta_Score(logits, labels)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_fbeta_score = total_eval_fbeta_score / len(validation_dataloader)
    print("  Accuracy: {0:.5f}".format(avg_val_accuracy))
    print("  f-bata-score: {0:.5f}".format(avg_val_fbeta_score))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.5f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


ValueError: Target size (torch.Size([2])) must be the same as input size (torch.Size([2, 2]))

In [None]:
#labelの付与(閾値以下であれば0,そうでなければ1を付与)
Threshold = 0.5
pred_flat = np.where(torch.nn.Sigmoid(np.asarray(logits_list)).flatten() < Threshold, 0, 1)

In [None]:
#confusion matrix
answer_labels = []

for i in range(len(val_dataset)):
    answer_labels.append(val_dataset[i][2].to('cpu').numpy())

print(confusion_matrix(np.asarray(answer_labels), pred_flat))

In [None]:
#fbeta_score
fbeta_score(np.asarray(answer_labels), pred_flat, beta=7)

## testデータ

In [None]:
input_ids = []
attention_masks = []

for sent in test_data["text"]:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                        max_length = 256,               # トークンの最大の長さ
                        pad_to_max_length = True,       # paddingの方法(512に満たない場合は[PAD]を追加）
                        return_attention_mask = True,   # attention_mask : 入力トークン(1)とパディングトークン(0)を区別
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )
    
    # 文章をトークンIDで表記したリスト
    input_ids.append(encoded_dict['input_ids'])
    
    # attention_maskのリスト
    attention_masks.append(encoded_dict['attention_mask'])

# 作成したリストを整える
input_ids = torch.cat(input_ids, dim=0)                 # dim=0:文章を縦に積み重ねる dim=1:文章を横に繋げる
attention_masks = torch.cat(attention_masks, dim=0)     # dim=0:文章を縦に積み重ねる dim=1:文章を横に繋げる

# データを確認
print('Original: ', test_data["text"][0])  #元の文章
print('Token IDs:', input_ids[0])           #トークンIDで表記

**データの分割を行っている**

In [None]:
from torch.utils.data import TensorDataset, random_split

# データセットの作成
test_dataset = TensorDataset(input_ids, attention_masks)

print('{:>5,} test samples'.format(len(test_dataset)))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# ミニバッチのサイズを設定(16,32が推奨されている)
batch_size =  8   #8以上だと動かない

# DataLoaderを作成
test_dataloader = DataLoader(
            test_dataset,                              # The training samples.
            sampler = RandomSampler(train_dataset),     # 重複無しでランダムな順番で読み込む
            batch_size = batch_size,                    # Trains with this batch size
            shuffle=False.
        )

In [None]:
#生成される乱数をあらかじめ指定
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

test_logits_list = []
training_stats = []

# ========================================
#               評価
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.

print("")
print("Running Validation...")

t0 = time.time()

model.eval()  # モデルを評価モードに変更（dropout & batch norm の切り替え）

# Evaluate data for one epoch
for batch in test_dataloader:
    
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    
    with torch.no_grad():        

        outputs = model(b_input_ids, 
                                #token_type_ids=None, 
                                attention_mask=b_input_mask)

    logits = outputs.logits # 最後の活性化関数に通す前の値

    test_logits_lst.append(logits)

In [None]:
predict = pd.Series(model.predict_proba(test)[:,1]).apply(pred)
predict.index=range(27145,67979)
predict.to_csv('submit.csv',header=None)