In [8]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot  as plt
import torch
import torch.nn as nn
import torch.optim as optimizers
import torch.nn.functional as F 
import torchtext
import glob
import os
import urllib.request
import zipfile
import tarfile
import io
import string
import re
import random

# Seed every random number generator this notebook imports so that a
# "Restart Kernel -> Run All" reproduces the same results.
SEED = 9837
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Raise the pandas display limits so larger frames are shown in full
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

# データの前処理

## BERTのデータファイルを準備

In [9]:
# Prepare the folders that will hold the downloaded BERT files.
# NOTE(review): `path` is a machine-specific absolute location; adjust it to
# your own environment before running the notebook.
path = "D:/Statistics/data/deep_leraning/nlp/BERT/"
vocab_path = path + "/vocab/"
weights_path = path + "/weights/"

# makedirs also creates missing parent folders and is a no-op when the folder
# already exists, so this cell can be re-run safely on a fresh machine.
os.makedirs(vocab_path, exist_ok=True)
os.makedirs(weights_path, exist_ok=True)

In [15]:
# Fetch the BERT vocabulary file; skipped when a local copy already exists
save_path = path + "/vocab/bert-base-uncased-vocab.txt"
if not os.path.isfile(save_path):
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
    urllib.request.urlretrieve(url, save_path)

In [16]:
# Download the pretrained BERT model; skipped when the archive is already on disk
save_path = path + "/weights/bert-base-uncased.tar.gz"
if not os.path.isfile(save_path):
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz"
    urllib.request.urlretrieve(url, save_path)

# Extract the archive ("uncased" means the lower-casing mode of the model).
# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this against an archive obtained from a trusted source.
archive_file = path + "/weights/bert-base-uncased.tar.gz"
with tarfile.open(archive_file, 'r:gz') as tar:
    tar.extractall(weights_path)

## IMDbデータセットの準備

In [19]:
# Download the IMDb dataset.
# makedirs creates missing parent folders too and does nothing if `path` exists.
os.makedirs(path, exist_ok=True)

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
save_path = path + "/aclImdb_v1.tar.gz"
if not os.path.isfile(save_path):
    urllib.request.urlretrieve(url, save_path)

# Extract the tar file; the context manager closes it even if extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this against an archive obtained from a trusted source.
with tarfile.open(path + "/aclImdb_v1.tar.gz") as tar:
    tar.extractall(path)

In [22]:
# Merge the individual IMDb review files into one TSV file per split
target_path = path + "/aclImdb/"


def _append_reviews(out_file, review_dir, label):
    """Append one TSV row per ``*.txt`` file found directly in ``review_dir``.

    Args:
        out_file: Writable text file object that receives the rows.
        review_dir: Folder whose ``*.txt`` files are read.
        label: String written in the second column of every row.

    Each row consists of the first line of the review file, a tab, the label,
    another tab and a newline.
    """
    for fname in glob.glob(os.path.join(review_dir, '*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()

        # Tabs inside the review would be read as column separators
        text = text.replace('\t', " ")

        out_file.write(text + '\t' + label + '\t' + '\n')


if os.path.exists(target_path):

    # Training data: positive reviews get label 1, negative reviews label 0
    pos_path = path + "/aclImdb/train/pos/"
    neg_path = path + "/aclImdb/train/neg/"
    with open(path + "/IMDb_train.tsv", "w", encoding="utf-8") as f:
        _append_reviews(f, pos_path, '1')
        _append_reviews(f, neg_path, '0')

    # Test data: same layout as the training data
    pos_path = path + "/aclImdb/test/pos/"
    neg_path = path + "/aclImdb/test/neg/"
    with open(path + "/IMDb_test.tsv", "w", encoding="utf-8") as f:
        _append_reviews(f, pos_path, '1')
        _append_reviews(f, neg_path, '0')

# BERTの実装

## BERTのネットワークの設定ファイルの読み込み

In [28]:
import json
from attrdict import AttrDict

# Open the config file and parse it as JSON; the context manager closes the
# file handle as soon as parsing is done.
config_file = path + "/weights/bert_config.json"
with open(config_file, "r", encoding="utf-8") as json_file:
    config = json.load(json_file)
config

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [30]:
# Wrap the dict in AttrDict so its keys can also be read as attributes
config = AttrDict(config)
config.hidden_size

768

## BERT用にLayer Normalization層を定義

In [31]:
# Layer Normalization層を定義
class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        hidden_size: Size of the last dimension of the tensors to normalize.
        eps: Constant added to the variance before the square root so the
            division stays finite.
    """

    def __init__(self, hidden_size, eps=1e-12):
        super(BertLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_size))   # weight (scale) parameter
        self.beta = nn.Parameter(torch.zeros(hidden_size))   # bias (shift) parameter
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply gamma and beta.

        Returns:
            Tensor with the same shape as ``x``.
        """
        u = x.mean(-1, keepdim=True)                  # mean over the last dimension
        s = (x - u).pow(2).mean(-1, keepdim=True)     # biased variance over the last dimension
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta

## Embeddingsモジュールを定義

In [32]:
# 埋め込み層の実装
class BertEmbeddings(nn.Module):
    """Sum of token, position and token-type embeddings, then LayerNorm and Dropout.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings``, ``type_vocab_size`` and
            ``hidden_dropout_prob`` as attributes.
    """

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()

        # Token embedding: one vector per vocabulary id (id 0 is the padding index)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)

        # Positional embedding: one learned vector per position
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # Sentence (token type) embedding: one vector per segment id
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # Layer normalization
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        # Dropout
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids; dimension 1 is used as the
                sequence length.
            token_type_ids: Integer tensor with the same shape as ``input_ids``
                holding the segment id of each token. Defaults to all zeros.

        Returns:
            Tensor of shape ``[batch_size, seq_len, hidden_size]``.
        """
        # 1. Token embeddings
        words_embeddings = self.word_embeddings(input_ids)

        # 2. Sentence embeddings: treat every token as segment 0 when no ids are given
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 3. Positional embeddings: positions 0..seq_len-1, repeated for each row
        seq_length = input_ids.size(1)
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        # Add the three embeddings: [batch_size, seq_len, hidden_size]
        embeddings = words_embeddings + position_embeddings + token_type_embeddings

        # Apply layer normalization and dropout
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings

## Self-Attentionモジュール

In [None]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        config: Object exposing ``hidden_size``, ``num_attention_heads`` and
            ``attention_probs_dropout_prob`` as attributes.
    """

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()

        self.num_attention_heads = config.num_attention_heads

        # Width of one head and total width of all heads together
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Fully connected layers that produce the query, key and value features
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout applied to the attention probabilities
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension of ``x`` into heads.

        ``[batch, seq_len, all_head_size]`` becomes
        ``[batch, num_heads, seq_len, head_size]``.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, attention_show_flg=False):
        """Apply self-attention to ``hidden_states``.

        Args:
            hidden_states: Tensor of shape ``[batch, seq_len, hidden_size]``.
            attention_mask: Tensor that is added to the raw attention scores, so
                it must broadcast to ``[batch, num_heads, seq_len, seq_len]``.
                Presumably holds large negative values at masked positions;
                confirm against the caller.
            attention_show_flg: When true, the attention probabilities are
                returned as well.

        Returns:
            ``context_layer`` of shape ``[batch, seq_len, all_head_size]``, or the
            tuple ``(context_layer, attention_probs)`` when ``attention_show_flg``
            is true.
        """
        # Project the input into query / key / value features
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        # Reshape for multi-head attention
        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Scaled dot product between queries and keys gives the attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / float(np.sqrt(self.attention_head_size))

        # Apply the mask
        attention_scores = attention_scores + attention_mask

        # Normalize the scores into probabilities
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        # Weight the values by the attention map
        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back: [batch, heads, seq, head_size] -> [batch, seq, all_head_size]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        if attention_show_flg:
            return context_layer, attention_probs
        return context_layer

## BERT Layerモジュール

In [None]:
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        
        # Self-Attentioin