# Bert Model A

## ライブラリロード

In [1]:
!pip install sentencepiece
!pip install pandas



In [2]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert import albert_tokenization
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from tensorflow import keras
import pprint
import sentencepiece as spm

## Story Name

In [3]:
# Base names of the story data files; ".txt" is appended wherever a file is opened.
story_names = [
    title + "_formatted_S2"
    for title in (
        "ACaseOfIdentity",
        "CrookedMan",
        "DancingMen",
        "DevilsFoot",
        "SpeckledBand",
    )
]

# Vocab.txt作成

# Build a plain-text vocabulary (one token per line) from every story file.
vocab_file = "./vocab.txt"

# Token -> occurrence count over all stories. Only the keys are written out
# below; dict insertion order determines the line order of the vocabulary.
words = {}
for story_name in story_names:
    data_file_name = story_name + ".txt"

    with open(data_file_name, "r") as f:
        for line in f:
            # Every "." is removed, then the row is split on "," and on single spaces.
            line = line.replace(".", "")
            tokens = line.rstrip("\n").split(",")
            for token in tokens:
                for key in token.split(" "):
                    if not key:
                        # Blank lines and doubled separators produce "", which
                        # would otherwise be written as an empty vocabulary line.
                        continue
                    words[key] = words.get(key, 0) + 1

with open(vocab_file, "w") as f:
    # The five special tokens occupy the first five lines of the file.
    f.writelines(["[PAD]\n", "[UNK]\n", "[CLS]\n", "[SEP]\n", "[MASK]\n"])

    for key in words:
        f.write(key)
        f.write("\n")

## Preparing the Data

In [5]:
class SherlockModelData:
    """Token-id inputs and text labels built from one comma-separated story file.

    Every row of the file has all "." characters removed and is then split on
    ",". Column 0 is the grouping key; columns 1 and 2 are combined with it
    according to ``pattern``:

    * ``'p3'``: one sequence per key -
      ``[CLS] key [SEP] col1 col2 [SEP] col1 col2 [SEP] ...`` over all rows of the key.
    * ``'p2'``: one sequence per row - ``[CLS] key [SEP] col1 [SEP] col2 [SEP]``.
    * any other value (the 'p1' layout): one sequence per row -
      ``[CLS] key [SEP] col1 col2 [SEP]``.

    Column 1 is always lower-cased before use. The pieces are concatenated
    without separators and encoded with ``bert.albert_tokenization.encode_ids``.

    Attributes set by the constructor:
        sp: the SentencePiece processor passed in.
        sample_size: stored as given; not used by any method of this class.
        lower: forwarded to ``albert_tokenization.preprocess_text``.
        max_seq_len: length of the longest encoded sequence.
        train: 2-D array of token ids, right-padded with 0 to ``max_seq_len``.
        train_type: all-zero array with the same shape as ``train``.
        label: per sequence, the raw text pieces (key, lower-cased col1, col2, ...).
    """

    # The `sp` annotation is a string so that defining the class does not
    # require the `spm` name to be resolvable at definition time.
    def __init__(self, sp: "spm.SentencePieceProcessor", data_file_name: str, pattern: str, sample_size=None, lower: bool = True):
        """Load ``data_file_name`` and build ``train``, ``train_type`` and ``label``.

        Args:
            sp: loaded SentencePiece processor used for encoding.
            data_file_name: path of the comma-separated story file.
            pattern: ``'p3'``, ``'p2'``, or anything else for the 'p1' layout.
            sample_size: stored on the instance only.
            lower: passed to ``preprocess_text`` as its ``lower`` argument.
        """
        self.sp = sp
        self.sample_size = sample_size
        self.max_seq_len = 0
        self.lower = lower

        if pattern == 'p3':
            prepare = self._prepareP3
        elif pattern == 'p2':
            prepare = self._prepareP2
        else:
            prepare = self._prepareP1

        (t, self.label) = prepare(self._load_data(data_file_name))
        (self.train, self.train_type) = self._pad(t)

    def _load_data(self, data_file_name: str):
        """Read the story file and group its rows by their first column.

        Returns:
            dict mapping column 0 to the list of rows sharing it, in file
            order; each row is the full list of comma-separated columns.
            Blank lines are skipped.
        """
        x = {}
        # `with` guarantees the handle is closed even if a row fails to parse.
        with open(data_file_name, "r") as f:
            for line in f:
                line = line.replace(".", "").rstrip("\n")
                if not line.strip():
                    # A blank row would register a bogus "" key whose row has
                    # no columns 1/2 and would crash the _prepare* methods.
                    continue
                tokens = line.split(",")
                x.setdefault(tokens[0], []).append(tokens)
        return x

    def _clean(self, text):
        """Run ``text`` through ALBERT's ``preprocess_text`` with this instance's ``lower`` flag."""
        return bert.albert_tokenization.preprocess_text(text, lower=self.lower)

    def _encode(self, tokens):
        """Concatenate ``tokens``, encode them to ids and update ``max_seq_len``."""
        token_ids = bert.albert_tokenization.encode_ids(self.sp, ''.join(tokens))
        self.max_seq_len = max(self.max_seq_len, len(token_ids))
        return token_ids

    @staticmethod
    def _stack(rows):
        """Convert ``rows`` to an ndarray without failing on ragged input.

        Rows of equal length become a regular array, exactly as ``np.array``
        would build it. Rows of differing length become a 1-D object array
        holding each row unchanged; plain ``np.array(rows)`` raises ValueError
        for such input on NumPy >= 1.24.
        """
        if len({len(row) for row in rows}) <= 1:
            return np.array(rows)
        stacked = np.empty(len(rows), dtype=object)
        for index, row in enumerate(rows):
            stacked[index] = row
        return stacked

    def _prepareP3(self, data):
        """Build one sequence per key, chaining every row of that key.

        Returns:
            (token id sequences, labels); the label of a key is
            ``[key, col1.lower(), col2, col1.lower(), col2, ...]``.
        """
        x, y = [], []
        for key in data:
            tokens = ["[CLS]", self._clean(key)]
            sentence = [key]
            for line in data[key]:
                tokens.append("[SEP]")
                tokens.append(self._clean(line[1].lower() + " " + line[2]))
                sentence.append(line[1].lower())
                sentence.append(line[2])
            tokens.append("[SEP]")

            x.append(self._encode(tokens))
            y.append(np.array(sentence))
        return self._stack(x), self._stack(y)

    def _prepareP2(self, data):
        """Build one sequence per row with columns 1 and 2 as separate segments.

        Returns:
            (token id sequences, labels); each label is
            ``[key, col1.lower(), col2]``.
        """
        x, y = [], []
        for key in data:
            for line in data[key]:
                tokens = [
                    "[CLS]", self._clean(key),
                    "[SEP]", self._clean(line[1].lower()),
                    "[SEP]", self._clean(line[2]),
                    "[SEP]",
                ]
                sentence = [key, line[1].lower(), line[2]]

                x.append(self._encode(tokens))
                y.append(np.array(sentence))
        return self._stack(x), self._stack(y)

    def _prepareP1(self, data):
        """Build one sequence per row with columns 1 and 2 joined by a space.

        Returns:
            (token id sequences, labels); each label is
            ``[key, col1.lower(), col2]``.
        """
        x, y = [], []
        for key in data:
            for line in data[key]:
                tokens = [
                    "[CLS]", self._clean(key),
                    "[SEP]", self._clean(line[1].lower() + " " + line[2]),
                    "[SEP]",
                ]
                sentence = [key, line[1].lower(), line[2]]

                x.append(self._encode(tokens))
                y.append(np.array(sentence))
        return self._stack(x), self._stack(y)

    def _pad(self, ids):
        """Right-pad every id sequence with 0 up to ``self.max_seq_len``.

        Args:
            ids: iterable of id sequences (lists or 1-D arrays).

        Returns:
            (padded ids, token type ids), both of shape
            ``(len(ids), max_seq_len)``; the token type ids are all zero.
        """
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            # list() so that ndarray rows are concatenated with the padding
            # instead of being added to it element-wise.
            # Truncate to max_seq_len itself: the former "max_seq_len - 2"
            # bound cut the last two tokens off the longest sequences, since
            # max_seq_len is the observed maximum length.
            input_ids = list(input_ids)[:self.max_seq_len]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)


# BERT モデル

# Paths of the pre-trained BERT checkpoint, all derived from the release directory.
bert_model_dir = "2018_10_18"
bert_ckpt_dir = f"{bert_model_dir}/cased_L-24_H-1024_A-16"
bert_ckpt_file, bert_config_file = (
    os.path.join(bert_ckpt_dir, leaf)
    for leaf in ("bert_model.ckpt", "bert_config.json")
)
# The model name is the name of the directory that contains the checkpoint file.
bert_model_name = os.path.basename(os.path.split(bert_ckpt_file)[0])

print(bert_model_name)

## BERT TEST

# Smoke test: embed one story with the pre-trained BERT checkpoint and
# round-trip the result through an .npz file.

# The tokenizer module is reached through the imported `bert` package; the bare
# name `bert_tokenization` is never imported in this notebook and raised NameError.
# NOTE(review): vocab_file is the vocabulary generated from the story files,
# not the vocab.txt shipped with the checkpoint - confirm this is intended.
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file=vocab_file)
bert_params = bert.params_from_pretrained_ckpt(bert_ckpt_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

story_name = "ACaseOfIdentity_formatted_S2"
pattern='p3'
model_file_name = story_name + "-bert-" + pattern + ".npz"
data_file_name = story_name + ".txt"
# NOTE(review): SherlockModelData.__init__ takes `sp` (a SentencePiece
# processor) and has no `tokenizer` parameter, so this call raises TypeError as
# written. The class encodes through albert_tokenization only; this cell needs
# either a BERT-capable data class or to be retired.
modelData = SherlockModelData(tokenizer=tokenizer,data_file_name=data_file_name,pattern=pattern)
print(modelData.label)

# Wrap the BERT layer in a Keras model whose input length is the longest sequence.
l_input_ids = keras.layers.Input(shape=(modelData.max_seq_len,), dtype='int32')
output = l_bert(l_input_ids)                              # output: [batch_size, max_seq_len, hidden_size]
model = keras.Model(inputs=l_input_ids, outputs=output)
model.build(input_shape=(None, modelData.max_seq_len))
# Weights are loaded after build().
bert.load_bert_weights(l_bert, bert_ckpt_file)
model.summary()

vector = model.predict(modelData.train)

print(vector.shape)
# Positional arrays are stored under the keys 'arr_0' (vectors) and 'arr_1' (labels).
np.savez(model_file_name,vector,modelData.label)
npz = np.load(model_file_name,allow_pickle=True)
print(npz['arr_0'])
print(npz['arr_1'])

# BERT ベクトルデータ生成

# Generate and save BERT vectors for every story using the 'p3' layout.
pattern = 'p3'

# Loop-invariant setup, done once instead of once per story.
# The tokenizer module is reached through the imported `bert` package; the bare
# name `bert_tokenization` is never imported in this notebook and raised NameError.
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file=vocab_file)
bert_params = bert.params_from_pretrained_ckpt(bert_ckpt_dir)

for story_name in story_names:
    model_file_name = story_name + "-bert-" + pattern + ".npz"
    data_file_name = story_name + ".txt"

    print("================>" + story_name)
    # A fresh layer per story because the input length differs between stories.
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
    # NOTE(review): SherlockModelData.__init__ takes `sp` (a SentencePiece
    # processor) and has no `tokenizer` parameter, so this call raises TypeError
    # as written; the class must support a BERT tokenizer before this loop can run.
    modelData = SherlockModelData(tokenizer=tokenizer,data_file_name=data_file_name, pattern=pattern)

    print(modelData.train)
    l_input_ids = keras.layers.Input(shape=(modelData.max_seq_len,), dtype='int32')
    output = l_bert(l_input_ids)
    model = keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, modelData.max_seq_len))
    bert.load_bert_weights(l_bert, bert_ckpt_file)
    model.summary()
    
    vector = model.predict(modelData.train)
    # Positional arrays are stored under the keys 'arr_0' (vectors) and 'arr_1' (labels).
    np.savez(model_file_name,vector,modelData.label)
    npz = np.load(model_file_name,allow_pickle=True)
    print(npz['arr_0'])
    print(npz['arr_1'])

# ALBERT MODEL

In [6]:
# Name of the ALBERT variant; it is also embedded in the output .npz file names.
model_name = "albert_xxlarge"
# Fetch the model into the local ".models" directory and keep the returned path.
# NOTE(review): this rebinds `bert_ckpt_dir`, which the BERT cells earlier in
# the notebook set to the BERT checkpoint directory; re-running those cells
# after this one would pick up the ALBERT path instead.
bert_ckpt_dir = bert.fetch_tfhub_albert_model(model_name, ".models")
print(bert_ckpt_dir)

Fetching ALBERT model: albert_xxlarge version: 2
Already  fetched:  albert_xxlarge.tar.gz
already unpacked at: .models/albert_xxlarge
.models/albert_xxlarge


## ALBERT TEST

# Smoke test: embed one story with ALBERT and round-trip the result through an .npz file.

# SentencePiece model located under "assets" in the fetched ALBERT directory.
do_lower_case = True
sp = spm.SentencePieceProcessor()
spm_model = os.path.join(bert_ckpt_dir, "assets", "30k-clean.model")
sp.load(spm_model)

# Encode a single story with the 'p1' layout.
pattern = 'p1'
story_name = "ACaseOfIdentity_formatted_S2"
data_file_name = f"{story_name}.txt"
modelData = SherlockModelData(
    sp=sp,
    data_file_name=data_file_name,
    pattern=pattern,
    lower=do_lower_case,
)

model_params = bert.albert_params(model_name)
l_bert = bert.BertModelLayer.from_params(model_params, name="albert")
model_file_name = f"{story_name}-{model_name}-{pattern}.npz"

# Keras model around the ALBERT layer; input length is the longest encoded sequence.
seq_len = modelData.max_seq_len
l_input_ids = keras.layers.Input(shape=(seq_len,), dtype='int32')
output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]
model = keras.Model(inputs=l_input_ids, outputs=output)
model.build(input_shape=(None, seq_len))
bert.load_albert_weights(l_bert, bert_ckpt_dir)
model.summary()

vector = model.predict(modelData.train)
print(vector.shape)

# Positional arrays are stored under the keys 'arr_0' (vectors) and 'arr_1' (labels).
np.savez(model_file_name, vector, modelData.label)
npz = np.load(model_file_name, allow_pickle=True)
print(npz['arr_0'])
print(npz['arr_1'])

In [7]:
# Generate and save ALBERT vectors for every story using the 'p2' layout.
pattern = 'p2'

# Loop-invariant setup, done once instead of once per story: the SentencePiece
# model and the ALBERT parameters do not depend on the story being processed.
spm_model = os.path.join(bert_ckpt_dir, "assets", "30k-clean.model")
sp = spm.SentencePieceProcessor()
sp.load(spm_model)
do_lower_case = True
model_params = bert.albert_params(model_name)

for story_name in story_names:
    model_file_name = story_name + "-" + model_name + "-" + pattern + ".npz"
    data_file_name = story_name + ".txt"
    
    print("================> " + story_name)

    modelData = SherlockModelData(sp=sp,data_file_name=data_file_name, pattern=pattern,lower=do_lower_case)
    print(modelData.train)
    
    # A fresh layer and model per story because the input length differs between stories.
    l_bert = bert.BertModelLayer.from_params(model_params, name="albert")
    l_input_ids = keras.layers.Input(shape=(modelData.max_seq_len,), dtype='int32')
    output = l_bert(l_input_ids)
    model = keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, modelData.max_seq_len))
    bert.load_albert_weights(l_bert, bert_ckpt_dir)
    model.summary()
    
    vector = model.predict(modelData.train)
    # Positional arrays are stored under the keys 'arr_0' (vectors) and 'arr_1' (labels).
    np.savez(model_file_name,vector,modelData.label)

[[636   1 500 ...   0   0   0]
 [636   1 500 ...   0   0   0]
 [636   1 500 ...   0   0   0]
 ...
 [636   1 500 ...   0   0   0]
 [636   1 500 ...   0   0   0]
 [636   1 500 ...   0   0   0]]
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Done loading 22 BERT weights from: .models/albert_xxlarge into <bert.model.BertModelLayer object at 0x7f05a8fe7a90> (prefix:albert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from saved model: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(No