In [2]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from tensorflow.keras.utils import Progbar

LANG_CODE = "en" #@param {type:"string"}

# Tokenizer that keeps runs of word characters and drops everything else.
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence that Python only tolerates with a warning.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return a lowercased, punctuation-free version of `text`.

  The input is converted with `str()`, lowercased, round-tripped through
  UTF-8 (characters that cannot be encoded are dropped), and then split by
  the module-level `regex_tokenizer`; the resulting tokens are joined with
  single spaces.
  """
  lowered = str(text).lower()
  # encode/decode round trip silently discards anything not encodable as UTF-8
  utf8_safe = lowered.encode("utf-8", "ignore").decode()
  # only the pieces matched by the tokenizer survive, so punctuation disappears
  tokens = regex_tokenizer.tokenize(utf8_safe)
  return " ".join(tokens)

def count_lines(filename, encoding="utf-8"):
  """Count the lines of a text file.

  Args:
    filename: path of the file to scan.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding (the other
      readers in this notebook open their files as UTF-8 as well).

  Returns:
    The number of lines as an int; 0 for an empty file.
  """
  with open(filename, encoding=encoding) as fi:
    # stream the file so large corpora are never held in memory
    return sum(1 for _ in fi)

In [3]:
import re

# Strip the START_OF_RECORD / END_OF_RECORD delimiters from the raw corpus and
# save the cleaned text as dataset.txt.
#
# Backslashes in the paths are escaped: the original "\p", "\o" and "\d" were
# invalid escape sequences that only worked because Python leaves unknown
# escapes untouched (with a warning). The path values are unchanged.
odata_path = ".\\pretrain_data_format\\original.txt"

# Read-only access is sufficient. UTF-8 is pinned because the next stage
# reads dataset.txt as UTF-8.
with open(odata_path, "r", encoding="utf-8") as file_object:
  odata_contents = file_object.read()

# "|" is the regex alternation operator, so the unescaped patterns matched the
# empty string instead of the literal "||||" delimiter; every pipe is escaped.
# NOTE(review): the header pattern is greedy up to the last "||||" on its line,
# which assumes the START_OF_RECORD header sits on a line of its own - confirm
# against the corpus format.
odata_contents = re.sub(r"START_OF_RECORD=.+\|\|\|\|", "", odata_contents)
odata_new = re.sub(r"\|\|\|\|END_OF_RECORD", "", odata_contents)

# The context manager guarantees the output is flushed and closed on error too.
with open(".\\pretrain_data_format\\dataset.txt", "w", encoding="utf-8") as file:
  file.write(odata_new)

In [4]:
# Backslashes are escaped so the literals no longer rely on invalid escape
# sequences ("\p", "\d"); the resulting path values are unchanged.
RAW_DATA_FPATH = ".\\pretrain_data_format\\dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = ".\\pretrain_data_format\\proc_dataset.txt" #@param {type: "string"}

# apply normalization to the dataset
# this will take a minute or two

# The line count is only used to size the progress bar.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi:
  with open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
    for raw_line in fi:
      # one normalized output line per input line, so blank lines are preserved
      fo.write(normalize_text(raw_line) + "\n")
      bar.add(1)



In [5]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 7000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

# Build the SentencePiece trainer flags one per entry and join them into the
# single space-separated command string handed to the trainer.
SPM_COMMAND = " ".join([
    "--input={}".format(PRC_DATA_FPATH),
    "--model_prefix={}".format(MODEL_PREFIX),
    # NUM_PLACEHOLDERS slots are held back from the requested vocabulary size
    "--vocab_size={}".format(VOC_SIZE - NUM_PLACEHOLDERS),
    "--input_sentence_size={}".format(SUBSAMPLE_SIZE),
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [6]:
def read_sentencepiece_vocab(filepath):
  """Load the token column of a tab-separated vocab file.

  Each line is split on tabs and only the first field is kept. The entry
  from the first line is discarded (the original comment identifies it as
  the <unk> token).

  Returns:
    A list with the first field of every line except the first.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [row.split("\t")[0] for row in vocab_file]
  # drop the leading <unk> entry
  return pieces[1:]

# Load the pieces from "<MODEL_PREFIX>.vocab" and print a quick summary:
# the total count plus ten randomly drawn tokens.
vocab_path = "{}.vocab".format(MODEL_PREFIX)
snt_vocab = read_sentencepiece_vocab(vocab_path)
print("Learnt vocab size: {}".format(len(snt_vocab)))
sample_tokens = random.sample(snt_vocab, 10)
print("Sample tokens: {}".format(sample_tokens))

Learnt vocab size: 6743
Sample tokens: ['rated', '▁dress', '▁proxy', '▁stimuli', '▁pad', '▁limit', '▁premed', '▁tips', '▁deep', 'riction']


In [7]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece-style token to the "##" continuation style.

    A token that starts with the "▁" marker has that single leading character
    removed; any other token is returned with "##" prepended.
    """
    starts_word = token.startswith("▁")
    return token[1:] if starts_word else "##" + token
        
# Final vocabulary layout: control symbols first, then the converted
# SentencePiece tokens, then [UNUSED_i] fillers up to VOC_SIZE entries.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + [parse_sentencepiece_token(piece) for piece in snt_vocab]

# a non-positive remainder produces no filler entries
num_unused = VOC_SIZE - len(bert_vocab)
bert_vocab.extend("[UNUSED_{}]".format(idx) for idx in range(num_unused))
print(len(bert_vocab))

7000


In [8]:
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Write the vocabulary one token per line. The encoding is pinned to UTF-8:
# the tokens were read from a UTF-8 file and may be non-ASCII, so relying on
# the platform default encoding could fail or produce a mis-encoded file.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [9]:
# Smoke-test the generated vocabulary with BERT's own tokenizer.
# The "bert" directory is added to the import path before importing from it;
# presumably this is a local checkout of the BERT repository - confirm.
sys.path.append("bert")
from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder
# Sample sentence used only to eyeball how the new vocabulary splits text.
testcase = " Essentially , Mr. Cornea is a 60 year old male who noted the onset of dark urine during early January . He underwent CT and ERCP at the Lisonatemi Faylandsburgnic, Community Hospital with a stent placement and resolution of jaundice . He underwent an ECHO and endoscopy at Ingree and Ot of Weamanshy Medical Center on April 28."
# Build the tokenizer from the vocab file written earlier (VOC_FNAME).
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
# Last expression of the cell: the token list is shown as the cell output.
bert_tokenizer.tokenize(testcase)

['essent',
 '##ial',
 '##ly',
 '[UNK]',
 'mr',
 '[UNK]',
 'cor',
 '##ne',
 '##a',
 'is',
 'a',
 '60',
 'year',
 'old',
 'mal',
 '##e',
 'who',
 'noted',
 'the',
 'on',
 '##set',
 'of',
 'dark',
 'urine',
 'during',
 'early',
 '[UNK]',
 '[UNK]',
 'he',
 'under',
 '##went',
 'ct',
 'and',
 'ercp',
 'at',
 'the',
 'lis',
 '##ona',
 '##tem',
 '##i',
 'fa',
 '##yl',
 '##and',
 '##sb',
 '##urg',
 '##nic',
 '[UNK]',
 'commun',
 '##ity',
 'hospital',
 'with',
 'a',
 'stent',
 'placement',
 'and',
 'resolution',
 'of',
 '[UNK]',
 '[UNK]',
 'he',
 'under',
 '##went',
 'an',
 'echo',
 'and',
 'endoscop',
 '##y',
 'at',
 'in',
 '##g',
 '##re',
 '##e',
 'and',
 'ot',
 'of',
 'wea',
 '##man',
 '##sh',
 '##y',
 'medical',
 'center',
 'on',
 'ap',
 '##r',
 '##il',
 '28',
 '[UNK]']

The vocab file has been created, so we can now generate the pre-training data with create_pretraining_data.py.

In [10]:
# Settings for generating the pre-training data.
# NOTE(review): none of these are consumed in the visible cells; presumably
# they are forwarded to create_pretraining_data.py - confirm.
MAX_SEQ_LENGTH = 50 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}

# Directory that will hold the generated pre-training data.
PRETRAINING_DIR = "pretraining_data_dir" #@param {type:"string"}

In [12]:
# Create the directory named by PRETRAINING_DIR via TensorFlow's gfile API.
tf.gfile.MkDir(PRETRAINING_DIR)

In [14]:
# Directory for the model artifacts; the BERT config and a copy of the
# vocabulary are written into it below.
MODEL_DIR = "from_scratch_BERT"
tf.gfile.MkDir(MODEL_DIR)

In [15]:
# use this for BERT-base
# The vocabulary size is tied to VOC_SIZE so the config matches the
# vocabulary built above; keys are kept in the same order as before.
bert_base_config = dict(
  attention_probs_dropout_prob=0.1,
  directionality="bidi",
  hidden_act="gelu",
  hidden_dropout_prob=0.1,
  hidden_size=768,
  initializer_range=0.02,
  intermediate_size=3072,
  max_position_embeddings=512,
  num_attention_heads=12,
  num_hidden_layers=12,
  pooler_fc_size=768,
  pooler_num_attention_heads=12,
  pooler_num_fc_layers=3,
  pooler_size_per_head=128,
  pooler_type="first_token_transform",
  type_vocab_size=2,
  vocab_size=VOC_SIZE,
)

# Persist the configuration as pretty-printed JSON inside MODEL_DIR.
config_path = "{}/bert_config.json".format(MODEL_DIR)
with open(config_path, "w") as fo:
  json.dump(bert_base_config, fo, indent=2)
  
# Store a copy of the vocabulary next to the model config, one token per line.
# The encoding is pinned to UTF-8: the tokens were read from a UTF-8 file and
# may be non-ASCII, so the platform default encoding could fail or mis-encode.
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")