<a href="https://colab.research.google.com/github/sahandv/scioscipred/blob/master/BERT_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT WORD EMBEDDING

Credits to https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b

In [1]:
!pwd
import sys
import os
print('env:',sys.executable)
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

/home/sahand/Projects/science_science
env: /home/sahand/anaconda3/envs/tf-1/bin/python


### Mount Google Drive
If on colab:

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!ls 'drive/My Drive/Data/1990-2018 corpus sentences abstract-title'

### Install requirements

In [None]:
!rm -rf bert
!git clone https://github.com/google-research/bert

In [None]:
!pip install tqdm

### Import requirements

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import time
import codecs
import collections
import json
import re

import pprint
import numpy as np
import tensorflow as tf

from BERT import modeling
from BERT import tokenization

import pandas as pd
from tqdm import tqdm

If on colab:

In [None]:
from google.colab import files
sys.path.append('bert/')

### Make sure colab TPU is being used
If on colab:

In [None]:
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

In [None]:
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

### Select which BERT model we want to use

In [4]:
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'scibert_scivocab_uncased'
BERT_PRETRAINED_DIR = '/home/sahand/Projects/science_science/BERT_weights/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: /home/sahand/Projects/science_science/BERT_weights/scibert_scivocab_uncased *****
/bin/bash: gsutil: command not found


### Define some global parameters

In [5]:
LAYERS = [-1,-2,-3,-4]
NUM_TPU_CORES = 272
MAX_SEQ_LENGTH = 500
BERT_CONFIG = BERT_PRETRAINED_DIR + '/bert_config.json'
CHKPT_DIR = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
VOCAB_FILE = BERT_PRETRAINED_DIR + '/vocab.txt'
INIT_CHECKPOINT = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
BATCH_SIZE = 64

### Define wrapper classes for the input before processing and after processing

In [6]:
class InputExample(object):

  def __init__(self, unique_id, text_a, text_b=None):
    self.unique_id = unique_id
    self.text_a = text_a
    self.text_b = text_b
    
class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    self.unique_id = unique_id
    self.tokens = tokens
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.input_type_ids = input_type_ids

### Set up our Estimator

The estimator is a wrapper around the BertModel

In [7]:
def input_fn_builder(features, seq_length):
  """Creates an `input_fn` closure to be passed to TPUEstimator."""

  all_unique_ids = []
  all_input_ids = []
  all_input_mask = []
  all_input_type_ids = []

  for feature in features:
    all_unique_ids.append(feature.unique_id)
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_input_type_ids.append(feature.input_type_ids)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    num_examples = len(features)

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "unique_ids":
            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_type_ids":
            tf.constant(
                all_input_type_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
    })

    d = d.batch(batch_size=batch_size, drop_remainder=False)
    return d

  return input_fn
  
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    # The model is defined here
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_id": unique_ids,
    }

    # Tredictions from the top 4 layers are extracted here
    for (i, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

### Transform InputExamples to InputFeatures

In [8]:
def convert_examples_to_features(examples, seq_length, tokenizer):
  """Loads a data file into a list of `InputBatch`s."""

  features = []
  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
      tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
      # Modifies `tokens_a` and `tokens_b` in place so that the total
      # length is less than the specified length.
      # Account for [CLS], [SEP], [SEP] with "- 3"
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
      # Account for [CLS] and [SEP] with "- 2"
      if len(tokens_a) > seq_length - 2:
        tokens_a = tokens_a[0:(seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
      tokens.append(token)
      input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
      for token in tokens_b:
        tokens.append(token)
        input_type_ids.append(1)
      tokens.append("[SEP]")
      input_type_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
      input_ids.append(0)
      input_mask.append(0)
      input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
      tf.logging.info(
          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))
  return features

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

### Converting a normal string sequence to InputExample

In [9]:
def read_sequence(input_sentences):
  examples = []
  unique_id = 0
  for sentence in input_sentences:
    line = tokenization.convert_to_unicode(sentence)
    examples.append(InputExample(unique_id=unique_id, text_a=line))
    unique_id += 1
  return examples

### Processing functions for inputs

A function which accepts *an array of strings* as a parameter and the desired *dimension (max 768) of the embedding output* and returns a dictionary with the token as key and the embedding vector as value.

The function builds the estimator and invokes a prediction based on the given inputs.

In [12]:
def get_features(input_text, dim=768,output_labels=None,output_file='embeddings.json',
             start_from_idx=0,use_TPU=False,doc_embedding=True):
    
    layer_indexes = LAYERS
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
    tokenizer = tokenization.FullTokenizer(
      vocab_file=VOCAB_FILE, do_lower_case=True)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
    run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

    examples = read_sequence(input_text)
    print("example len is ",len(examples))

    features = convert_examples_to_features(
      examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)
    print('\nsample features are:',features[1].tokens)

    
    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    print("features len is ",len(features))
    username = input("Press return to continue...")
    
    model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      layer_indexes=layer_indexes,
      use_tpu=use_TPU,
      use_one_hot_embeddings=True)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=use_TPU,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=BATCH_SIZE,
      train_batch_size=BATCH_SIZE)

    input_fn = input_fn_builder(
      features=features, seq_length=MAX_SEQ_LENGTH)

    # Get features
    output_dict = {}
    all_embeddings = []
    counter = 0
    for result in estimator.predict(input_fn, yield_single_examples=True):
        unique_id = int(result["unique_id"])
        feature = unique_id_to_feature[unique_id]
        output = collections.OrderedDict()
        sentence_dict = {}
        for (i, token) in enumerate(feature.tokens):
            layers = []
            for (j, layer_index) in enumerate(layer_indexes):
                layer_output = result["layer_output_%d" % j]
                layer_output_flat = np.array([x for x in layer_output[i:(i + 1)].flat])
                layers.append(layer_output_flat)
                
            if doc_embedding==True:
                if token == '[CLS]':
                    output[token] = list(sum(layers)[:dim]) # You can also concatenate or average layers here
                    sentence_dict.update({token:str(output[token])})
            else:
                output[token] = list(sum(layers)[:dim]) # You can also concatenate or average layers here
                sentence_dict.update({token:str(output[token])})
                
        output_dict[str(counter+start_from_idx)] = {'article':str(output_labels[counter]),'vectors':sentence_dict}
        counter += 1

    print('estimator is finalized and is at sentence', counter)
    print('witing to disk...')

    with open(output_file, 'w') as json_file:
        json.dump(output_dict, json_file)

    print('Finished writing embeddings with len',len(output_dict))

    return True

In [None]:
def get_features(input_text, dim=768,output_labels=None,output_file='embeddings.json',
                 start_from_idx=0,use_TPU=False):
  layer_indexes = LAYERS

  bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=VOCAB_FILE, do_lower_case=True)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

  examples = read_sequence(input_text)
  print("example len is ",len(examples))

  features = convert_examples_to_features(
      examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)

  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  print("features len is ",len(features))

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      layer_indexes=layer_indexes,
      use_tpu=use_TPU,
      use_one_hot_embeddings=True)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=use_TPU,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=BATCH_SIZE,
      train_batch_size=BATCH_SIZE)

  input_fn = input_fn_builder(
      features=features, seq_length=MAX_SEQ_LENGTH)

  # Get features
  output_dict = {}
  all_embeddings = []
  counter = 0
  for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    feature = unique_id_to_feature[unique_id]
    output = collections.OrderedDict()
    sentence_dict = {}
    for (i, token) in enumerate(feature.tokens):
      layers = []
      for (j, layer_index) in enumerate(layer_indexes):
        layer_output = result["layer_output_%d" % j]
        layer_output_flat = np.array([x for x in layer_output[i:(i + 1)].flat])
        layers.append(layer_output_flat)
      output[token] = list(sum(layers)[:dim]) # You can also concatenate or average layers here 
      sentence_dict.update({token:str(output[token])})

    output_dict[str(counter+start_from_idx)] = {'article':str(output_labels[counter]),'vectors':sentence_dict}
    counter += 1
    
  print('estimator is finalized and is at sentence', counter)
  print('witing to disk...')

  with open(output_file, 'w') as json_file:
    json.dump(output_dict, json_file)

  print('Finished writing embeddings with len',len(output_dict))

  return True

### Run the model and get the embeddings

In [11]:
data = pd.read_csv('/mnt/16A4A9BCA4A99EAD/GoogleDrive/Data/Corpus/KPRIS/clean/abstract_title deflemm',names=['sentence'])
data

Unnamed: 0,sentence
0,lighting system disclosed comprising plurality...
1,photographic device eliminates photometric err...
2,print demand camera system camera unit incorpo...
3,simplified camera mechanism method allowing op...
4,beam splitter digital camera split light passe...
...,...
19705,display engine video graphic system includes p...
19706,facilitating interaction may enabled communica...
19707,present invention generally directed system me...
19708,disclosed system producing image including app...


In [13]:
tic = time.clock()

"""
period              start_from_idx
1990-2004           0
2005-2007           43482
2008-2010           17873+43482=61355
2011-2013           17873+43482+24613=85968
2014-2016           17873+43482+24613+26560=112528
2017-2018           17873+43482+24613+26560+40386=152914
"""
TPU_ADDRESS = ''
get_features(data['sentence'][:].values.tolist(),
  output_file = '/mnt/16A4A9BCA4A99EAD/tmp bert results/[CLS] results.json',
  output_labels = list(data.index),
  start_from_idx=0)

# get_features(data['sentence'][70000:140000].values.tolist(), dim=300,
#   output_file = 'drive/My Drive/Data/1990-2018 corpus sentences abstract-title - vectors 300 - 70-140k.json',
#   output_labels = data['article_index'][70000:140000].values.tolist(),start_from_idx=70000)

# get_features(data['sentence'][140000:].values.tolist(), dim=300,
#   output_file = 'drive/My Drive/Data/1990-2018 corpus sentences abstract-title - vectors 300 - 140-205k.json',
#   output_labels = data['article_index'][140000:].values.tolist(),start_from_idx=140000)

toc = time.clock()
print('\nTime of process for all sentences:',toc-tic)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



  """Entry point for launching an IPython kernel.


example len is  19710
INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] lighting system disclosed comprising plural ##ity image projection lighting device including first image projection lighting device central controller communication system permit central controller communicate plural ##ity image projection lighting device first image projection lighting device comprised camera light valve wherein light valve used project first projected image onto projection surface camera first image projection light device capture least portion first projected image pr [SEP]
INFO:tensorflow:input_ids: 102 16703 429 22118 12201 22533 208 1572 6815 16703 3264 1471 705 1572 6815 16703 3264 2435 4919 2498 429 9362 2435 4919 11160 22533 208 1572 6815 16703 3264 705 1572 6815 16703 3264 10508 5859 2011 9042 19749 2011 9042 501 2575 705 11643 1572 5095 6815 1437 5859 705 1572 6815 2011 3264 4083 1823 6492 705 11643 1572 492 103 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  nam

INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKP

INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO



In [None]:
files.download('drive/My Drive/Data/1990-2018 corpus sentences abstract-title - vectors - all')

In [None]:
print(data.head(5))
print(data[-10:].values.tolist())

In [None]:
!ls

Restart kernel here to free up space

In [1]:
import json
import pandas as pd
output_file = '/mnt/16A4A9BCA4A99EAD/tmp bert results/[CLS] results.json'
with open(output_file) as f:
    results = json.loads(f.read())
results_clean = [results[x]['vectors']['[CLS]'][1:-1].split(', ') for x in results]
print(len(results_clean))
print(len(results_clean[0]))

In [3]:
results_clean = pd.DataFrame(results_clean)
results_clean.to_csv('/mnt/16A4A9BCA4A99EAD/tmp bert results/[CLS] results.csv',index=False)

Read the CSV

In [4]:
import pandas as pd
results = pd.read_csv('/mnt/16A4A9BCA4A99EAD/tmp bert results/[CLS] results.csv')

In [5]:
results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,2.093991,0.481889,0.936157,0.362795,-1.231043,-0.625693,-3.721168,-6.064877,0.714241,-0.606902,...,0.900749,-0.727135,-4.898857,-1.018077,0.680102,1.727248,-0.107641,1.598622,1.403962,-0.623033
1,0.463815,-0.394525,0.873828,-1.372293,-0.601543,1.371147,-1.653654,-3.728278,1.308728,-1.877245,...,0.523725,-0.375501,-4.733455,-1.548404,0.737615,0.732650,-1.483686,1.427930,-0.722826,0.316824
2,-1.444905,-0.496883,0.143387,-0.016598,-0.548146,0.161970,-3.092156,-3.658966,1.391729,-0.732584,...,0.114351,-1.909591,-4.020740,-3.141164,-0.884360,2.126058,-2.103223,1.011418,1.328651,0.043055
3,0.129338,1.244845,-0.099512,-0.016150,-2.191903,0.672146,-3.541779,-2.563775,0.481966,-1.218525,...,1.030245,-2.337068,-5.826959,-1.795447,0.804469,2.098383,-1.806895,-1.718628,0.792790,-1.045472
4,-0.203263,1.313853,2.369816,0.086279,-0.161791,0.479322,-3.706724,-3.451765,1.136608,0.958250,...,1.516233,-0.151334,-3.595501,-0.376670,-0.894569,1.587983,0.075020,0.405111,-0.419986,-2.057433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19705,-0.016106,-0.246208,-0.047454,-2.847529,0.080989,0.421679,-0.905605,-5.266561,2.162794,0.462337,...,1.059816,1.197620,-4.490421,1.088950,-0.083379,0.621057,-2.521836,1.762247,-0.572201,0.205469
19706,3.810331,2.579611,1.169846,-1.085377,-1.651331,-0.098971,-2.075445,-4.282493,3.299706,-3.023206,...,-1.179699,-3.143414,-3.506091,0.881107,-0.778475,1.073418,-0.001388,1.839229,-0.939617,-1.775609
19707,1.894238,1.973035,1.086454,-1.864109,-0.780507,-1.720195,-5.610694,-6.171289,0.486777,0.371356,...,-0.071672,-0.702177,-4.376399,0.603304,-1.550666,3.675741,-1.259607,0.434585,-1.581164,0.000677
19708,-0.894446,-1.722048,1.292111,-1.467540,1.039447,0.882132,-1.665698,-3.301647,1.404132,-0.586678,...,2.221418,1.048154,-5.086555,-2.768722,-0.346986,0.141288,-4.304899,0.077840,1.744139,-1.221411
