In [None]:
from datetime import datetime, timedelta

# Notebook parameters

In [None]:
# Day to process: yesterday's date on the local clock, formatted as YYYY-MM-DD.
sample_day = (datetime.now() - timedelta(days=1)).date().isoformat()

# Folder the daily sample CSV is read from, and folder the embeddings are written to.
source_folder, target_folder = (
    'gs://dsart_nearline1/pipelines/samples/100k/',
    'gs://dsart_nearline1/pipelines/embeds/',
)

In [None]:
# Show the resolved sample date.
sample_day

# Dependencies

In [None]:
! pip install "tensorflow==2.11.0"

In [None]:
! pip install "tensorflow-text==2.11.0"

In [None]:
! pip install "tensorflow-hub==0.11.0"

In [None]:
import os
import pandas
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import time
import numpy
import json
from dotenv import load_dotenv

In [None]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Only let TensorFlow log messages at ERROR level or above.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# Load dataframe

In [None]:
# Name of the day's sample CSV and its full path inside the source folder.
file1 = ''.join((sample_day, '_df.csv'))
file1_source = ''.join((source_folder, file1))
file1_source

In [None]:
# Copy the sample CSV from the source folder into the current working directory.
! gsutil cp {file1_source} .

In [None]:
# Read the downloaded sample CSV, treating only '\n' as the end of a row.
df = pandas.read_csv(filepath_or_buffer=file1, lineterminator='\n')
df

In [None]:
# The CSV is parsed with lineterminator='\n', so a file with CRLF line endings
# keeps the '\r' on the last field of every row: the header shows up as
# 'text\r' and each value in that column ends with '\r' as well.
if 'text\r' in df.columns:
    # Build a new frame instead of mutating in place so re-running is harmless.
    df = df.rename(columns={'text\r': 'text'})
    # Fix the values too, not only the header, so the stray carriage return
    # is not passed on as part of each post. Missing values stay missing.
    df['text'] = df['text'].str.rstrip('\r')

In [None]:
# Number of distinct values in the 'fid' column.
unique_fids = df['fid'].unique()
len(unique_fids)

In [None]:
# Display the dataframe again; by this point a 'text\r' column, if present, has been renamed to 'text'.
df

In [None]:
# TF Hub handles: the multilingual cased BERT preprocessing model and the
# multilingual cased BERT encoder it is paired with below.
model_preprocess, model_handle = (
    'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
)

In [None]:
# Wrap the preprocessing handle as a Keras layer; it is applied to raw text batches.
bert_preprocess_model = hub.KerasLayer(handle=model_preprocess)

In [None]:
# Wrap the encoder handle as a Keras layer; it consumes the preprocessing output.
bert_model = hub.KerasLayer(handle=model_handle)

In [None]:
# Number of posts sent through the BERT models per call.
batch_size = 256

In [None]:
def chunker(seq, size):
    """Split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any sliceable sequence with a length (list, str, ...).
        size: Maximum number of items per slice.

    Returns:
        A generator yielding the slices in order; the last one is shorter
        when ``len(seq)`` is not a multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
# Plain Python strings for the model input; non-string cells are passed through str().
posts = list(map(str, df['text'].tolist()))

In [None]:
# Run every post through the preprocessing and BERT layers one batch at a time
# and gather the "pooled_output" of each batch into a single array `embeds`.
batch_embeds = []    # one array per processed batch
n_batches = 0
n_posts_done = 0
t0 = time.time()
for chunk in chunker(posts, batch_size):
    preprocessed = bert_preprocess_model(chunk)
    bert_results = bert_model(preprocessed)
    # Convert the whole batch in one call instead of iterating the tensor row by row.
    batch_embeds.append(bert_results["pooled_output"].numpy())
    n_batches += 1
    # Count actual posts so a short final batch is not over-reported.
    n_posts_done += len(chunk)
    if n_batches % 10 == 0:
        print(n_posts_done, time.time() - t0)
# Joining the batches along the first axis gives one row per post, in input order.
# numpy.concatenate rejects an empty list, so fall back to an empty array when
# there were no posts at all.
embeds = numpy.concatenate(batch_embeds) if batch_embeds else numpy.array([])
print(embeds.shape)
t1 = time.time()
print('Done', t1-t0)

In [None]:
# Write the embeddings array to a local .npy file named after the sample day.
file2 = ''.join((sample_day, '_embeds.npy'))
numpy.save(file=file2, arr=embeds)

In [None]:
# Upload the saved embeddings file to the target folder.
! gsutil cp {file2} {target_folder}