In [None]:
from datetime import datetime, timedelta

# Notebook parameters

In [None]:
# Storage locations. Every folder path ends with '/', so file names are
# appended to them directly by the cells that build source paths.
model_folder = 'gs://dsart_nearline1/models/'
model_file = 'nn_transformer_202405'
source_folder1 = 'gs://dsart_nearline1/pipelines/embeds/'
source_folder2 = 'gs://dsart_nearline1/pipelines/samples/100k/'
target_folder = 'gs://dsart_nearline1/pipelines/gambit1/'

# Default run date: the previous calendar day on the local clock, as YYYY-MM-DD.
day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

In [None]:
# Echo the effective parameter values so they appear in the cell output.
(
    day,
    model_folder,
    model_file,
    source_folder1,
    source_folder2,
    target_folder,
)

# Dependencies

In [None]:
# Use the %pip magic rather than a `!pip` shell escape: %pip installs into the
# environment of the running kernel, whereas `!pip` runs whichever pip happens
# to be first on the shell PATH. The version stays pinned for reproducibility.
%pip install "tensorflow==2.11.0"

In [None]:
import os
import pandas
import tensorflow as tf
import time
import numpy
import json
from dotenv import load_dotenv

In [None]:
# Load environment variables through python-dotenv (it looks for a .env file
# by default).
load_dotenv()

# Keep TensorFlow's logger quiet: only messages at ERROR level or above pass.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# Prepare Data

## Load embeddings

In [None]:
# Embeddings for `day` are stored as <day>_embeds.npy under source_folder1.
file1 = ''.join((day, '_embeds.npy'))
file1_source = ''.join((source_folder1, file1))
file1_source

In [None]:
# Copy the embeddings file from its gs:// source into the current working
# directory; {file1_source} is interpolated from the Python variable.
! gsutil cp {file1_source} .

In [None]:
# Read the locally copied .npy file into memory, then show its dimensions.
embeds = numpy.load(file=file1)
numpy.shape(embeds)

## Load CSV

In [None]:
# The sample table for `day` is stored as <day>_df.csv under source_folder2.
file2 = ''.join((day, '_df.csv'))
file2_source = ''.join((source_folder2, file2))
file2_source

In [None]:
# Copy the CSV from its gs:// source into the current working directory;
# {file2_source} is interpolated from the Python variable.
! gsutil cp {file2_source} .

In [None]:
# Parse the local CSV copy. Only '\n' is treated as a row terminator.
# NOTE(review): presumably so carriage returns inside text fields do not
# split rows; confirm against the file's producer.
df0 = pandas.read_csv(filepath_or_buffer=file2, lineterminator='\n')
df0

## Load Model

In [None]:
# The model is stored as two objects sharing one base name:
# <model_file>.h5 and <model_file>.json.
model_source_h5 = ''.join((model_folder, model_file, '.h5'))
model_source_meta = ''.join((model_folder, model_file, '.json'))
model_source_h5

In [None]:
# Fetch both model files (.h5 and .json) into the current working directory
# with a single gsutil invocation.
! gsutil cp {model_source_h5} {model_source_meta} .

In [None]:
# Load the model's JSON metadata from the local copy. Its 'questions',
# 'categories', 'topics' and 'encoder' entries are used as column labels for
# the prediction frames.
# The encoding is pinned to UTF-8 (the JSON standard) so decoding does not
# depend on the machine's locale default.
with open(model_file + '.json', encoding='utf-8') as f:
    meta = json.load(f)
meta

In [None]:
# Restore the Keras model from the local .h5 copy and print its layer summary.
model = tf.keras.models.load_model(''.join((model_file, '.h5')))
model.summary()

# Run Model

In [None]:
# Run inference over the full embeddings array. The result is indexed by
# position (0, 1, 2, 4) in the next cell.
preds = model.predict(x=embeds)

In [None]:
# Pick the model outputs used downstream, by position.
# NOTE(review): index 3 is skipped; confirm against the model's output order
# that this is intentional.
preds_q, preds_cats, preds_topics, preds_auto = (preds[i] for i in (0, 1, 2, 4))
tuple(p.shape for p in (preds_q, preds_cats, preds_topics, preds_auto))

In [None]:
# Label the first output's columns with the names listed under meta['questions'].
df_q = pandas.DataFrame(data=preds_q, columns=meta['questions'])
df_q.describe()

In [None]:
# Label the second output's columns with the names listed under meta['categories'].
df_cats = pandas.DataFrame(data=preds_cats, columns=meta['categories'])
df_cats.describe()

In [None]:
# Label the third output's columns with the names listed under meta['topics'].
df_topics = pandas.DataFrame(data=preds_topics, columns=meta['topics'])
df_topics.describe()

In [None]:
# Label the output taken from position 4 with the names listed under meta['encoder'].
df_encoder = pandas.DataFrame(data=preds_auto, columns=meta['encoder'])
df_encoder.describe()

In [None]:
# Attach every prediction frame to the input rows, column-wise.
# DataFrame.join aligns on the index, so a row-count mismatch between the CSV
# and the predictions would silently produce NaN-padded or dropped rows.
# Fail loudly instead of writing a misaligned file.
assert len(df0) == len(df_q) == len(df_cats) == len(df_topics) == len(df_encoder), (
    'row count mismatch: '
    f'df0={len(df0)}, df_q={len(df_q)}, df_cats={len(df_cats)}, '
    f'df_topics={len(df_topics)}, df_encoder={len(df_encoder)}'
)
df = df0.join(df_q).join(df_cats).join(df_topics).join(df_encoder)
df

# Save output and upload to Google Cloud Storage

In [None]:
# The result file is named after the run date: <day>.csv.
output_file = ''.join((day, '.csv'))
output_file

In [None]:
# Write the combined table without the index column; floating-point values are
# formatted with two decimals ('%.2f').
df.to_csv(path_or_buf=output_file, float_format='%.2f', index=False)

In [None]:
# Upload the result CSV from the working directory to target_folder; both
# values are interpolated from the Python variables.
! gsutil cp {output_file} {target_folder}