# Work on embeddings

Objective: Take Podcast Descriptions and Benchmark Creating Embeddings.  


In [23]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import pickle
import tqdm
import os
sns.set_style('darkgrid')

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import yaml
import numpy as np

# Data Cleaning

In [15]:
df = pd.read_csv('../../metadata_with_episode_dates_and_category.tsv',sep='\t')
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d').reset_index(drop=True)
df = df[~df['release_date'].isna()]
df = df[~df['category'].isna()]
df = df[~df['show_description'].isna()]
df = df[~df['show_name'].isna()]
df = df[~df['episode_description'].isna()]
df = df[~df['episode_name'].isna()]
df.shape

(90706, 18)

In [224]:
print(f"Shows: {len(show_descriptions)}, Episodes: {len(episode_descriptions)}")

Shows: 15857, Episodes: 90871


# Create an Encoder

In [3]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = tf.cast(tf.tile(tf.expand_dims(attention_mask, -1), [1, 1, token_embeddings.shape[-1]]), tf.float32)
    return tf.math.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.math.maximum(tf.math.reduce_sum(input_mask_expanded, 1), 1e-9)


#Encode text
def encode(texts):
    # Tokenize sentences
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')

    # Compute token embeddings
    model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    embeddings = tf.math.l2_normalize(embeddings, axis=1)

    return embeddings

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
model = TFAutoModel.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

2023-11-23 15:54:45.452458: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-11-23 15:54:45.452480: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-11-23 15:54:45.452484: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-11-23 15:54:45.452632: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-23 15:54:45.452803: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6

# Benchmark Making Show Embeddings

In [24]:
df_shows = df.drop_duplicates(['show_name','show_description'])[['show_name','show_description']].reset_index(drop=True)
show_descriptions = list(df_shows['show_description'])

In [28]:
# Note: on show description embedding size of 1000, my macbook ran out of memory.
t = []
number_of_embeddings = [1, 3, 10, 30, 100]
for i in number_of_embeddings:
    start = time.time()
    shows_embeddings = encode(show_descriptions[:i])
    end = time.time()
    t.append(end-start)
    print(f"{end-start:.2f}s: {i} Show Embedding(s)")

0.08s: 1 Show Embedding(s)
0.04s: 3 Show Embedding(s)
0.08s: 10 Show Embedding(s)
0.24s: 30 Show Embedding(s)
1.04s: 100 Show Embedding(s)


In [29]:
# take the embedding size trials and then take the argmax of the Embeddings Per Second Metric
# which gives you the best index from number_of_embeddings.  this becomes the best_size.
embeddings_per_sec = np.array(number_of_embeddings) / np.array(t)
best_index = np.argmax(embeddings_per_sec)
best_size = embedding_size_trials[best_index]
print(f"Best Show Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")


Best Show Embeddings size: 30.  Maximizes Embeddings Per Second on my local machine


In [32]:
# To attempt embeddings for each show...
# create 30 embeddings at a time would take: 
print(f"Estimate {df_shows.shape[0] / embeddings_per_sec[best_index] / 60 :.2f} minutes to create all show embeddings.")

Estimate 2.12 minutes to create all show embeddings.


# Benchmark Making Episode Embeddings

In [33]:
episode_descriptions = list(df['episode_description'])

In [34]:
# Note: on show description embedding size of 1000, my macbook ran out of memory.
t = []
number_of_embeddings = [1, 3, 10, 30, 100]
for i in number_of_embeddings:
    start = time.time()
    episode_embeddings = encode(episode_descriptions[:i])
    end = time.time()
    t.append(end-start)
    print(f"{end-start:.2f}s: {i} Episode Embedding(s)")

0.19s: 1 Episode Embedding(s)
0.17s: 3 Episode Embedding(s)
0.24s: 10 Episode Embedding(s)
1.97s: 30 Episode Embedding(s)
4.68s: 100 Episode Embedding(s)


In [35]:
# take the embedding size trials and then take the argmax of the Embeddings Per Second Metric
# which gives you the best index from number_of_embeddings.  this becomes the best_size.
embeddings_per_sec = np.array(number_of_embeddings) / np.array(t)
best_index = np.argmax(embeddings_per_sec)
best_size = embedding_size_trials[best_index]
print(f"Best Episode Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")

Best Episode Embeddings size: 10.  Maximizes Embeddings Per Second on my local machine


In [37]:
# To attempt embeddings for each show...
# create 30 embeddings at a time would take: 
print(f"Estimate {df.shape[0] / embeddings_per_sec[best_index] / 60 :.2f} minutes to create all episode embeddings.")

Estimate 36.13 minutes to create all episode embeddings.
