# Work on embeddings

Objective: Take Podcast Descriptions and create embeddings from them.  Create an application that allows a user to enter their own text and use cosine similarity to find the most similar show descriptions (or episodes based on episode description).  


In [221]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import pickle
import tqdm
sns.set_style('darkgrid')

In [222]:
# Load the episode-level metadata table (tab-separated).
df = pd.read_csv('../../metadata_with_episode_dates_and_category.tsv', sep='\t')

# Parse release dates. The result is assigned back directly: the previous
# `.reset_index(drop=True)` on the parsed Series was a no-op for a default
# index and would have misaligned the dates for any other index.
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d')

# Keep only rows where every field used later in the notebook is present.
df = df.dropna(subset=['release_date', 'category', 'show_description', 'show_name'])
df.shape

(90871, 18)

In [223]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import yaml
import numpy as np

In [224]:
# Derive both counts straight from `df` so this cell works on a fresh kernel.
# It used to read `show_descriptions` / `episode_descriptions`, which are only
# created in later cells.
# A "show" is a unique (show_name, show_description) pair; every row of `df`
# is one episode.
num_shows = len(df.drop_duplicates(['show_name', 'show_description']))
num_episodes = len(df)
print(f"Shows: {num_shows}, Episodes: {num_episodes}")

Shows: 15857, Episodes: 90871


In [225]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of `model_output.last_hidden_state` over axis 1.

    Args:
        model_output: object exposing a rank-3 `last_hidden_state` tensor
            (assumed layout: batch, tokens, hidden -- TODO confirm).
        attention_mask: rank-2 mask; it is expanded and tiled along a new last
            axis to the hidden width before being applied.

    Returns:
        Tensor with axis 1 reduced away: the masked sum divided by the mask sum.
    """
    hidden_states = model_output.last_hidden_state
    hidden_width = hidden_states.shape[-1]

    # Give the mask a trailing axis and repeat it so it lines up element-wise
    # with the hidden states.
    mask = tf.expand_dims(attention_mask, -1)
    mask = tf.tile(mask, [1, 1, hidden_width])
    mask = tf.cast(mask, tf.float32)

    masked_sum = tf.math.reduce_sum(hidden_states * mask, 1)
    # Floor the denominator so an all-zero mask row cannot divide by zero.
    mask_total = tf.math.maximum(tf.math.reduce_sum(mask, 1), 1e-9)
    return masked_sum / mask_total


#Encode text
def encode(texts):
    """Turn text into L2-normalised, mean-pooled embeddings.

    Uses the module-level `tokenizer` and `model`.

    Args:
        texts: a string or list of strings, passed straight to `tokenizer`.

    Returns:
        TensorFlow tensor of pooled embeddings, normalised along axis 1
        (presumably one row per input text -- TODO confirm).
    """
    # Tokenize with padding and truncation enabled, returning TF tensors.
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')

    # Run the transformer to get per-token hidden states.
    outputs = model(**tokens, return_dict=True)

    # Collapse token states into one vector per row, honouring the mask.
    pooled = mean_pooling(outputs, tokens['attention_mask'])

    # Unit-length rows make a plain dot product equal to cosine similarity.
    return tf.math.l2_normalize(pooled, axis=1)

# Load tokenizer and model from HuggingFace Hub (same checkpoint for both)
MODEL_NAME = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModel.from_pretrained(MODEL_NAME)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6-cos-v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


# Example from Huggingface
https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1

In [226]:
# Query and candidate documents to rank against it
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

# Embed the query and the documents
query_emb = encode(query)
doc_emb = encode(docs)

# Dot product of the query row against every document row.
# `encode` L2-normalises its output, so this is cosine similarity.
similarity = query_emb @ tf.transpose(doc_emb)
scores = similarity[0].numpy().tolist()

# Pair each document with its score, highest score first
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

# Output passages & scores
# NOTE(review): the "Query:" label below actually prefixes the document text.
for doc, score in doc_score_pairs:
    print(f"Score: {score:.2f}, Query: {doc}.")

Score: 0.92, Query: Around 9 Million people live in London.
Score: 0.49, Query: London is known for its financial district.


# Benchmarks

## Objective: Benchmark Performing Comparisons

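Benchmark how long it takes to compare a single query embedding against a large matrix of embeddings (a random 100,000 × 384 matrix standing in for all show/episode description embeddings).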

In [227]:
# Embed an arbitrary query to use as the left-hand side of the comparison.
randomquery_raw = "Random Query"
random_query = encode(randomquery_raw)

# Random stand-in for a corpus of stored embeddings: 100000 rows of width 384
# (384 matches `embedding_size` used later in this notebook).
corpus_shape = (100000, 384)
large_embed = tf.constant(np.random.random(corpus_shape), dtype=tf.float32)

# Time the dot product together with the conversion to a Python list.
start = time.time()
scores = (random_query @ tf.transpose(large_embed))[0].numpy().tolist()
end = time.time()
elapsed = end - start
print(f"{elapsed:.2f}s to compare a worst cast dot product with 100000 rows.")

0.35s to compare a worst cast dot product with 100000 rows.


## Objective: Benchmark Making Show Embeddings

In [228]:
# One row per unique (show_name, show_description) pair, re-indexed from 0.
df_filtered = (
    df[['show_name', 'show_description']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Plain Python list of the description text, in the same row order.
show_descriptions = df_filtered['show_description'].tolist()

In [229]:
# Note: on show description embedding size of 1000, my macbook ran out of memory.
# Time `encode` on progressively larger batches of show descriptions.
# `t` collects one elapsed-seconds entry per batch size, in the order tried.
t = []
for batch_len in (1, 3, 10, 30, 100, 300):
    start = time.time()
    shows_embeddings = encode(show_descriptions[:batch_len])
    elapsed = time.time() - start
    t.append(elapsed)
    print(f"{elapsed:.2f}s: {batch_len} Show Embedding(s)")

0.08s: 1 Show Embedding(s)
0.09s: 3 Show Embedding(s)
0.12s: 10 Show Embedding(s)
0.27s: 30 Show Embedding(s)
1.10s: 100 Show Embedding(s)
9.62s: 300 Show Embedding(s)


In [230]:
# Batch sizes tried in the timing loop (must stay in the same order as `t`).
embedding_size_trials = np.array([1, 3, 10, 30, 100, 300])

# Throughput for each trial = batch size / seconds taken.
embeddings_per_sec = embedding_size_trials / np.asarray(t)

# The trial with the highest throughput is the batch size to use.
best_index = embeddings_per_sec.argmax()
best_size = embedding_size_trials[best_index]
print(f"Best Show Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")


Best Show Embeddings size: 30.  Maximizes Embeddings Per Second on my local machine


In [231]:
# Estimated time to embed every unique show at the best throughput measured
# above: number of shows / (embeddings per second) / 60 seconds per minute.
total_minutes = len(df_filtered) / embeddings_per_sec[best_index] / 60
print(f"{total_minutes:.2f} minutes to create all show embeddings.")

2.39 minutes to create all show embeddings.


## Objective: Benchmark Making Episode Embeddings

In [232]:
# Unique shows (same frame as in the show benchmark), re-indexed from 0.
df_filtered = (
    df[['show_name', 'show_description']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Every row of `df` is an episode, so take the descriptions from the full frame.
episode_descriptions = df['episode_description'].tolist()

In [233]:
# Note (from the show benchmark): a batch of 1000 show descriptions ran out of
# memory on my macbook.
# Time `encode` on progressively larger batches of episode descriptions.
# `t` collects one elapsed-seconds entry per batch size, in the order tried.
t = []
for batch_len in (1, 3, 10, 30, 100):
    start = time.time()
    episode_embeddings = encode(episode_descriptions[:batch_len])
    elapsed = time.time() - start
    t.append(elapsed)
    print(f"{elapsed:.2f}s: {batch_len} Episode Embedding(s)")

0.16s: 1 Episode Embedding(s)
0.12s: 3 Episode Embedding(s)
0.20s: 10 Episode Embedding(s)
0.76s: 30 Episode Embedding(s)
4.25s: 100 Episode Embedding(s)


In [234]:
# Batch sizes tried in the episode timing loop (same order as `t`).
embedding_size_trials = np.array([1, 3, 10, 30, 100])

# Throughput for each trial = batch size / seconds taken.
embeddings_per_sec = embedding_size_trials / np.asarray(t)

# The trial with the highest throughput is the batch size to use.
best_index = embeddings_per_sec.argmax()
best_size = embedding_size_trials[best_index]
print(f"Best Episode Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")

Best Episode Embeddings size: 10.  Maximizes Embeddings Per Second on my local machine


In [235]:
# Estimated time to embed every episode at the best throughput measured
# above: number of episodes / (embeddings per second) / 60 seconds per minute.
total_minutes = len(df) / embeddings_per_sec[best_index] / 60
print(f"{total_minutes:.2f} minutes to create all episode embeddings.")

30.56 minutes to create all episode embeddings.


# Test Code for Generating Embeddings

In [236]:
# Frame to embed: `show_name` is the text key, `show_description` is the text
# that gets embedded. One row per unique (name, description) pair.
df_filtered = (
    df[['show_name', 'show_description']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Next steps: split the frame into blocks, call encode on each block,
# and save each block's results in a systematic way.

In [239]:
# Block size
block_size = 30

# Walk over `df_filtered` in consecutive blocks of at most `block_size` rows.
num_rows = len(df_filtered)
start_index = 0
counter = 0  # number of blocks visited
while start_index < num_rows:
    # The last block may be shorter than `block_size`.
    end_index = min(start_index + block_size, num_rows)
    # Slice the same frame the bounds were computed from. This previously
    # sliced `df`, the episode-level frame, which selected the wrong rows.
    subset_df = df_filtered.iloc[start_index:end_index]

    # TODO: apply the encode function to the current block and save it, e.g.
    #   encode_function(subset_df)
    #   save_data(subset_df)
    start_index = end_index
    counter += 1

In [240]:
# Example of the on-disk layout for saved embeddings: one pickle per block.
embedding_size = 384
block_size = 30
num_blocks = 3
num_padding = 5  # zero-padded width of the block number in the file name
files = []
for block_idx in range(num_blocks):
    # Fake payload: every embedding value in block k equals k, so blocks are
    # easy to tell apart after loading.
    block_payload = {
        "block": block_idx,
        "show_names": ['show1', 'show2', 'show3'] * 10,
        "show_desc_embeddings": np.full((block_size, embedding_size), float(block_idx)),
    }
    filename = f"{block_idx:0{num_padding}}_data.pkl"
    files.append(filename)
    # Save to disk (current working directory)
    with open(filename, 'wb') as out_file:
        pickle.dump(block_payload, out_file)

In [244]:
def extract_shows(filenames):
    """
    Load pickled embedding blocks and combine them into one matrix and one list.

    input args:
        filenames - iterable of paths to pickle files. Each file must hold a dict
                    with keys 'show_desc_embeddings' (2-D array-like) and
                    'show_names' (list of strings).
    return args:
        embedding_matrix - numpy array with the blocks stacked row-wise in the
                           order the files were given; for equally sized blocks
                           the shape is (block_size x num_blocks, embedding_size)
        list_of_shows - list of strings, the show names from every block in the
                        same order (len block_size x num_blocks)
    raises:
        ValueError - if `filenames` is empty (there is nothing to stack).
    """
    filenames = list(filenames)
    if not filenames:
        # np.vstack would also raise ValueError here, but with an opaque message.
        raise ValueError("extract_shows needs at least one file to load.")

    list_of_tensors = []
    list_of_shows = []
    for file in filenames:
        # Load from disk.
        # NOTE: pickle.load can execute arbitrary code; only pass files that
        # were written by this notebook or another trusted source.
        with open(file, 'rb') as f:
            loaded_data = pickle.load(f)
        list_of_tensors.append(loaded_data['show_desc_embeddings'])
        list_of_shows.extend(loaded_data['show_names'])

    embedding_matrix = np.vstack(list_of_tensors)
    return embedding_matrix, list_of_shows

# Smoke test: reload the example blocks and check the combined sizes.
a, b = extract_shows(filenames=files)
print(a.shape, len(b))

(90, 384) 90


# 