# Work on embeddings

Objective: Take podcast descriptions and create embeddings from them. Build an application that lets a user enter their own text and uses cosine similarity to find the most similar shows by show description (or the most similar episodes by episode description).


In [334]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import pickle
import tqdm
import os
sns.set_style('darkgrid')

In [222]:
# Load the episode-level metadata; release dates are parsed as YYYY-MM-DD.
# read_csv already yields a fresh 0..n-1 index, so the parsed column is assigned directly.
df = pd.read_csv('../../metadata_with_episode_dates_and_category.tsv', sep='\t')
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d')

# Keep only rows where every column used downstream is present.
required_columns = ['release_date', 'category', 'show_description', 'show_name']
df = df.dropna(subset=required_columns)
df.shape

(90871, 18)

In [223]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import yaml
import numpy as np

In [224]:
# Report how many show and episode descriptions are available for embedding.
# NOTE(review): `show_descriptions` and `episode_descriptions` are assigned only in later
# cells of this notebook, so this cell raises NameError when run top-to-bottom on a fresh kernel.
print(f"Shows: {len(show_descriptions)}, Episodes: {len(episode_descriptions)}")

Shows: 15857, Episodes: 90871


In [225]:
# Mean pooling: average token embeddings, counting only positions the attention mask keeps
def mean_pooling(model_output, attention_mask):
    """
    Collapse per-token embeddings into one vector per sequence.

    input args:
        model_output - object exposing `last_hidden_state` (the per-token embeddings)
        attention_mask - rank-2 mask; it is expanded on a new last axis and tiled to the
                         embedding width so masked positions contribute zero
    return args:
        tensor holding, per sequence, the mask-weighted sum of the token embeddings
        divided by the mask total (floored at 1e-9 to avoid division by zero)
    """
    token_embeddings = model_output.last_hidden_state
    hidden_size = token_embeddings.shape[-1]

    # Broadcast the mask to the same layout as the token embeddings.
    mask = tf.expand_dims(attention_mask, -1)
    mask = tf.cast(tf.tile(mask, [1, 1, hidden_size]), tf.float32)

    weighted_sum = tf.math.reduce_sum(token_embeddings * mask, 1)
    mask_total = tf.math.maximum(tf.math.reduce_sum(mask, 1), 1e-9)
    return weighted_sum / mask_total


# Encode text into L2-normalized sentence embeddings
def encode(texts):
    """
    Embed one string or a list of strings with the notebook-level tokenizer and model.

    input args:
        texts - text (or list of texts) passed straight to `tokenizer`
    return args:
        embeddings - mean-pooled model output, L2-normalized along axis 1, so a dot
                     product between two rows equals their cosine similarity
    """
    # Tokenize with padding and truncation, returning TensorFlow tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')

    # Run the transformer and pool the token embeddings with the attention mask.
    outputs = model(**batch, return_dict=True)
    pooled = mean_pooling(outputs, batch['attention_mask'])

    return tf.math.l2_normalize(pooled, axis=1)

# Load tokenizer and model from the HuggingFace Hub; both must come from the same checkpoint.
_CHECKPOINT = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = TFAutoModel.from_pretrained(_CHECKPOINT)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6-cos-v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


# Example from Huggingface
https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1

In [226]:
# Query and the candidate passages to rank against it
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

# Embed both sides with the same encoder
query_emb = encode(query)
doc_emb = encode(docs)

# Dot product of the query with every document embedding
scores = (query_emb @ tf.transpose(doc_emb))[0].numpy().tolist()

# Pair each passage with its score, best match first
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

# Show the ranked passages
for doc, score in doc_score_pairs:
    print(f"Score: {score:.2f}, Query: {doc}.")

Score: 0.92, Query: Around 9 Million people live in London.
Score: 0.49, Query: London is known for its financial district.


# Benchmarks

## Objective: Benchmark Performing Comparisons

Benchmark embeddings comparison for all Show Descriptions

In [227]:
# Time one query against a synthetic corpus: 100000 random rows of width 384
# stand in for stored embeddings.
randomquery_raw = "Random Query"
random_query = encode(randomquery_raw)
large_embed = tf.constant(np.random.random((100000,384)), dtype=tf.float32)
# Only the dot product and its conversion to a Python list are inside the timed region;
# encoding the query and building the random matrix are excluded.
start = time.time()
scores = (random_query @ tf.transpose(large_embed))[0].numpy().tolist()
end = time.time()
print(f"{end-start:.2f}s to compare a worst cast dot product with 100000 rows.")

0.35s to compare a worst cast dot product with 100000 rows.


## Objective: Benchmark Making Show Embeddings

In [228]:
# One row per distinct (show_name, show_description) pair, re-indexed from 0.
df_filtered = df[['show_name', 'show_description']].drop_duplicates().reset_index(drop=True)
show_descriptions = df_filtered['show_description'].tolist()

In [229]:
# Time `encode` on growing batches of show descriptions.
# Note: on show description embedding size of 1000, my macbook ran out of memory.
# `t` collects the elapsed seconds per batch size, in the same order as the sizes below;
# the next cell divides the sizes by `t` to get a throughput figure.
t = []
for i in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    shows_embeddings = encode(show_descriptions[:i])
    end = time.time()
    t.append(end-start)
    print(f"{end-start:.2f}s: {i} Show Embedding(s)")

0.08s: 1 Show Embedding(s)
0.09s: 3 Show Embedding(s)
0.12s: 10 Show Embedding(s)
0.27s: 30 Show Embedding(s)
1.10s: 100 Show Embedding(s)
9.62s: 300 Show Embedding(s)


In [230]:
# Batch sizes benchmarked in the previous cell, in the same order as the timings in `t`.
embedding_size_trials = np.asarray([1, 3, 10, 30, 100, 300])
# Throughput = batch size / elapsed seconds; the batch size with the highest
# throughput is taken as the best one.
embeddings_per_sec = np.divide(embedding_size_trials, t)
best_index = embeddings_per_sec.argmax()
best_size = embedding_size_trials[best_index]
print(f"Best Show Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")


Best Show Embeddings size: 30.  Maximizes Embeddings Per Second on my local machine


In [231]:
# Estimated time to embed every unique show description:
# number of shows / best measured throughput (embeddings per second), converted to minutes.
print(f"{df_filtered.shape[0] / embeddings_per_sec[best_index] / 60 :.2f} minutes to create all show embeddings.")

2.39 minutes to create all show embeddings.


## Objective: Benchmark Making Episode Embeddings

In [232]:
# Rebuild the unique-show table (same definition as in the show benchmark) and
# collect one description per episode row of `df`.
df_filtered = df[['show_name', 'show_description']].drop_duplicates().reset_index(drop=True)
episode_descriptions = df['episode_description'].tolist()

In [233]:
# Time `encode` on growing batches of episode descriptions.
# Note carried over from the show benchmark: a batch of 1000 show descriptions ran out of
# memory on my macbook. The batch sizes tried here stop at 100.
# `t` is reset and collects the elapsed seconds per batch size for the next cell.
t = []
for i in [1, 3, 10, 30, 100]:
    start = time.time()
    episode_embeddings = encode(episode_descriptions[:i])
    end = time.time()
    t.append(end-start)
    print(f"{end-start:.2f}s: {i} Episode Embedding(s)")

0.16s: 1 Episode Embedding(s)
0.12s: 3 Episode Embedding(s)
0.20s: 10 Episode Embedding(s)
0.76s: 30 Episode Embedding(s)
4.25s: 100 Episode Embedding(s)


In [234]:
# Batch sizes benchmarked in the previous cell, in the same order as the timings in `t`.
embedding_size_trials = np.asarray([1, 3, 10, 30, 100])
# Throughput = batch size / elapsed seconds; the batch size with the highest
# throughput is taken as the best one.
embeddings_per_sec = np.divide(embedding_size_trials, t)
best_index = embeddings_per_sec.argmax()
best_size = embedding_size_trials[best_index]
print(f"Best Episode Embeddings size: {best_size}.  Maximizes Embeddings Per Second on my local machine")

Best Episode Embeddings size: 10.  Maximizes Embeddings Per Second on my local machine


In [235]:
# Estimated time to embed every episode description:
# number of episode rows / best measured throughput (embeddings per second), converted to minutes.
print(f"{df.shape[0] / embeddings_per_sec[best_index] / 60 :.2f} minutes to create all episode embeddings.")

30.56 minutes to create all episode embeddings.


# Test Code for Generating encodings

In [236]:
# Table of unique shows: show_name is the text key, show_description is the text to embed.
df_filtered = df[['show_name', 'show_description']].drop_duplicates().reset_index(drop=True)

# Next steps: split the table into blocks, call encode on each block,
# and save the results in a systematic way.

In [239]:
# Block size
block_size = 30

# Iterate over consecutive blocks of rows of the de-duplicated show table
num_rows = len(df_filtered)
start_index = 0
counter = 0
while start_index < num_rows:
    # The last block is shorter when num_rows is not a multiple of block_size.
    end_index = min(start_index + block_size, num_rows)
    # Slice the same frame that num_rows was measured on. Slicing `df` here would
    # hand out episode rows instead of the unique shows.
    subset_df = df_filtered.iloc[start_index:end_index]

    # Placeholder: encode the current block and save it to disk.
    start_index = end_index
    counter += 1

In [240]:
# Example of the on-disk layout: one pickle per block, each holding a dict with the
# block number, the show names, and the embeddings for that block.
embedding_size = 384
block_size = 30
num_blocks = 3
num_padding = 5
files = []
for i in range(num_blocks):
    block_num = i
    # fake data: every embedding value in block i equals i
    d = {
        "block": block_num,
        "show_names": ['show1','show2','show3'] * 10,
        "show_desc_embeddings": np.full((block_size, embedding_size), float(i)),
    }
    # Zero-padded block number keeps the files in order when sorted by name.
    filename = f"{block_num:0{num_padding}}_data.pkl"
    files.append(filename)
    # Save to disk
    with open(filename, 'wb') as f:
        pickle.dump(d, f)

In [244]:
def extract_shows(filenames):
    """
    Load pickled embedding blocks and merge them into one matrix and one name list.

    input args:
        filenames - list of pickle files; each holds a dict with the keys
                    'show_desc_embeddings' and 'show_names'.
    return args:
        embedding_matrix - numpy array made by stacking every block's embeddings vertically,
                           in file order
        list_of_shows - list with the show names of all blocks, in the same order
    """
    embedding_blocks, list_of_shows = [], []
    for path in filenames:
        # Read one block back from disk
        with open(path, 'rb') as handle:
            block = pickle.load(handle)
        embedding_blocks.append(block['show_desc_embeddings'])
        list_of_shows += block['show_names']

    return np.vstack(embedding_blocks), list_of_shows

# Sanity check on the fake blocks written above: the stacked matrix should have one row
# per saved show name.
a,b= extract_shows(files)
print(a.shape, len(b))

(90, 384) 90


# EmbeddingGen: Generate, Save, and Query Embeddings

In [337]:

class EmbeddingGen:
    """
    Build, store and query embeddings for one text column of a DataFrame.

    The frame is encoded in blocks of `block_size` rows. Each block is pickled to its own
    chunk file in the current working directory, then all chunks are stacked into
    `embedding_matrix` (one row per frame row) with matching `embedding_labels`.

    input args:
        df - DataFrame holding the text to embed and its labels
        data_key - column with the text passed to the encoder
        label_key - column with the label stored next to each embedding
        block_size - rows encoded per call to the encoder (must be >= 1)
        encoder - callable taking a list of texts and returning one embedding per text;
                  when None, nothing is encoded and embeddings can be attached with load()
    """

    # Zero-padded width of the block number in chunk file names (e.g. 00012_data.pkl).
    # Kept on the class so chunk naming does not depend on a notebook-level variable.
    FILENAME_PADDING = 5

    def __init__(self, df, data_key, label_key, block_size=1, encoder=None):
        # A block size below 1 would never advance through the frame.
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")

        self.data_frame = df
        self.data_key = data_key
        self.label_key = label_key
        self.block_size = block_size
        self.encoder = encoder

        self.files = []
        self.embedding_matrix = []
        self.embedding_labels = []

        # Without an encoder there is nothing to compute; stay empty so that
        # previously saved embeddings can be attached with load().
        if self.encoder is not None:
            self._saveEmbeddingChunks()
            self._combineEmbeddings()

    def load(self, file):
        """
        Replace the current embeddings and labels with those stored in `file`.

        The file must be a pickle written by save() / _saveChunk(), i.e. a dict with the
        keys 'embeddings' and 'embedding_labels'.
        """
        # NOTE: unpickling can execute arbitrary code; only load files you created yourself.
        with open(file, 'rb') as f:
            loaded_data = pickle.load(f)
        self.embedding_matrix = loaded_data['embeddings']
        self.embedding_labels = loaded_data['embedding_labels']

    def compare(self, query, n=5):
        """
        Return the `n` stored entries whose embeddings score highest against `query`.

        input args:
            query - query embedding with the rows on axis 0 (only row 0 is scored),
                    e.g. the output of the encoder for a single text
            n - number of results to return
        return args:
            list of dicts with the keys 'label', 'score' and 'data', best match first;
            an empty list when no embeddings are stored
        """
        if len(self.embedding_labels) == 0:
            return []

        dataemb = tf.constant(self.embedding_matrix, dtype="float32")
        # Compute dot score between query and all document embeddings
        scores = (query @ tf.transpose(dataemb))[0].numpy().tolist()
        sorted_indices = np.argsort(scores)[::-1]
        return [
            {
                "label": self.embedding_labels[i],
                "score": scores[i],
                "data": self._dataForIndex(i),
            }
            for i in sorted_indices[:n]
        ]

    def _dataForIndex(self, i):
        """
        Return the source text that belongs to embedding row `i`.
        """
        df = self.data_frame
        label = self.embedding_labels[i]
        # Embeddings are produced row by row from data_frame, so row i is the exact source
        # whenever the labels still line up. This matters when several rows share a label:
        # a lookup by label alone would always return the first of them.
        if len(df) == len(self.embedding_labels) and df[self.label_key].iloc[i] == label:
            return df[self.data_key].iloc[i]
        # Fallback for embeddings that are not row-aligned with data_frame (e.g. after
        # load() of a file built from a different frame): first row carrying that label.
        return df[df[self.label_key] == label][self.data_key].iloc[0]

    def _saveEmbeddingChunks(self):
        """
        Encode the frame in consecutive blocks of `block_size` rows and pickle each block.
        """
        num_rows = len(self.data_frame)
        # range() yields exactly ceil(num_rows / block_size) block starts, so the progress
        # bar total also counts the final, shorter block.
        block_starts = range(0, num_rows, self.block_size)
        progress = tqdm.tqdm(block_starts, desc="Saving Embeddings in Chunks")
        for block_counter, start_index in enumerate(progress):
            # iloc clamps the slice end, so the last block may hold fewer rows.
            subset_df = self.data_frame.iloc[start_index:start_index + self.block_size]

            # Apply the encode function to the current block. Converting to a numpy array
            # keeps the chunk files independent of the encoder's tensor type.
            emb = np.asarray(self.encoder(list(subset_df[self.data_key])))
            emb_labels = list(subset_df[self.label_key])
            self._saveChunk(emb, emb_labels, block_counter)

    def save(self, filename="final.pkl"):
        """
        Pickle the combined embeddings and labels to `filename` (default 'final.pkl').
        """
        self._saveChunk(self.embedding_matrix, self.embedding_labels, filename=filename)

    def _saveChunk(self, embeddings, embedding_labels, block_num=0, filename=None):
        """
        Pickle one dict with the keys 'block', 'embedding_labels' and 'embeddings'.

        When no `filename` is given, a chunk name is derived from `block_num` and recorded
        in `self.files` so that it can be combined and cleaned up later. Files written
        under an explicit `filename` are not tracked.
        """
        d = {
            "block": block_num,
            "embedding_labels": embedding_labels,
            "embeddings": embeddings
        }
        if not filename:
            filename = f"{block_num:0{self.FILENAME_PADDING}}_data.pkl"
            self.files.append(filename)

        # Save to disk
        with open(filename, 'wb') as f:
            pickle.dump(d, f)

    def cleanup(self):
        """
        Delete the chunk files recorded in `self.files`.

        Deletion errors are printed, not raised. Files that are gone afterwards are dropped
        from `self.files`, so calling cleanup() again does not report them a second time.
        """
        remaining = []
        for file in self.files:
            try:
                os.remove(file)
            except OSError as e:
                print(f"Error deleting file {file}: {e}")
                # Keep tracking the file only if it is actually still on disk.
                if os.path.exists(file):
                    remaining.append(file)
        self.files = remaining

    def _combineEmbeddings(self):
        """
        Load every chunk file and merge the chunks into one matrix and one label list.
        """
        list_of_embeddings = []
        list_of_labels = []
        for file in tqdm.tqdm(self.files, desc="Combining Embeddings"):
            # Load from disk
            with open(file, 'rb') as f:
                loaded_data = pickle.load(f)
            list_of_embeddings.append(loaded_data['embeddings'])
            list_of_labels.extend(loaded_data['embedding_labels'])

        # np.vstack raises on an empty list (an empty frame produces no chunks);
        # in that case the matrix keeps its initial empty value.
        if list_of_embeddings:
            self.embedding_matrix = np.vstack(list_of_embeddings)
        self.embedding_labels = list_of_labels


# Build embeddings for every unique show description in blocks of 30 rows, write the
# combined result via save(), and print the elapsed wall-clock seconds.
# Constructing EmbeddingGen does the encoding, so this is the expensive cell.
start = time.time()
embgen = EmbeddingGen(df_filtered, data_key='show_description', label_key='show_name', block_size=30, encoder=encode)
embgen.save()
end = time.time()
print(f"{end-start}")

Saving Embeddings in Chunks: 529it [02:37,  3.36it/s]                                             
Combining Embeddings: 100%|███████████████████████████████████| 529/529 [00:00<00:00, 2765.64it/s]


157.60493803024292


In [342]:
# Remove the per-block chunk files recorded by embgen; the file written by save() is not
# in that list and stays on disk.
embgen.cleanup()

File 00000_data.pkl deleted successfully.
File 00001_data.pkl deleted successfully.
File 00002_data.pkl deleted successfully.
File 00003_data.pkl deleted successfully.
File 00004_data.pkl deleted successfully.
File 00005_data.pkl deleted successfully.
File 00006_data.pkl deleted successfully.
File 00007_data.pkl deleted successfully.
File 00008_data.pkl deleted successfully.
File 00009_data.pkl deleted successfully.
File 00010_data.pkl deleted successfully.
File 00011_data.pkl deleted successfully.
File 00012_data.pkl deleted successfully.
File 00013_data.pkl deleted successfully.
File 00014_data.pkl deleted successfully.
File 00015_data.pkl deleted successfully.
File 00016_data.pkl deleted successfully.
File 00017_data.pkl deleted successfully.
File 00018_data.pkl deleted successfully.
File 00019_data.pkl deleted successfully.
File 00020_data.pkl deleted successfully.
File 00021_data.pkl deleted successfully.
File 00022_data.pkl deleted successfully.
File 00023_data.pkl deleted succes

In [341]:
# Top matches for query0 among the stored show embeddings.
# NOTE(review): `pprint` is imported and `query0` is assigned only in cells further down,
# so this cell raises NameError when the notebook is run top-to-bottom on a fresh kernel.
pprint.pprint(embgen.compare(query0))

[{'data': '2 Private Cannabis Investors share thoughts, analysis and opinions '
          'on the ups and downs of the rapidly changing Cannabis Investing '
          'landscape. For Investors By Investors',
  'label': 'Cannabis Investing Network',
  'score': 0.8425226211547852},
 {'data': 'As a business accelerator and investor in ancillary cannabis '
          'companies, we’ve helped launch 90+ companies and have made 110+ '
          'investments. And along the way, we’ve seen a lot! Now we’re sharing '
          'everything you need to know about starting up and investing in the '
          'legal cannabis industry. We post every other week! Learn more about '
          'what we do by visiting us at www.canopyboulder.com. ',
  'label': 'CanopyBoulder Cannabis Business Podcast',
  'score': 0.7082740664482117},
 {'data': 'Focused on cannabis growing and the culture, technology, and more '
          'for cannabis growers, users, and curious folks alike!',
  'label': 'Greenstalk Talks

In [326]:
# Peek at the first ten stored labels.
embgen.embedding_labels[:10]

['No Frillz Podcast with Yipes & Matrix',
 'Ayodya Talk',
 'Coca-cola',
 'The Motivational Dude Podcast',
 'Uwu',
 'The Phoenix Project Podcast',
 'Is it in yet? A sex podcast',
 'Star Wars Sessions',
 'The Culture Project Podcast',
 'Women That Wait (WTW)']

In [338]:
# Pick one show by position in df_filtered and print its name and description;
# the query in the next cell is written from this description.
REF_INDEX = 505
desc = df_filtered['show_description'].iloc[REF_INDEX]
name = df_filtered['show_name'].iloc[REF_INDEX]
print(f"{name}: {desc}")

Cannabis Investing Network: 2 Private Cannabis Investors share thoughts, analysis and opinions on the ups and downs of the rapidly changing Cannabis Investing landscape. For Investors By Investors


In [339]:
# Free-text query and its embedding, used by the comparison cells.
query0_raw = "Private Cannabis Investors share thoughts."
query0 = encode(query0_raw)

In [311]:
# Score query0 against every stored embedding by hand.
dataemb = tf.constant(embgen.embedding_matrix, dtype="float32")
# One dot product per stored row; row 0 of the result belongs to the single query.
similarity = query0 @ tf.transpose(dataemb)
scores = similarity[0].numpy().tolist()
# Indices of the stored rows, highest score first.
sorted_indices = np.flip(np.argsort(scores))
sorted_indices

array([100,   7, 154, 143,  32,  80,  44, 142,  13, 119, 130,  10,  85,
       102, 109,  54, 104, 147, 123,  68,  82, 126,  84,  41,  93,  98,
       118,  15, 146,  90,  55,   4, 134,  30, 149, 108, 141,  42,  67,
       111, 144,  23,  25, 128,   3, 124,  22,  53,  18, 138, 150,  59,
        56,  65,  12,  75,   9,  36, 115,  89,  95, 114,   8,  52,  29,
       151, 113,  81, 135, 127,  49, 133,  39,   1,  63, 101,  47, 148,
        14, 139,  57, 152,  37,  46,  19,  35,  77,  27, 122,  96,  69,
       140,  83, 137, 153,  20, 103,  72,   5, 131,  34,  51, 110, 116,
        48,  99,  94,  97,  38,  40,   6, 125,  60,  66,  88, 106, 121,
        16, 129, 107,  33,  91,  71,  61,  45,  31,  24,  79,  87,  76,
        62,  73,  43,  74,  58, 112,  64,  70,  21, 132, 120,   0,  26,
        17, 117, 136,  86, 145,  92,  11,   2, 105,  50,  78,  28])

In [312]:
# Print label, score and description for the five best matches.
for i in sorted_indices[:5]:
    label = embgen.embedding_labels[i]
    print(f"{embgen.embedding_labels[i]}: {scores[i]:.2f}")
    # First description stored under that show name.
    matching = df_filtered.loc[df_filtered['show_name'] == label, 'show_description']
    print(matching.iloc[0])
    print('')
    print('')

Insights with Joe Pane: 0.79
This podcast is dedicated to those of us on a journey from ambition to meaning. I share with you the experiences of 1000's of people I have had the honor of coaching, training and leading over the last decade and a half, who have each embarked on this journey. This podcast is about redefining success. Ultimately, success is about the value we have been to someone else. This kind of success flavors our ultimate legacy, which is the contribution we have made to the live's of others. Thank you and I look forward to sharing all I can about this beautiful journey. 


The Good Sign : 0.66
Let’s be real and honest! Life can be challenging and stressful! I am a mom, a teacher, a life coach and a motivational speaker. Meeting so many people from so many places has made me realize that we are all on a similar quest for happiness and inspiration. This podcast will be uplifting, honest and damn funny! Join me each Monday night as me and my guests impart attainable goal

In [318]:
def find_top_n(objs, query, n=5):
    """
    Return the `n` entries of `objs` that score highest against `query`.

    input args:
        objs - EmbeddingGen instance holding the embeddings, labels and source frame
        query - query embedding, as accepted by EmbeddingGen.compare
        n - number of results to return
    return args:
        list of dicts with the keys 'label', 'score' and 'data', best match first
    """
    # Delegate to the method that implements the same ranking. The earlier copy of that
    # logic in this function read labels from the global `embgen` instead of `objs`, so it
    # returned wrong labels for any other instance.
    return objs.compare(query, n)
# NOTE(review): this import sits below cells that already call pprint; it belongs in the
# import cell at the top of the notebook.
import pprint
# Pretty-print the five best matches for query0.
pprint.pprint(find_top_n(embgen, query0, n=5))

[{'data': 'This podcast is dedicated to those of us on a journey from ambition '
          "to meaning. I share with you the experiences of 1000's of people I "
          'have had the honor of coaching, training and leading over the last '
          'decade and a half, who have each embarked on this journey. This '
          'podcast is about redefining success. Ultimately, success is about '
          'the value we have been to someone else. This kind of success '
          'flavors our ultimate legacy, which is the contribution we have made '
          "to the live's of others. Thank you and I look forward to sharing "
          'all I can about this beautiful journey. ',
  'label': 'Insights with Joe Pane',
  'score': 0.7919762134552002},
 {'data': 'Let’s be real and honest! Life can be challenging and stressful! I '
          'am a mom, a teacher, a life coach and a motivational speaker. '
          'Meeting so many people from so many places has made me realize that '
          '

In [322]:
# Same lookup through the EmbeddingGen.compare method (default n=5).
pprint.pprint(embgen.compare(query0))

[{'data': 'This podcast is dedicated to those of us on a journey from ambition '
          "to meaning. I share with you the experiences of 1000's of people I "
          'have had the honor of coaching, training and leading over the last '
          'decade and a half, who have each embarked on this journey. This '
          'podcast is about redefining success. Ultimately, success is about '
          'the value we have been to someone else. This kind of success '
          'flavors our ultimate legacy, which is the contribution we have made '
          "to the live's of others. Thank you and I look forward to sharing "
          'all I can about this beautiful journey. ',
  'label': 'Insights with Joe Pane',
  'score': 0.7919762134552002},
 {'data': 'Let’s be real and honest! Life can be challenging and stressful! I '
          'am a mom, a teacher, a life coach and a motivational speaker. '
          'Meeting so many people from so many places has made me realize that '
          '