# Semantic Search

## Environment

In [73]:
!pip install sentence_transformers tiktoken openai -qqq

In [74]:
import numpy as np
import pandas as pd
import pyarrow

import openai
from openai.embeddings_utils import get_embedding

from sentence_transformers import SentenceTransformer
import torch

In [75]:
# Mount Google Drive (Colab only) and move into the project folder so that the
# relative paths used below (data/, private/, ./<model directories>) resolve.
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE(review): the saved output of this cell reports
# /content/gdrive/MyDrive/Projects/semantic_song_search, which does not match
# the path below -- confirm which location is the current one.
%cd /content/gdrive/MyDrive/semantic_song_search

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Projects/semantic_song_search


## Load Data

In [76]:
# Load the song table. The parquet file already contains one stored embedding
# column per model (embedding_minilm, embedding_roberta, embedding_gpt,
# embedding_glove), so no song embeddings are computed in this notebook.
df = pd.read_parquet('data/v2ga_w_embeddings.parquet')
print(df.shape)
df.info()

(17000, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   title              17000 non-null  object  
 1   tag                17000 non-null  category
 2   artist             17000 non-null  object  
 3   year               17000 non-null  int16   
 4   views              17000 non-null  int32   
 5   lyrics             17000 non-null  object  
 6   normalized_lyrics  17000 non-null  object  
 7   word_count         17000 non-null  int64   
 8   embedding_minilm   17000 non-null  object  
 9   embedding_roberta  17000 non-null  object  
 10  n_tokens           17000 non-null  int64   
 11  embedding_gpt      17000 non-null  object  
 12  embedding_glove    17000 non-null  object  
dtypes: category(1), int16(1), int32(1), int64(2), object(8)
memory usage: 1.4+ MB


In [93]:
# Truncate long cell values (lyrics, embeddings) to 100 characters when displayed.
pd.set_option('max_colwidth', 100)
# Transpose a random 5-row sample so each song is shown as a column.
# No random_state is passed, so the rows shown differ on every run.
df.sample(5).T

Unnamed: 0,5678,1767,15550,4254,14351
title,Bye Bye Baby First Day Ruff,Century City,Mirror Mirror,Bring It Up finale,You Bring The Party Down
tag,pop,pop,rock,pop,rock
artist,Madonna,Tom Petty,Dr. Dog,James Brown,Ringo Starr
year,2019,2015,2010,2015,2015
views,48,238,1440,16,178
lyrics,"\nThis is not a love song\nBye bye baby\n\n\nI keep on waiting, anticipating\nBut I can't wait f...",Tom Petty\nMiscellaneous\nCentury City\nSometimes I wanna leave here\nSometimes I wanna go right...,\nAt the bottom there's a stir\nAnd now surface is a blur\nTell the truth like it's a joke\nAnd ...,"Gather round, clap your hands\nCome on and dance\nCome on, hit it\nCome on, hit it, hit it\nHit ...",\nI woke up this morning and opened up my eyes\nI made myself some coffee and then I realized\nS...
normalized_lyrics,love song bye bye baby keep wait anticipate wait forever say love think never together bye bye b...,tom petty miscellaneous century city sometimes wan na leave sometimes wan na go right back come ...,bottom stir surface blur tell truth like joke fix part break know memory know get tell mirror wa...,gather round clap hand come dance come hit come hit hit hit come baby get groove come baby let t...,woke morning open eye make coffee realize people good people bad people crazy still sad town bri...
word_count,415,209,221,171,148
embedding_minilm,"[-0.08844968, -0.047948506, 0.06385774, 0.0159767, -0.021659693, -0.063861474, -0.036377, -0.015...","[0.024121314, 0.00457798, 0.030652251, 0.03202586, 0.0642489, 0.017617911, 0.04600115, -0.004578...","[0.01965588, 0.020670744, 0.032517638, 0.007887097, 0.03109951, -0.07320251, 0.042622067, -0.004...","[-0.086906314, -0.024689898, 0.03134726, -0.010827615, -0.0212185, -0.01732983, 0.04547225, -0.0...","[-0.07090017, -0.062312856, 0.08167034, 0.02541433, 0.019856244, -0.030401539, 0.10238811, -0.02..."
embedding_roberta,"[-0.030557323, 0.03000806, -0.025587466, -0.020691887, 0.046736795, 0.0030467778, -0.04445311, 0...","[-0.02053741, 0.03398613, 0.0010403969, 0.023844974, 0.072137505, -0.01613071, -0.060271908, -0....","[-0.011907794, 0.019027017, -0.00056838745, -0.027201895, 0.028142199, -0.013348944, -0.08056891...","[0.013536353, 0.019072898, -0.015574088, -0.029021962, 0.008261879, -0.03534351, -0.02869135, 0....","[-0.02686187, 0.022598142, 0.01815595, 0.045974504, 0.09875624, 0.035978805, -0.05891495, -0.005..."


## Setup Embedding Models

In [78]:
# sentence transformer models
# The './' prefixes point at directories inside the working directory, so
# copies of both models are expected to be present there.
minilm = SentenceTransformer('./all-MiniLM-L12-v2')
roberta = SentenceTransformer('./all-distilroberta-v1')

In [79]:
# openai api config
# The key is read from a local file instead of being hardcoded in the notebook.
# strip() removes surrounding whitespace -- in particular the trailing newline
# many editors append -- which would otherwise become part of the API key.
with open("private/openai_secret_key.txt", "r") as f:
    my_little_secret = f.read().strip()
openai.api_key = my_little_secret
# embedding model (name passed as `engine` to get_embedding in semantic_search)
gpt = "text-embedding-ada-002"

In [80]:
# glove
# Loaded through SentenceTransformer from a directory inside the working
# directory; going by the directory name these are averaged GloVe word embeddings.
glove = SentenceTransformer('./average_word_embeddings_glove.840B.300d')

## Search

In [81]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between the vectors ``v1`` and ``v2``.

    The value is ``np.dot(v1, v2)`` divided by the product of the two
    Euclidean norms. When either vector has zero norm the cosine is
    undefined and ``np.nan`` is returned instead of dividing by zero.
    """
    inner = np.dot(v1, v2)
    magnitudes = (np.linalg.norm(v1), np.linalg.norm(v2))

    # Guard clause: bail out before the division if a vector has no length.
    if any(magnitude == 0.0 for magnitude in magnitudes):
        return np.nan

    return inner / (magnitudes[0] * magnitudes[1])

In [82]:
def relevance_scores(query_embed, df, embeddings):
    """Cosine similarity between ``query_embed`` and every row's embedding.

    Parameters
    ----------
    query_embed : sequence of float
        1-D embedding of the query.
    df : pandas.DataFrame
        Frame whose column ``embeddings`` holds one 1-D vector per row, each
        with the same length as ``query_embed``.
    embeddings : str
        Name of the embedding column to score against.

    Returns
    -------
    pandas.Series
        One score per row, carrying ``df``'s own index so that assigning the
        result back to ``df`` lines up row for row even when ``df`` has been
        filtered, sampled or re-indexed. Rows with a zero-norm embedding (or
        a zero-norm query) score ``NaN``, matching ``cosine_similarity``.
    """
    if len(df) == 0:
        # np.stack cannot handle an empty list; return an empty, aligned result.
        return pd.Series([], index=df.index, dtype=float)

    query_vec = np.asarray(query_embed)
    # Stack the per-row vectors into one (n_rows, dim) matrix so the dot
    # products and norms are computed in a single vectorized pass.
    matrix = np.stack([np.asarray(vec) for vec in df[embeddings]])

    dots = matrix @ query_vec
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)

    # Zero norms are mapped to NaN below; silence the intermediate
    # divide-by-zero warnings that the masked entries would trigger.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = np.where(norms == 0.0, np.nan, dots / norms)

    return pd.Series(scores, index=df.index)

In [83]:
def semantic_search(query_text, df = df, model = "glove", return_top = True):
    """Rank the songs in ``df`` by embedding similarity to ``query_text``.

    Parameters
    ----------
    query_text : str
        Free-text query; it is embedded with the model selected by ``model``.
    df : pandas.DataFrame
        Song table with ``title``, ``artist``, ``lyrics`` and the
        ``embedding_<model>`` column of the selected model. The default is the
        global ``df`` as it existed when this cell was executed (default
        arguments are bound at definition time), so pass the frame explicitly
        if ``df`` has been reassigned since.
    model : {"glove", "minilm", "roberta", "gpt"}
        Which embedding model / column to use.
    return_top : bool
        If True, print the best match (model, title, artist, lyrics) and
        return None. If False, return the whole frame sorted by descending
        score with the columns ``title``, ``artist``, ``lyrics``, ``scores``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Encoders are wrapped in lambdas so that only the selected model's global
    # is looked up; the other models need not be loaded.
    encoders = {
        "glove": lambda text: glove.encode(text),
        "minilm": lambda text: minilm.encode(text),
        "roberta": lambda text: roberta.encode(text),
        "gpt": lambda text: get_embedding(text, engine=gpt),
    }
    if model not in encoders:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(encoders)}"
        )

    # Work on a copy so the caller's frame never gains a 'scores' column.
    df = df.copy()

    query_embed = encoders[model](query_text)
    scores = relevance_scores(query_embed, df, f"embedding_{model}")

    # Attach by position rather than by index label: there is exactly one
    # score per row, in row order, whatever index `df` carries.
    df['scores'] = scores.to_numpy()
    sorted_df = df.sort_values(by = 'scores', ascending = False)

    if return_top:
        result = sorted_df.iloc[0]
        print("\n==============================================\n")
        print(f"Embedding Model: {model}")
        print(f'{result["title"]} by {result["artist"]}\n')
        print(result["lyrics"])
        print("\n==============================================\n")
    else:
        return sorted_df[['title','artist','lyrics','scores']]

In [84]:
# Limit pandas' text output to 100 characters per line.
pd.set_option('display.width', 100)

## Example

In [85]:
# Example query reused by the four single-model searches below.
q = "I’m happy sometimes but also sad. You wouldn’t understand"

In [86]:
# Print the best match for `q` using the GloVe embedding column.
semantic_search(q, df, "glove", return_top=True)



Embedding Model: glove
Dont Be Afraid of Love by Otis Redding

Don't be afraid of love
Don't be afraid of love

If you want it you can get it
Just ask for
If you want it you can get it
Just ask for

If you want it you can get it
Just ask for it
No, don't, no, don't be afraid

Everybody, every girl, need love
Every boy, every girl, need love
Everybody in this world need love
No, don't, no, no, no, don't be afraid

If you want it you can get it
Just ask for
If you want it you can get it
Just ask for
If you want it you can get it
Just ask for it
No, don't, don't, don't don't, don't be afraid

If you love someone you should tell
If you love someone you should tell
If you love someone you should tell
No, don't, no, don't don't, don't be afraid

Don't be afraid of love
Don't be afraid of love
Don't be afraid of love
Don't be afraid of love




In [87]:
# Print the best match for `q` using the MiniLM embedding column.
semantic_search(q, df, "minilm", return_top=True)



Embedding Model: minilm
Thieves by Incubus


You're happy all the time
I just don't understand why I can't be happy too
Your smiles are salt in the wound
A slap upon a back that's been toiling in the sun


When will I get mine
Or must I be a god-fearing, white American
Oh, everything is fine
As long as you're a god-fearing, white American


Why should the thieves have all of the fun
Selling us water by the river, they don't speak for everyone
I'm ready to run and you're making me crawl
Selling me water by the river, they don't speak for me at all


The man about the town
The beauty queen, the paragon of civilization
But shadows cling to us all
Even those convinced that they're sheltered and immune

When will I get mine
Or must I be a god-fearing, white American
Oh, everything is fine
As long as you're a god-fearing, white American


Why should the thieves have all of the fun
Selling us water by the river, they don't speak for everyone
I'm ready to run and you're making me crawl
Selli

In [88]:
# Print the best match for `q` using the RoBERTa embedding column.
semantic_search(q, df, "roberta", return_top=True)



Embedding Model: roberta
You Will Be Sad by Washed Out

You will be sad




In [89]:
# Print the best match for `q` using the OpenAI embedding column
# (the query is embedded through get_embedding, i.e. an API call).
semantic_search(q, df, "gpt", return_top=True)



Embedding Model: gpt
Sometimes Im Happy by Frank Sinatra

Sometimes I'm happy, sometimes I'm blue
My disposition, it depends on you
I never mind the rain from the skies
If I can find the sun in your eyes
Sometimes I love you, sometimes I hate you
But when I hate you it's 'cause I love you
That's how I am so what can I do
I'm happy when I'm with you
Sometimes I am happy other times I am blue
My disposition depends on you
I never mind the rain from the sky
If I can find that sun in your eyes
Sometimes I love you, sometimes I hate you
But when I hate you, it's because I love you
That's how I am so what can I do
I'm happy, so happy when I'm with you




## Test Loop

In [90]:
def loop_results(q):
    """Print the top match for the query ``q`` once per embedding model.

    Calls ``semantic_search`` on the global ``df`` for glove, minilm,
    roberta and gpt, in that order; nothing is returned.
    """
    for model_name in ("glove", "minilm", "roberta", "gpt"):
        semantic_search(q, df, model_name, return_top=True)

In [94]:
# Second query; rebinds the global `q` and prints each model's top match in turn.
q = "Accept your conditions as they are. Don't struggle with your fate. Find peace in the current state of things."
loop_results(q)



Embedding Model: glove
State of the Nation by New Order


You can walk, or you can run
You don't have to be someone
I went on a summer cruise
Upon an ocean born to lose
My brother said that he was dead
I saw his face and shook my head
Can you see where we can't be
We're losing our blood in the sea


'Cause it's the state of the nation
That's holding our salvation
Yes, it's the state of the nation
That's holding our salvation
Oh, the state of the nation
Is causing deprivation
Oh, the state of the nation
Is causing deprivation


From my home I traveled far
I drove in my stolen car
When it broke down, I kissed the ground
'Cause I don't kiss when you're around
I don't find that I have been
The portrait of an only son
If that's the case, then who could tell
Where my story had begun?

'Cause it's the state of the nation
That's holding our salvation
Yes, it's the state of the nation
That's holding our salvation
Yes, the state of the nation
That's causing deprivation
Oh, the state of the nat