In [1]:
%pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-storage

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-api-python-client 1.8.0 requires google-api-core<2dev,>=1.13.0, but you have google-api-core 2.19.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from google.cloud import storage, aiplatform
from vertexai.preview.language_models import TextEmbeddingModel
import vertexai
import tqdm
import time

PROJECT_ID = "{project-id}"
LOCATION = "us-central1"

BUCKET_NAME = " {bucket-name}"
CSV_FILE_PATH = "music.csv"

vertexai.init(project=PROJECT_ID, location=LOCATION)

df = pd.read_csv(CSV_FILE_PATH)

In [2]:
df = df.head(2000)

In [3]:
df['combined_details'] = df.apply(lambda row: f"{row['track_name']} {row['popularity']} duration_ms {row['danceability']} {row['loudness']} {row['track_genre']} {row['tempo']} {row['instrumentalness']}", axis=1)

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [4]:
def get_embeddings_wrapper(texts,batch_size=5):
    embeddings = []
    for i in tqdm.tqdm(range(0, len(texts),batch_size)):
        time.sleep(1)
        batch_texts = texts[i:i+batch_size]
        batch_embeddings = model.get_embeddings(batch_texts)
        embeddings.extend([embedding.values for embedding in batch_embeddings])
    return embeddings

combined_texts = df['combined_details'].tolist()
df['embedding'] = get_embeddings_wrapper(combined_texts)

100%|██████████| 400/400 [07:18<00:00,  1.10s/it]


In [None]:
df.head()

In [5]:
jsonl_string = df[["id","track_id", "artists","album_name","track_name","embedding"]].to_json(orient="records", lines=True)

with open("songs.json", "w") as f:
    f.write(jsonl_string)

In [7]:
BUCKET_URI = f"gs://no-latency-labs-documents"
! gsutil cp songs.json {BUCKET_URI}

Copying file://songs.json [Content-Type=application/json]...
/ [1 files][ 19.9 MiB/ 19.9 MiB]                                                
Operation completed over 1 objects/19.9 MiB.                                     


In [8]:
from google.cloud import aiplatform
aiplatform.init(project=PROJECT_ID,location=LOCATION)

my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"spotify-songs-idx",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=10,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)


Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/499192289487/locations/us-central1/indexes/7597066796025249792/operations/2681464253168222208
MatchingEngineIndex created. Resource name: projects/499192289487/locations/us-central1/indexes/7597066796025249792
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/499192289487/locations/us-central1/indexes/7597066796025249792')


In [9]:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"songs-endpoint",
    public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/499192289487/locations/us-central1/indexEndpoints/8814727545275547648/operations/9029287927946936320
MatchingEngineIndexEndpoint created. Resource name: projects/499192289487/locations/us-central1/indexEndpoints/8814727545275547648
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/499192289487/locations/us-central1/indexEndpoints/8814727545275547648')


In [None]:
DEPLOYED_INDEX_ID = f"spotify_songs_idx"

index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/499192289487/locations/us-central1/indexEndpoints/8814727545275547648
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/499192289487/locations/us-central1/indexEndpoints/8814727545275547648/operations/2390982077202825216


In [23]:
# user_prompt = "Find all duets in the data set"

user_prompt = "Recommend some happy songs"
test_embeddings = get_embeddings_wrapper([user_prompt])

response = index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=5,
)
import numpy as np

for idx, neighbor in enumerate(response[0]):
    id = np.int64(neighbor.id)
    similar = df.query("id == @id", engine="python")
    print(f"{neighbor.distance:.4f} {similar.track_id.values[0]} {similar.track_name.values[0]}-{similar.artists.values[0]}-{similar.album_name.values[0]}")

100%|██████████| 1/1 [00:01<00:00,  1.10s/it]

0.6941 4G8wt3fUBpgBjI35TBF2eQ Duet-Rachael Yamagata-Elephants...Teeth Sinking Into Heart (Standard Version)
0.6748 6sJRYw9CnzDyZ3mBOAVtNN Dulce-Los Amigos Invisibles-Commercial
0.6742 7HuER5q1a5dJ0uKrfGrMCj Searching-Jlyricz-Searching
0.6732 5HIZ8V82BzmcQUwTqZK5J2 The Ally-Ikebe Shakedown-The Way Home
0.6708 6DU07zLVPOlPqqybYpQZDn Dueña de Mi-La Misa Negra-La Misa Negra



