In [None]:
import pandas as pd

In [None]:
import os
from pathlib import Path

# Location of the processed track/playlist CSV. Set the TRACKS_DATA_CSV
# environment variable to point at your own copy; the original author's
# absolute path is kept as the fallback so existing setups keep working.
TRACKS_DATA_CSV = Path(
    os.environ.get(
        "TRACKS_DATA_CSV",
        "/home/romain/Documents/Personal/Projects/Soundtrack_ML/soundtrack-ml/search/data/processed/track_playlist/tracks_data.csv",
    )
)

# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# an index that was written out with the CSV — consider index_col=0 if nothing
# downstream relies on that column.
data = pd.read_csv(TRACKS_DATA_CSV)
data

Unnamed: 0,playlist_id,playlist_name,track_name,artist_name,album_name,duration_ms,pos
0,61000,glow,Money Longer,Lil Uzi Vert,Lil Uzi Vert Vs. The World,198944,0
1,61000,glow,Broccoli (feat. Lil Yachty),DRAM,Big Baby DRAM,225205,1
2,61000,glow,Still Here,Drake,Views,189853,2
3,61000,glow,You Was Right,Lil Uzi Vert,Lil Uzi Vert Vs. The World,163944,3
4,61000,glow,Don't Hurt Me,DJ Mustard,Don't Hurt Me,192995,4
...,...,...,...,...,...,...,...
6615086,342999,dance it out,iSpy (feat. Lil Yachty),KYLE,iSpy (feat. Lil Yachty),253106,7
6615087,342999,dance it out,Dip Dip,21 Savage,Slaughter King,151614,8
6615088,342999,dance it out,Uber Everywhere (feat. Travis Scott),MadeinTYO,You Are Forgiven,181104,9
6615089,342999,dance it out,Crank That (Soulja Boy),Soulja Boy,souljaboytellem.com,221933,10


## Create sparse vectors using BM25 for complete, track, artist, album, playlist

In [None]:
from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding
import numpy as np

# Keep one row per (track_name, artist_name) pair; the first occurrence wins
# and the index is rebuilt so that row positions run 0..n-1.
data_dedup = (
    data
    .drop_duplicates(subset=['track_name', 'artist_name'], keep='first')
    .reset_index(drop=True)
)
print(f"Original number of tracks: {len(data)}")
print(f"Deduplicated number of tracks: {len(data_dedup)}")

# Text handed to the embedder: track titles as plain strings, with missing
# titles replaced by the empty string.
track_names = data_dedup['track_name'].fillna('').astype(str).tolist()

# Only the sparse BM25 model is instantiated here. The dense
# ("sentence-transformers/all-MiniLM-L6-v2") and late-interaction
# ("colbert-ir/colbertv2.0") alternatives remain disabled.
bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25")

print("Created embedding model")
print(f"Number of tracks: {len(track_names)}")

# Show the first few titles as a sanity check
print("\nExample track names:")
for idx, name in enumerate(track_names[:5]):
    print(f"  Track {idx}: {name}")

# Materialise every sparse embedding so later cells can index into the list
bm25_embeddings = list(bm25_embedding_model.embed(track_names))

# Inspect the first few embeddings
print("\nExample BM25 embeddings:")
for idx, sparse_vec in enumerate(bm25_embeddings[:3]):
    print(f"\nTrack {idx}: {track_names[idx]}")
    print(f"  Embedding type: {type(sparse_vec)}")
    print(f"  Number of non-zero values: {len(sparse_vec.values)}")
    print(f"  First 10 indices: {sparse_vec.indices[:10]}")
    print(f"  First 10 values: {sparse_vec.values[:10]}")

Original number of tracks: 6615091
Deduplicated number of tracks: 674043
Created embedding model
Number of tracks: 674043

Example track names:
  Track 0: Money Longer
  Track 1: Broccoli (feat. Lil Yachty)
  Track 2: Still Here
  Track 3: You Was Right
  Track 4: Don't Hurt Me

Example BM25 embeddings:

Track 0: Money Longer
  Embedding type: <class 'fastembed.sparse.sparse_embedding_base.SparseEmbedding'>
  Number of non-zero values: 2
  First 10 indices: [729929402 408354071]
  First 10 values: [1.68320383 1.68320383]

Track 1: Broccoli (feat. Lil Yachty)
  Embedding type: <class 'fastembed.sparse.sparse_embedding_base.SparseEmbedding'>
  Number of non-zero values: 4
  First 10 indices: [1695274324  776767077  129402658   13849287]
  First 10 values: [1.67419738 1.67419738 1.67419738 1.67419738]

Track 2: Still Here
  Embedding type: <class 'fastembed.sparse.sparse_embedding_base.SparseEmbedding'>
  Number of non-zero values: 1
  First 10 indices: [2142141949]
  First 10 values: [1.

In [6]:
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http.models import PointStruct
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import os

# --- Connection ---------------------------------------------------------------
# Credentials are read from the environment so that no secret lives in the
# notebook. The API key that used to be embedded in this cell must be treated
# as leaked and rotated in the Qdrant Cloud console.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://3ed0dfdc-3f3f-4424-9123-87025b9842df.us-east4-0.gcp.cloud.qdrant.io",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    raise RuntimeError(
        "Set the QDRANT_API_KEY environment variable before running this cell."
    )

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60,
)

COLLECTION_NAME = "track_embeddings_bm25"

# --- Collection ---------------------------------------------------------------
# Start from an empty collection on every run. This is the same drop-and-create
# behaviour as the deprecated `recreate_collection`, spelled out explicitly.
if qdrant_client.collection_exists(COLLECTION_NAME):
    qdrant_client.delete_collection(COLLECTION_NAME)

qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={},  # no dense vectors in this collection
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    },
)

# --- Upload -------------------------------------------------------------------
print("Uploading embeddings to Qdrant...")
BATCH_SIZE = 1000

total_points = len(data_dedup)
if len(bm25_embeddings) != total_points:
    # Guards against a stale kernel where the embeddings were computed from a
    # different version of `data_dedup`.
    raise ValueError(
        f"bm25_embeddings has {len(bm25_embeddings)} entries but data_dedup has "
        f"{total_points} rows; re-run the embedding cell."
    )

# Pull the payload columns out once instead of calling `.iloc[j]` per point.
# Missing values are normalised to '' exactly as was done for the embedded text.
payload_track_names = data_dedup['track_name'].fillna('').astype(str).tolist()
payload_artist_names = data_dedup['artist_name'].fillna('').astype(str).tolist()

for i in range(0, total_points, BATCH_SIZE):
    batch_end = min(i + BATCH_SIZE, total_points)
    batch_points = [
        PointStruct(
            id=j,  # point id == row position in data_dedup
            vector={"bm25": bm25_embeddings[j].as_object()},
            payload={
                "track_name": payload_track_names[j],
                # Rows are unique per (track, artist); storing the artist lets
                # identically titled tracks be told apart in search results.
                "artist_name": payload_artist_names[j],
            },
        )
        for j in range(i, batch_end)
    ]

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=batch_points,
    )

    # Report every 10th batch, using the real number of points sent so far
    if (i // BATCH_SIZE + 1) % 10 == 0:
        print(f"Uploaded {batch_end} / {total_points} embeddings...")

print(f"Successfully uploaded {total_points} embeddings to Qdrant!")

  qdrant_client.recreate_collection(


Uploading embeddings to Qdrant...
Uploaded 10000 / 674043 embeddings...
Uploaded 20000 / 674043 embeddings...
Uploaded 30000 / 674043 embeddings...
Uploaded 40000 / 674043 embeddings...
Uploaded 50000 / 674043 embeddings...
Uploaded 60000 / 674043 embeddings...
Uploaded 70000 / 674043 embeddings...
Uploaded 80000 / 674043 embeddings...
Uploaded 90000 / 674043 embeddings...
Uploaded 100000 / 674043 embeddings...
Uploaded 110000 / 674043 embeddings...
Uploaded 120000 / 674043 embeddings...
Uploaded 130000 / 674043 embeddings...
Uploaded 140000 / 674043 embeddings...
Uploaded 150000 / 674043 embeddings...
Uploaded 160000 / 674043 embeddings...
Uploaded 170000 / 674043 embeddings...
Uploaded 180000 / 674043 embeddings...
Uploaded 190000 / 674043 embeddings...
Uploaded 200000 / 674043 embeddings...
Uploaded 210000 / 674043 embeddings...
Uploaded 220000 / 674043 embeddings...
Uploaded 230000 / 674043 embeddings...
Uploaded 240000 / 674043 embeddings...
Uploaded 250000 / 674043 embeddings...


In [18]:
# Fast search function - UPDATED
def search_qdrant(query, top_k=10):
    """Search the BM25 sparse-vector collection for tracks matching ``query``.

    Args:
        query: Free-text query string; embedded with
            ``bm25_embedding_model.query_embed``.
        top_k: Maximum number of hits to request from Qdrant.

    Returns:
        A list of dicts, one per hit in the order returned by Qdrant, with keys
        ``track_name``, ``artist_name`` (``None`` when the stored payload has
        no such field) and ``similarity_score``. An empty list is returned,
        without contacting Qdrant, when the query produces no sparse terms.
    """
    # query_embed returns a generator, so take its first (only) item
    query_embedding = next(bm25_embedding_model.query_embed(query))

    term_indices = query_embedding.indices.tolist()
    term_weights = query_embedding.values.tolist()
    if not term_indices:
        # An empty sparse vector cannot match any document; skip the round trip
        return []

    # Convert to Qdrant's SparseVector format
    sparse_query = models.SparseVector(
        indices=term_indices,
        values=term_weights,
    )

    response = qdrant_client.query_points(
        collection_name=COLLECTION_NAME,
        query=sparse_query,
        limit=top_k,
        using="bm25",  # name of the sparse vector to search against
        with_payload=True,
    )

    return [
        {
            'track_name': hit.payload['track_name'],
            # .get keeps this working against collections uploaded without artists
            'artist_name': hit.payload.get('artist_name'),
            'similarity_score': hit.score,
        }
        for hit in response.points
    ]

# Smoke-test the search helper with a one-word query and list the ranked hits
print("\nTesting search...")
results = search_qdrant("Momentum", top_k=5)
for rank, hit in enumerate(results, start=1):
    track, score = hit['track_name'], hit['similarity_score']
    print(f"{rank}. {track} (score: {score:.4f})")



Testing search...
1. Momentum (score: 16.9193)
2. Momentum (score: 16.9193)
3. Momentum (score: 16.9193)
4. Momentum (score: 16.9193)
5. Momentum (score: 16.9193)
