In [1]:
import os

import pandas as pd

In [3]:
# Path to the processed track/playlist CSV. The default is the original
# author's local checkout; set the TRACKS_CSV_PATH environment variable to
# point the notebook at the file on another machine without editing this cell.
TRACKS_CSV_PATH = os.environ.get(
    "TRACKS_CSV_PATH",
    "/home/romain/Documents/Personal/Projects/Soundtrack_ML/soundtrack-ml/search/data/processed/track_playlist/tracks_data.csv",
)
data = pd.read_csv(TRACKS_CSV_PATH)
data.head()

Unnamed: 0,playlist_id,playlist_name,track_name,artist_name,album_name,duration_ms,pos
0,61000,glow,Money Longer,Lil Uzi Vert,Lil Uzi Vert Vs. The World,198944,0
1,61000,glow,Broccoli (feat. Lil Yachty),DRAM,Big Baby DRAM,225205,1
2,61000,glow,Still Here,Drake,Views,189853,2
3,61000,glow,You Was Right,Lil Uzi Vert,Lil Uzi Vert Vs. The World,163944,3
4,61000,glow,Don't Hurt Me,DJ Mustard,Don't Hurt Me,192995,4


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Keep one row per (track_name, artist_name) pair -- the first occurrence wins --
# and renumber the rows so positions run 0..n-1.
data_dedup = (
    data
    .drop_duplicates(subset=['track_name', 'artist_name'], keep='first')
    .reset_index(drop=True)
)
print(f"Original number of tracks: {len(data)}")
print(f"Deduplicated number of tracks: {len(data_dedup)}")

# TF-IDF over character n-grams (2 to 4 characters, built within word
# boundaries) so that small variations in a track title still share features.
tfidf_options = dict(
    analyzer='char_wb',
    ngram_range=(2, 4),
    min_df=1,
    lowercase=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Missing titles become empty strings and everything is coerced to str before
# the vectorizer learns its vocabulary and produces the sparse matrix.
track_names = data_dedup['track_name'].fillna('').astype(str)
sparse_embeddings = vectorizer.fit_transform(track_names)

print(f"Created sparse embeddings with shape: {sparse_embeddings.shape}")
print(f"Number of tracks: {len(track_names)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Show the first 30 (feature, column index) pairs of the learned vocabulary
print("\nExample vocabulary entries (feature -> index):")
vocab_items = [
    entry for _, entry in zip(range(30), vectorizer.vocabulary_.items())
]
for feature, idx in vocab_items:
    print(f"  '{feature}': {idx}")


Original number of tracks: 6615091
Deduplicated number of tracks: 674043
Created sparse embeddings with shape: (674043, 248480)
Number of tracks: 674043
Vocabulary size: 248480

Example vocabulary entries (feature -> index):
  ' m': 12630
  'mo': 133198
  'on': 147503
  'ne': 137696
  'ey': 94333
  'y ': 192497
  ' mo': 13035
  'mon': 133415
  'one': 147756
  'ney': 138248
  'ey ': 94334
  ' mon': 13055
  'mone': 133430
  'oney': 147796
  'ney ': 138249
  ' l': 12080
  'lo': 127210
  'ng': 138450
  'ge': 99368
  'er': 91743
  'r ': 156998
  ' lo': 12386
  'lon': 127452
  'ong': 147813
  'nge': 138645
  'ger': 99623
  'er ': 91744
  ' lon': 12407
  'long': 127467
  'onge': 147833


In [None]:
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http.models import PointStruct
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Initialize Qdrant client.
# Connection settings are read from the environment so that the API key is
# never stored in the notebook. QDRANT_URL falls back to the cluster this
# notebook was originally written against; QDRANT_API_KEY has no fallback.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed in the notebook's history and should be rotated.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://3ed0dfdc-3f3f-4424-9123-87025b9842df.us-east4-0.gcp.cloud.qdrant.io",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    raise RuntimeError(
        "Set the QDRANT_API_KEY environment variable before running this cell."
    )

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60
)

COLLECTION_NAME = "track_sparse_embeddings"

# Create the collection only when it does not exist yet, so this cell can be
# re-run (e.g. after an interrupted upload) without attempting to create the
# same collection a second time.
if not qdrant_client.collection_exists(COLLECTION_NAME):
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={},  # Empty dict: the collection holds no dense vectors
        sparse_vectors_config={
            # Name under which points store (and queries address) the sparse vector
            "sparse_vector": models.SparseVectorParams(
                index=models.SparseIndexParams()
            )
        },
    )
    print(f"Created collection '{COLLECTION_NAME}'")
else:
    print(f"Collection '{COLLECTION_NAME}' already exists")

# Bridge between scipy sparse rows and Qdrant's SparseVector model
def convert_to_qdrant_sparse(sparse_matrix_row):
    """Build a Qdrant ``SparseVector`` from one row of a scipy sparse matrix.

    The row is converted to COO form; its column positions become the vector
    indices (cast to uint32) and its stored entries become the vector values
    (cast to float32). Both are handed to Qdrant as plain Python lists.
    """
    coo_row = sparse_matrix_row.tocoo()
    return models.SparseVector(
        indices=coo_row.col.astype(np.uint32).tolist(),
        values=coo_row.data.astype(np.float32).tolist(),
    )

# Upload embeddings to Qdrant (do this once)
print("Uploading embeddings to Qdrant...")
BATCH_SIZE = 1000

# Extract the payload columns once as plain Python lists. Looking each value
# up with data_dedup.iloc[j][...] builds a full row Series per access, which
# dominates the loop on a frame of this size. Missing values are replaced by
# '' and everything is coerced to str -- the same normalisation applied to the
# titles fed to the vectorizer -- so a payload never carries a float NaN.
payload_track_names = data_dedup['track_name'].fillna('').astype(str).tolist()
payload_artist_names = data_dedup['artist_name'].fillna('').astype(str).tolist()

for i in range(0, len(data_dedup), BATCH_SIZE):
    # The final batch may be shorter than BATCH_SIZE
    batch_end = min(i + BATCH_SIZE, len(data_dedup))

    # Point id j is the row position in data_dedup, which is also the row of
    # sparse_embeddings holding that track's vector.
    batch_points = [
        PointStruct(
            id=j,
            vector={"sparse_vector": convert_to_qdrant_sparse(sparse_embeddings[j])},
            payload={
                "track_name": payload_track_names[j],
                "artist_name": payload_artist_names[j],
            },
        )
        for j in range(i, batch_end)
    ]

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=batch_points
    )

    # Report every 10th batch, using the clamped end index so the count never
    # exceeds the number of rows.
    if (i // BATCH_SIZE + 1) % 10 == 0:
        print(f"Uploaded {batch_end} / {len(data_dedup)} embeddings...")

print(f"Successfully uploaded {len(data_dedup)} embeddings to Qdrant!")

KeyboardInterrupt: 

In [70]:
# Query helper built on the Qdrant query_points API
def search_qdrant(query, top_k=10):
    """Look up the tracks whose stored sparse vectors best match ``query``.

    The query text is vectorized with the fitted ``vectorizer``, converted to
    a Qdrant sparse vector, and sent to the collection's "sparse_vector"
    index. Returns a list of at most ``top_k`` dicts, in the order Qdrant
    returned them, each with keys 'track_name', 'artist_name' and
    'similarity_score'.
    """
    query_sparse = convert_to_qdrant_sparse(vectorizer.transform([query])[0])

    response = qdrant_client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_sparse,       # passed as a SparseVector instance
        using="sparse_vector",    # name of the sparse vector to search against
        limit=top_k,
        with_payload=True,
    )

    matches = []
    for point in response.points:
        matches.append({
            'track_name': point.payload['track_name'],
            'artist_name': point.payload['artist_name'],
            'similarity_score': point.score,
        })
    return matches

# Smoke test: search for a known title and print the ranked matches
print("\nTesting search...")
results = search_qdrant("broccoli", top_k=5)
report_lines = [
    f"{i}. {result['track_name']} by {result['artist_name']} (score: {result['similarity_score']:.4f})"
    for i, result in enumerate(results, 1)
]
for line in report_lines:
    print(line)



Testing search...
1. Broccoli - Remix by KMX (score: 0.9400)
2. Allergic Reaction to Broccoli - Broccoli Bars 1 by Dan Bull (score: 0.7783)
3. Beef n Broccoli by DeJ Loaf (score: 0.7468)
4. Beef & Broccoli by Immortal Technique (score: 0.7468)
5. Broccoli (feat. Lil Yachty) by DRAM (score: 0.6551)
