Imports


In [1]:
#!pip install sentence-transformers spotipy

import os, numpy as np, pandas as pd
from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

# bounded-time client)
import spotipy, requests
from spotipy.oauth2 import SpotifyOAuth

  from .autonotebook import tqdm as notebook_tqdm


Spotify Config

In [2]:
useSpotify = True
spotifyTimeout = 1000

Data Loading

In [None]:
'''dataset downloaded from
#https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information
'''

filePath = "song_lyrics.csv"
print("Loading:", filePath)

songs = pd.read_csv(filePath, encoding="latin1", nrows=50_000, low_memory=False)

print("Loaded", len(songs), "rows.")
print(songs.head())

Loading: song_lyrics.csv
Loaded 50000 rows.
               title  tag     artist  year   views  \
0          Killa Cam  rap    Cam'ron  2004  173166   
1         Can I Live  rap      JAY-Z  1996  468624   
2  Forgive Me Father  rap   Fabolous  2003    4743   
3       Down and Out  rap    Cam'ron  2004  144404   
4             Fly In  rap  Lil Wayne  2005   78271   

                                       features  \
0                   {"Cam\\'ron","Opera Steve"}   
1                                            {}   
2                                            {}   
3  {"Cam\\'ron","Kanye West","Syleena Johnson"}   
4                                            {}   

                                              lyrics  id language_cld3  \
0  [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...   1            en   
1  [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...   3            en   
2  Maybe cause I'm eatin\nAnd these bastards fien...   4            en   
3  [Produced by Kanye West a

Column Normalization

In [9]:

renameMap = {}
if "title" in songs.columns: renameMap["title"] = "name"
if "song"  in songs.columns and "name" not in songs.columns: renameMap["song"] = "name"
if "artist" in songs.columns: renameMap["artist"] = "artists"
if "lyrics" in songs.columns: renameMap["lyrics"] = "lyrics"
songs = songs.rename(columns=renameMap)

# ensure required columns exist
if "year" not in songs.columns and "release_date" in songs.columns:
    songs["year"] = pd.to_datetime(songs["release_date"], errors="coerce").dt.year
if "genres" not in songs.columns:
    songs["genres"] = ""

# optional: keep English & non-empty lyrics to manage memory
if "language" in songs.columns:
    songs = songs[songs["language"].astype(str).str.lower().eq("en")]
if "lyrics" in songs.columns:
    songs = songs[songs["lyrics"].astype(str).str.len() > 50]

# keep only the columns we use
keepCols = [c for c in ["name","artists","year","genres","lyrics","language"] if c in songs.columns]
songs = songs[keepCols].copy()

# build doc_text for TF-IDF
songs = songs.fillna({"lyrics": "", "genres": "", "artists": ""})
songs["doc_text"] = (
    songs.get("genres","").astype(str) + " " +
    songs.get("artists","").astype(str).str.replace(r"[\[\]']", " ", regex=True) + " " +
    songs.get("lyrics","").astype(str)
)
print("Shape after filtering:", songs.shape)


Shape after filtering: (49051, 7)


NLP Embedding

In [10]:
# 1) Sentence-BERT embeddings for lyrics (semantic, robust)
sbert = SentenceTransformer("all-MiniLM-L6-v2")
lyricEmb = sbert.encode(songs["lyrics"].tolist(), show_progress_bar=True, normalize_embeddings=True)
# 2) TF-IDF over genres+artists+lyrics (classic text similarity)
tfidf = TfidfVectorizer(
    lowercase=True, stop_words="english",
    min_df=3, max_df=0.9, ngram_range=(1,2)
)
tfidfMat = tfidf.fit_transform(songs["doc_text"].tolist())
tfidfMat = normalize(tfidfMat)

# Precompute NN indices (fast retrieval)
lyricNN = NearestNeighbors(metric="cosine", algorithm="brute").fit(lyricEmb)
metaNN  = NearestNeighbors(metric="cosine", algorithm="brute").fit(tfidfMat)

Batches: 100%|██████████| 1533/1533 [28:23<00:00,  1.11s/it] 


Spotify Re-ranking

In [12]:
sp = None
if useSpotify:
    sp = spotipy.Spotify(
        auth_manager=SpotifyOAuth(
            client_id="ceafc3a577854a5ea64de4d68d74f1d2",
            client_secret="a783732dd6bf4160b0ff6003676ada55",
            redirect_uri="http://127.0.0.1:8888/callback",
            scope=""  # no special scopes needed for audio_features
        ),
        requests_timeout=spotifyTimeout, retries=2
    )

def fetchSpotifyFeatures(trackName: str, year: int | None = None, market="US") -> Dict | None:
    if not sp: return None
    try:
        q = f"track:{trackName}" + (f" year:{int(year)}" if pd.notnull(year) else "")
        r = sp.search(q=q, type="track", limit=1, market=market)
        items = (r or {}).get("tracks", {}).get("items", [])
        if not items: return None
        tid = items[0]["id"]
        feats = sp.audio_features([tid])[0]
        return feats
    except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout):
        return None
    except Exception:
        return None

Helpers

In [16]:
def cosineSimVec(a: np.ndarray, b: np.ndarray) -> float:
    d = (a*b).sum()
    na, nb = norm(a) + 1e-12, norm(b) + 1e-12
    return float(d/(na*nb))

def textToEmbed(query: str) -> np.ndarray:
    return sbert.encode([query], normalize_embeddings=True)[0]

def songRowToEmbed(row) -> np.ndarray:
    return sbert.encode([row["lyrics"]], normalize_embeddings=True)[0]

def hybridScore(rowIdxs, qLyricVec, qTfidfRow, wLyrics=0.7, wMeta=0.3,
                audioBias: Dict[str, float] | None = None):
    # lyric similarity
    lyr = (lyricEmb[rowIdxs] @ qLyricVec)  # already normalized → dot = cosine
    # metadata similarity
    meta = (tfidfMat[rowIdxs]).dot(qTfidfRow.T).toarray().ravel()
    score = wLyrics*lyr + wMeta*meta
    # optional: nudge by audio targets (tempo/energy/valence)
    if audioBias:
        pass
    return score

Text/Seed Based Recommedn

In [17]:
def recommendByText(queryText: str, k: int = 10, wLyrics=0.7, wMeta=0.3) -> pd.DataFrame:
    qVec = textToEmbed(queryText)
    # nearest by lyrics to prune candidate set
    k0 = min(200, len(songs))
    _, idx = lyricNN.kneighbors([qVec], n_neighbors=k0)
    candIdx = idx[0]
    # build a TF-IDF row for the query (use vectorizer’s vocab)
    qTfidf = tfidf.transform([queryText])
    scores = hybridScore(candIdx, qVec, qTfidf, wLyrics=wLyrics, wMeta=wMeta)
    top = candIdx[np.argsort(scores)[::-1][:k]]
    return songs.iloc[top][["name","artists","year","genres"]].reset_index(drop=True)

def recommendBySeed(name: str, year: int | None = None, k: int = 10, wLyrics=0.7, wMeta=0.3) -> pd.DataFrame:
    # try to find the seed row locally (exact on name, +/- 1yr tolerance)
    m = songs[songs["name"].str.lower().eq(name.lower())]
    if year is not None:
        m = m[m["year"].between(int(year)-1, int(year)+1, inclusive="both")]
    if not m.empty and m.iloc[0]["lyrics"]:
        row = m.iloc[0]
        qVec = songRowToEmbed(row)
        seedText = f"{row['genres']} {row['artists']} {row['lyrics']}"
        qTfidf = tfidf.transform([seedText])
    else:
        # fall back to a text query (lyrics unavailable) using title and artists
        queryText = f"{name} {'' if year is None else year}"
        qVec = textToEmbed(queryText)  # semantic text query
        qTfidf = tfidf.transform([queryText])

    # prune by lyric NN, then hybrid re-rank
    k0 = min(200, len(songs))
    _, idx = lyricNN.kneighbors([qVec], n_neighbors=k0)
    candIdx = idx[0]
    scores = hybridScore(candIdx, qVec, qTfidf, wLyrics=wLyrics, wMeta=wMeta)
    top = candIdx[np.argsort(scores)[::-1]]
    # drop the seed itself if present
    out = songs.iloc[top][["name","artists","year","genres"]]
    out = out[~out["name"].str.lower().eq(name.lower())]
    return out.head(k).reset_index(drop=True)

Example

In [18]:
# Natural-language query (pure NLP):
print(recommendByText("moody indie for a rainy night, female vocals, low tempo", k=10))

# Seed song (uses its lyrics/metadata if present, else semantic title query):
print(recommendBySeed("Bloody Sweet", year=2023, k=10))

                                  name                  artists  year genres
0                         Slow and Low             Beastie Boys  1986       
1                          Rainy Rainy                 Fat Boys  1989       
2                          Let It Rain                  Heavy D  1991       
3                            Freestyle  Lateef the Truthspeaker  2007       
4           Rock Steady DJ Clue? Remix            Mary J. Blige  2001       
5  The Roof Back in Time Mobb Deep Mix             Mariah Carey  1998       
6                     Situation: Grimm              Mista Grimm  1995       
7                        The Trap Door                Holocaust  2006       
8                             The Pain                     Murs  2004       
9        Doin Time Marshall Arts Remix                  Sublime  1997       
                                  name  \
0                                 2015   
1                   Sweet Premium Wine   
2                         T