## Content-Based Music Recommendation (Nearest Neighbors)

This section builds a **content-based** recommender using song/audio metadata from `df_clean`. We use a Nearest Neighbors model over engineered features and return the **10 most similar songs** for a given input song.

In [1]:
# ============================================================================
# 1) Imports
# ============================================================================

import os
import warnings

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

warnings.filterwarnings('ignore')

print('Imports ready.')

Imports ready.


In [2]:
# ============================================================================
# 2) Load df_clean
# ============================================================================

# Location of the cleaned dataset. Set the DF_CLEAN_PATH environment variable to
# run this notebook on another machine; when it is unset, the original local
# path below is used, so existing setups keep working unchanged.
df_clean_path = os.environ.get(
    'DF_CLEAN_PATH',
    r"D:\UNH Materials\Projects\Spotify Song Recommendations\data\df_clean.csv",
)
df_clean = pd.read_csv(df_clean_path)

print('df_clean loaded:', df_clean.shape)
display(df_clean.head())

df_clean loaded: (85000, 32)


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,genre,duration_ms,popularity,danceability,energy,...,release_week,release_day_of_week,duration_min,popularity_bin,stream_category,key_label,energy_danceability,popularity_energy,instrumentalness_energy,mode_label
0,TRK-BEBD53DA84E1,Agent every (0),Noah Rhodes,Beautiful instead,2016-04-01,Pop,234194,55,0.15,0.74,...,13,Friday,3.903233,medium,low_stream,A,0.111,0.407,0.32264,Minor
1,TRK-6A32496762D7,Night respond,Jennifer Cole,Table,2022-04-15,Metal,375706,45,0.44,0.46,...,15,Friday,6.261767,medium,low_stream,C,0.2024,0.207,0.10258,Minor
2,TRK-47AA7523463E,Future choice whatever,Brandon Davis,Page southern,2016-02-23,Rock,289191,55,0.62,0.8,...,8,Tuesday,4.81985,medium,low_stream,G#,0.496,0.44,0.4672,Major
3,TRK-25ADA22E3B06,Bad fall pick those,Corey Jones,Spring,2015-10-12,Pop,209484,51,0.78,0.98,...,42,Monday,3.4914,medium,low_stream,C#,0.7644,0.4998,0.67032,Major
4,TRK-9245F2AD996A,Husband,Mark Diaz,Great prove,2022-07-08,Indie,127435,39,0.74,0.18,...,27,Friday,2.123917,medium,low_stream,A#,0.1332,0.0702,0.05472,Minor


In [57]:
# ============================================================================
# 3) Feature selection (content-based)
#    We avoid IDs/names for modeling, but keep them for display.
# ============================================================================

# Strong content signals: audio + a bit of metadata
requested_numerical = [
    'danceability', 'energy', 'loudness',
    'instrumentalness', 'tempo', 'popularity'
]

# Categorical signals are currently disabled.
# Candidates to try later: 'genre', 'country', 'label'
requested_categorical = []

# Keep only columns that exist, but report the ones that were skipped so a
# renamed/missing column does not silently change the feature space.
numerical_features = [c for c in requested_numerical if c in df_clean.columns]
categorical_features = [c for c in requested_categorical if c in df_clean.columns]

missing_features = [
    c for c in requested_numerical + requested_categorical
    if c not in df_clean.columns
]
if missing_features:
    print('Skipping features not found in df_clean:', missing_features)

# Fail here with a clear message instead of later inside the model fit
if not numerical_features and not categorical_features:
    raise ValueError('None of the requested feature columns exist in df_clean.')

# ID/display columns are optional: they are used for lookup and output only
required_id_cols = ['track_id', 'track_name', 'artist_name']
available_id_cols = [c for c in required_id_cols if c in df_clean.columns]

print('Numerical features:', numerical_features)
print('Categorical features:', categorical_features)
print('ID/display columns:', available_id_cols)

# Basic cleaning for modeling (copy so df_clean itself is never modified)
model_df = df_clean[available_id_cols + numerical_features + categorical_features].copy()

# Fill missing values: column median for numericals, 'Unknown' for categoricals
for c in numerical_features:
    model_df[c] = model_df[c].fillna(model_df[c].median())
for c in categorical_features:
    model_df[c] = model_df[c].fillna('Unknown')

# A numeric column that is entirely NaN has a NaN median, so it cannot be imputed
unfilled = [c for c in numerical_features if model_df[c].isna().any()]
if unfilled:
    raise ValueError(f'Numerical features still contain missing values after imputation: {unfilled}')

print('Modeling dataframe:', model_df.shape)
display(model_df.head())

Numerical features: ['danceability', 'energy', 'loudness', 'instrumentalness', 'tempo', 'popularity']
Categorical features: []
ID/display columns: ['track_id', 'track_name', 'artist_name']
Modeling dataframe: (85000, 9)


Unnamed: 0,track_id,track_name,artist_name,danceability,energy,loudness,instrumentalness,tempo,popularity
0,TRK-BEBD53DA84E1,Agent every (0),Noah Rhodes,0.15,0.74,-32.22,0.436,73.12,55
1,TRK-6A32496762D7,Night respond,Jennifer Cole,0.44,0.46,-14.02,0.223,157.74,45
2,TRK-47AA7523463E,Future choice whatever,Brandon Davis,0.62,0.8,-48.26,0.584,71.03,55
3,TRK-25ADA22E3B06,Bad fall pick those,Corey Jones,0.78,0.98,-34.47,0.684,149.0,51
4,TRK-9245F2AD996A,Husband,Mark Diaz,0.74,0.18,-17.84,0.304,155.85,39


In [58]:
# ============================================================================
# 4) Build feature matrix + fit Nearest Neighbors model
# ============================================================================

# Columns fed to the model, numericals first and categoricals after
feature_columns = numerical_features + categorical_features

# Column-wise preprocessing: standardize the numerical block, one-hot encode the
# categorical block, and drop every column that is not listed.
preprocess = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='drop',
)

# Neighbor search runs on the transformed features using cosine distance
nn_model = NearestNeighbors(algorithm='auto', metric='cosine')

# Step names 'preprocess' and 'nn' are looked up again when querying neighbors
pipe = Pipeline(steps=[('preprocess', preprocess), ('nn', nn_model)])

X = model_df.loc[:, feature_columns]
pipe.fit(X)

print('NearestNeighbors model fit complete.')

NearestNeighbors model fit complete.


In [59]:
# ============================================================================
# 5) Recommendation function
#    Input can be track_name (recommended) or track_id.
# ============================================================================

# Lookup tables from a song key (as a string) to its row label in model_df
def _first_row_by_key(column):
    """Map each distinct string value of `column` to the row label of its first occurrence."""
    keys = model_df[column].astype(str)
    # duplicated(keep='first') flags every repeat, so the negation keeps first occurrences
    is_first = ~keys.duplicated(keep='first')
    first_keys = keys[is_first].tolist()
    first_labels = model_df.index[is_first.to_numpy()].tolist()
    return dict(zip(first_keys, first_labels))


# A table stays None when its column is not part of model_df
name_to_idx = _first_row_by_key('track_name') if 'track_name' in model_df.columns else None
id_to_idx = _first_row_by_key('track_id') if 'track_id' in model_df.columns else None


def _resolve_song_index(song):
    """Return the model_df row label matching `song`, or None when nothing matches.

    Lookup order: exact track_id, exact track_name, case-insensitive exact
    track_name, then case-insensitive substring of track_name (first hit wins).
    """
    if id_to_idx is not None and song in id_to_idx:
        return id_to_idx[song]
    if name_to_idx is not None and song in name_to_idx:
        return name_to_idx[song]
    if 'track_name' not in model_df.columns:
        return None

    names_lower = model_df['track_name'].astype(str).str.lower()
    needle = song.lower()

    # Prefer a full-name match so e.g. 'husband' resolves to "Husband" rather
    # than to an earlier row that merely contains the word.
    exact = (names_lower == needle).to_numpy()
    if exact.any():
        return model_df.index[exact][0]

    # regex=False: the user's text is matched literally, so names containing
    # characters such as '(' or '+' neither mis-match nor raise re.error.
    partial = names_lower.str.contains(needle, regex=False, na=False).to_numpy()
    if partial.any():
        return model_df.index[partial][0]

    return None


def recommend_songs(song, k=10, verbose=True):
    """Return the k closest songs for a given input song (track_name or track_id).

    The input is resolved to a row of model_df (exact track_id, exact
    track_name, then case-insensitive exact / substring match on track_name).
    Its feature row is transformed with the fitted preprocessing step and the
    fitted NearestNeighbors step is queried for k + 1 neighbors so the input
    song itself can be dropped from the result.

    Parameters
    ----------
    song : str
        A track name (e.g., "Blinding Lights") OR a track_id. Surrounding
        whitespace is ignored when the text as given does not match anything.
    k : int
        Number of recommendations to return (non-negative).
    verbose : bool
        If True, prints the matched input row.

    Returns
    -------
    pd.DataFrame
        Up to k similar songs, most similar first, with the available display
        columns and a `similarity` column (1 - cosine distance).

    Raises
    ------
    ValueError
        If `song` is None/blank or `k` is negative.
    KeyError
        If no track matches `song`.
    """
    if song is None or str(song).strip() == '':
        raise ValueError('Please provide a non-empty song name or track_id.')
    if k < 0:
        raise ValueError('k must be a non-negative integer.')

    song = str(song)

    # Resolve index: try the text exactly as given, then without outer whitespace
    idx = _resolve_song_index(song)
    if idx is None and song != song.strip():
        idx = _resolve_song_index(song.strip())

    if idx is None:
        raise KeyError(f"Song '{song}' not found. Try an exact track_name or a valid track_id.")

    if verbose:
        print('Matched input song:')
        cols_to_show = available_id_cols + (['genre'] if 'genre' in model_df.columns else [])
        display(model_df.loc[[idx], cols_to_show])

    # Query neighbors: ask for k+1 so we can drop the song itself
    query_X = model_df.loc[[idx], numerical_features + categorical_features]
    distances, indices = pipe.named_steps['nn'].kneighbors(
        pipe.named_steps['preprocess'].transform(query_X),
        n_neighbors=min(k + 1, len(model_df))
    )

    # kneighbors returns row positions; the model was fit on model_df's rows
    # in order, so positions select the neighbor rows directly.
    results = model_df.iloc[indices.ravel()].copy()
    results['distance_cosine'] = distances.ravel()
    results['similarity'] = 1 - results['distance_cosine']

    # Drop the input song itself
    results = results[results.index != idx]

    # Output columns: track_id first, then whichever display columns exist
    wanted_cols = ['track_id', 'track_name', 'artist_name', 'popularity', 'genre', 'country', 'label']
    cols_out = [c for c in wanted_cols if c in results.columns]

    # Stable sort keeps the neighbor order for ties in similarity
    out = results.sort_values('similarity', ascending=False, kind='mergesort').head(k)
    out = out[cols_out + ['similarity']]

    return out.reset_index(drop=True)


# Example usage (exact names, partial names and track_ids all work):
# recommend_songs('Night respond')
print('Function recommend_songs(song, k=10) is ready.')

Function recommend_songs(song, k=10) is ready.


In [60]:
# Quick demo: recommend songs similar to one example track from the dataset.
# EXAMPLE_ROW_POSITION is an arbitrary positional row of df_clean; change it to try another song.
EXAMPLE_ROW_POSITION = 4302

# Look the song up by name when names exist, otherwise by track_id
lookup_col = 'track_name' if 'track_name' in df_clean.columns else 'track_id'
example_song_name = str(df_clean[lookup_col].iloc[EXAMPLE_ROW_POSITION])

# When the key is duplicated, show the first matching row: recommend_songs also
# resolves a duplicated track_name to its first occurrence.
example_song = df_clean[df_clean[lookup_col].astype(str) == example_song_name].iloc[0]

print("\n--- Testing Recommendation System ---")
print("\nOriginal Track:")
for label, col in [('Track', 'track_name'), ('Artist', 'artist_name'),
                   ('Genre', 'genre'), ('Popularity', 'popularity')]:
    # .get keeps the demo running when an optional display column is absent
    print(f"  {label}: {example_song.get(col, 'N/A')}")

print("\nTop 10 Recommended Similar Tracks:")

recs = recommend_songs(example_song_name, k=10, verbose=False)
display(recs)


--- Testing Recommendation System ---

Original Track:
  Track: Week your your
  Artist: Steven Carroll
  Genre: EDM
  Popularity: 34

Top 10 Recommended Similar Tracks:


Unnamed: 0,track_id,track_name,artist_name,popularity,similarity
0,TRK-CDDF68DB8717,Several recent message behavior,Molly Figueroa,36,0.98963
1,TRK-CBE7326FE6D8,Court successful pull left,Courtney Burnett,29,0.98867
2,TRK-6CAA04876402,Thing offer,Melissa Garcia,33,0.988361
3,TRK-68D332736697,Yes now around,Bryan Ponce,36,0.986987
4,TRK-3F1D2041E164,Television,Linda Ramirez,35,0.985808
5,TRK-ABAB7BB2AA16,Television special,Abigail Davidson,35,0.985405
6,TRK-BB75B657729C,Job address,Michael Roberts,38,0.984826
7,TRK-B3201A8C04DA,Pick show,Gary Walsh,30,0.983919
8,TRK-9E85073BF716,Stuff,Carol Mendoza,35,0.983142
9,TRK-2B95E75E3557,Light trouble,Angela Rivera,40,0.982641
