# SongSearch
Using Spotify song data for similarity search.

In [241]:
import pandas as pd
import numpy as np

---
## Dataset
Uploaded December 2023.

source: https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs

### Import the song data

In [242]:
# Load the Kaggle Spotify songs CSV (path is relative to the notebook's working directory)
csv_file = "../dataset/spotify_songs.csv"
df = pd.read_csv(csv_file)
print(f'No of songs: {df.shape[0]}, No of columns: {df.shape[1]}') # each row is one song, each column one field

No of songs: 32833, No of columns: 23


In [243]:
df.head() # display the first 5 rows as a quick sanity check of the load

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [244]:
# Column names, non-null counts and dtypes; used to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

In [245]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0
mean,42.477081,0.65485,0.698619,5.374471,-6.719499,0.565711,0.107068,0.175334,0.084747,0.190176,0.510561,120.881132,225799.811622
std,24.984074,0.145085,0.18091,3.611657,2.988436,0.495671,0.101314,0.219633,0.22423,0.154317,0.233146,26.903624,59834.006182
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0
25%,24.0,0.563,0.581,2.0,-8.171,0.0,0.041,0.0151,0.0,0.0927,0.331,99.96,187819.0
50%,45.0,0.672,0.721,6.0,-6.166,1.0,0.0625,0.0804,1.6e-05,0.127,0.512,121.984,216000.0
75%,62.0,0.761,0.84,9.0,-4.645,1.0,0.132,0.255,0.00483,0.248,0.693,133.918,253585.0
max,100.0,0.983,1.0,11.0,1.275,1.0,0.918,0.994,0.994,0.996,0.991,239.44,517810.0


---


### Processing the dataset:
Process genres

In [246]:
# Get the unique genres and subgenres
unique_genres = df['playlist_genre'].unique()
unique_subgenres = df['playlist_subgenre'].unique()

# Give every genre its own block of 100 ids: 100, 200, 300, ...
# Numbering starts at 1 (not 0) so that no real genre is assigned id 0,
# which is reserved for 'Other' and is also the fill value used below for
# rows whose genre is not in the mapping.
genre_mapping = {genre: i * 100 for i, genre in enumerate(unique_genres, start=1)}
genre_mapping['Other'] = 0  # Assign 0 to 'Other' genre

# Give every subgenre an id inside its parent genre's block
# (genre id + 1, + 2, ...). This assumes fewer than 100 subgenres per genre;
# beyond that the ids would run into the next genre's block.
subgenre_mapping = {}
for genre in unique_genres:
    # .loc with a boolean mask avoids chained indexing
    subgenres = df.loc[df['playlist_genre'] == genre, 'playlist_subgenre'].unique()
    for i, subgenre in enumerate(subgenres, start=1):
        subgenre_mapping[subgenre] = genre_mapping[genre] + i

subgenre_mapping['Other'] = 0  # Assign 0 to 'Other' subgenre

# Create new columns 'genre_id' and 'subgenre_id' with the assigned numbers;
# anything that is not in the mappings falls back to 0 ('Other').
df['genre_id'] = df['playlist_genre'].map(genre_mapping).fillna(0).astype(int)
df['subgenre_id'] = df['playlist_subgenre'].map(subgenre_mapping).fillna(0).astype(int)

Create song embeddings from characteristics

In [302]:
# Audio features that make up each song's embedding vector.
# "genre_id" and "subgenre_id" can be prepended to this list to make the
# embedding genre-aware; they are deliberately left out here.
embedding_headers = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

# one row per song, one column per selected feature
embedding_df = df[embedding_headers]

Normalizing the embeddings using standardization

In [303]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column (zero mean, unit variance) so that features
# on large scales do not dominate the similarity scores.
scaler = StandardScaler()
scaler.fit(embedding_df)  # learn the per-column mean and standard deviation
embedding_matrix = scaler.transform(embedding_df)  # apply them to the same data
print(embedding_matrix[0])

[ 0.64204909  1.20161406  0.1731999   1.36712341  0.87617693 -0.48136238
 -0.33389784 -0.37795302 -0.80922951  0.03190765  0.04292678]


---
## Similarity Search

Cosine similarity:

    cosine_similarity_vec(songA, songB) = dot_product(songA, songB) / (norm(songA) * norm(songB))
    cosine_similarity_matrix_vec(songsA, songB)[i] = dot_product(songsA[i], songB) / (norm(songsA[i]) * norm(songB))

In [304]:
def cosine_similarity_vec(vecA, vecB):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    vecA, vecB : array-like of shape (d,)
        The two vectors to compare; they must have the same length.

    Returns
    -------
    float
        dot(vecA, vecB) / (||vecA|| * ||vecB||). The result is nan when
        either vector has zero length.
    """
    dp = np.dot(vecA, vecB) # dot product of vector A and B
    # Compute the L2 norm (Euclidean length) of each vector separately;
    # the denominator must use the length of vecB, not vecA twice.
    norm1 = np.linalg.norm(vecA)
    norm2 = np.linalg.norm(vecB)
    return dp / (norm1 * norm2)

def cosine_similarity_matrix_vec(matrixA, vecB):
    """Cosine similarity of every row of ``matrixA`` against ``vecB``.

    ``matrixA`` holds one vector per row; the result has one score per row.
    """
    # numerator: dot product of each row with the query vector
    row_dots = np.dot(matrixA, vecB)
    # denominator: each row's L2 norm times the query vector's L2 norm
    denominators = np.linalg.norm(matrixA, axis=1) * np.linalg.norm(vecB)
    return row_dots / denominators

def cosine_similarity_matrix(matrixA, matrixB):
    """Pairwise cosine similarity between the rows of two matrices.

    Parameters
    ----------
    matrixA : array-like of shape (m, d)
        One vector per row.
    matrixB : array-like of shape (n, d)
        One vector per row, with the same vector length as ``matrixA``.

    Returns
    -------
    numpy.ndarray of shape (m, n)
        Entry ``[i, j]`` is the cosine similarity between ``matrixA[i]`` and
        ``matrixB[j]``. Entries involving a zero-length row are nan.
    """
    matrixA = np.asarray(matrixA, dtype=float)
    matrixB = np.asarray(matrixB, dtype=float)
    # Both inputs store vectors as rows, so B is transposed to get all
    # row-against-row dot products in a single (m, n) matrix.
    dp = np.dot(matrixA, matrixB.T)
    # L2 norm of every row; each matrix contributes its own norms
    norm1 = np.linalg.norm(matrixA, axis=1)
    norm2 = np.linalg.norm(matrixB, axis=1)
    # The outer product lines the norms up with dp: [i, j] -> norm1[i] * norm2[j]
    return dp / np.outer(norm1, norm2)

Softmax function:

In [305]:
def softmax(x):
    """Turn an array of scores into positive weights that sum to 1."""
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / np.sum(weights)

---

Run similarity search

In [327]:
%%time
# Cosine similarity of the query song (row `id`) against every song in the
# dataset, the query song itself included.
# NOTE: the name `id` shadows the built-in id() for the rest of the kernel session.
id = 30000
song = df.iloc[id]
print(f'ID: {id}, Name: {song["track_name"]}, Artist(s): {song["track_artist"]}')
print(f'Genre: [{song["playlist_genre"]}, {song["playlist_subgenre"]}]\n')
# res holds one similarity score per row of embedding_matrix
res = cosine_similarity_matrix_vec(embedding_matrix, embedding_matrix[id])
# softmax_res = softmax(res)

ID: 30000, Name: My House, Artist(s): Flo Rida
Genre: [edm, pop edm]

CPU times: user 89.4 ms, sys: 15.1 ms, total: 105 ms
Wall time: 17.1 ms


In [328]:
%%time
k = 20  # Number of top similar songs to retrieve
top_k_idx = np.argsort(res)[-k:][::-1][1:] # Get the indices of the top k similar songs and ignore first song
top_k_scores = res[top_k_idx] # Get the scores of the top k similar songs

CPU times: user 3.64 ms, sys: 2.32 ms, total: 5.96 ms
Wall time: 2.97 ms


Display metadata of top k songs

In [329]:
# Print the metadata and similarity score of every retrieved song, best match first
for idx, score in zip(top_k_idx, top_k_scores):
    song = df.iloc[idx]  # positional lookup: idx is a row position in embedding_matrix / df
    print(f'ID: {idx}, Name: {song["track_name"]}, Artist(s): {song["track_artist"]}')
    print(f'Genre: [{song["playlist_genre"]}, {song["playlist_subgenre"]}], Score: {score}\n')
    # Debug aid, uncomment to inspect the raw embedding: print(f'{embedding_matrix[idx]}\n')

ID: 30000, Name: My House, Artist(s): Flo Rida
Genre: [edm, pop edm], Score: 1.0

ID: 17859, Name: No Te Vayas, Artist(s): Camilo
Genre: [latin, latin pop], Score: 0.9531604374003954

ID: 23672, Name: Per Un Milione, Artist(s): Boomdabash
Genre: [r&b, hip pop], Score: 0.9513270659052938

ID: 14648, Name: Dirt in my Eyes, Artist(s): Cold War Kids
Genre: [rock, permanent wave], Score: 0.9509304064943395

ID: 25030, Name: Tell Me How You Feel - Radio Mix, Artist(s): Joy Enriquez
Genre: [r&b, new jack swing], Score: 0.9463832653448154

ID: 5325, Name: I Took A Pill In Ibiza - Seeb Remix, Artist(s): Mike Posner
Genre: [pop, indie poptimism], Score: 0.9403111870877422

ID: 3712, Name: I Took A Pill In Ibiza - Seeb Remix, Artist(s): Mike Posner
Genre: [pop, electropop], Score: 0.9403111870877422

ID: 30108, Name: I Took A Pill In Ibiza - Seeb Remix, Artist(s): Mike Posner
Genre: [edm, pop edm], Score: 0.9403111870877422

ID: 2620, Name: Blow That Smoke (feat. Tove Lo), Artist(s): Major Lazer
