In [1]:
! pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [2]:
# data analysis
import numpy as np
import pandas as pd

# data viz
import matplotlib.pyplot as plt
import seaborn as sns

# model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from category_encoders.count import CountEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity

# os
from pathlib import Path

# kaggle
import kagglehub

In [3]:
kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm")

'/kaggle/input/million-song-dataset-spotify-lastfm'

In [47]:
dataset_path = Path("/kaggle/input/million-song-dataset-spotify-lastfm")

In [48]:
songs_path = dataset_path / 'Music Info.csv'

user_history_path = dataset_path / 'User Listening History.csv'

In [49]:
pd.set_option('display.max_columns', None)

In [50]:
df_songs_raw = pd.read_csv(songs_path)
df_songs_raw.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [51]:
df_users_raw = pd.read_csv(user_history_path)
df_users_raw.head()

Unnamed: 0,track_id,user_id,playcount
0,TRIRLYL128F42539D1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
1,TRFUPBA128F934F7E1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
2,TRLQPQJ128F42AA94F,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
3,TRTUCUY128F92E1D24,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
4,TRHDDQG12903CB53EE,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1


### **Songs Dataset**

In [52]:
print(df_songs_raw.shape)

(50683, 21)


In [53]:
df_songs_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50683 entries, 0 to 50682
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   track_id             50683 non-null  object 
 1   name                 50683 non-null  object 
 2   artist               50683 non-null  object 
 3   spotify_preview_url  50683 non-null  object 
 4   spotify_id           50683 non-null  object 
 5   tags                 49556 non-null  object 
 6   genre                22348 non-null  object 
 7   year                 50683 non-null  int64  
 8   duration_ms          50683 non-null  int64  
 9   danceability         50683 non-null  float64
 10  energy               50683 non-null  float64
 11  key                  50683 non-null  int64  
 12  loudness             50683 non-null  float64
 13  mode                 50683 non-null  int64  
 14  speechiness          50683 non-null  float64
 15  acousticness         50683 non-null 

In [54]:
df_songs_raw.isna().sum().sort_values(ascending = False).head(3)

Unnamed: 0,0
genre,28335
tags,1127
artist,0


There are 55% **Missing Values** in `genre` and 2% Missing values in `tags`.

In [55]:
missing_percent = (df_songs_raw.isnull().sum() / df_songs_raw.shape[0]) * 100

In [56]:
missing_percent.sort_values(ascending = False).head(3)

Unnamed: 0,0
genre,55.90632
tags,2.223625
artist,0.0


In [57]:
df_songs_raw.drop_duplicates(subset=["spotify_id", "year", "duration_ms"], inplace = True)
df_songs_raw.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [58]:
# removing columns not required for collaborative filtering

cols_to_drop = ["track_id", "name", "spotify_preview_url", "spotify_id", "genre"]
df_features = df_songs_raw.drop(columns=cols_to_drop).copy()

In [59]:
df_features.head()

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,The Killers,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,Oasis,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,Nirvana,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,Franz Ferdinand,"rock, alternative, indie, alternative_rock, in...",2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,Radiohead,"rock, alternative, indie, alternative_rock, in...",2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [60]:
# check for missing values
df_features.isna().sum().sort_values(ascending = False).head(2)

Unnamed: 0,0
tags,1126
artist,0


In [61]:
df_features.fillna({"tags": "no_tags"}, inplace=True)
df_features["artist"] = df_features["artist"].str.lower()

In [62]:
# fill the tags column missing values with string "no_tags"
df_features.fillna({"tags": "no_tags"}, inplace=True)

In [63]:
# check for missing values
df_features["artist"] = df_features["artist"].str.lower()

In [64]:
tag_counts = (
    df_songs_raw["tags"].str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)

popular_tags = tag_counts[tag_counts >= 1000]

In [65]:
popular_tags = tag_counts[tag_counts >= 1000]

In [67]:
popular_tags.head(5)

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
rock,10681
indie,7284
electronic,6592
alternative,6271
pop,4650


In [68]:
freq_enc_cols = ['year']
one_hot_cols = ['artist', 'time_signature', 'key']
tfidf_text_col = 'tags'
std_scale_cols = ["duration_ms", "loudness", "tempo"]
minmax_scale_cols = ["danceability", "energy", "speechiness", "acousticness",
                     "instrumentalness", "liveness", "valence"]

total_trans_cols = len(freq_enc_cols + one_hot_cols + std_scale_cols + minmax_scale_cols)
print("Total features for transformation:", total_trans_cols)

Total features for transformation: 14


In [69]:
preprocessor = ColumnTransformer(transformers = [
    ("freq_enc", CountEncoder(normalize = True, return_df = True), freq_enc_cols),
    ("ohe", OneHotEncoder(handle_unknown = "ignore"), one_hot_cols),
    ("tfidf", TfidfVectorizer(max_features = 85), tfidf_text_col),
    ("std_scaler", StandardScaler(), std_scale_cols),
    ("minmax_scaler", MinMaxScaler(), minmax_scale_cols)
], remainder = 'passthrough', n_jobs = -1, verbose_feature_names_out = False)

In [81]:
preprocessor

In [70]:
preprocessor.fit(df_features)
df_transformed = preprocessor.transform(df_features)

In [71]:
print(df_transformed.shape)
df_transformed[:3]

(50674, 8431)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 62 stored elements and shape (3, 8431)>

In [72]:
df_songs_raw[df_songs_raw["artist"] == "Coldplay"].head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
6,TRVCCWR128F9304A30,Viva la Vida,Coldplay,https://p.scdn.co/mp3-preview/ab747fed1bfab2ac...,08A1lZeyLMWH58DT6aYjnC,"rock, alternative, indie, pop, alternative_roc...",,2013,235384,0.588,0.806,8,-7.903,1,0.105,0.153,0.0,0.0634,0.52,137.973,4
8,TRMZXEW128F9341FD5,The Scientist,Coldplay,https://p.scdn.co/mp3-preview/95cb9df1b056d759...,0GSSsT9szp0rJkBrYkzy6s,"rock, alternative, indie, pop, alternative_roc...",Rock,2007,311014,0.566,0.429,5,-7.826,1,0.0242,0.715,1.4e-05,0.12,0.173,146.365,4
9,TRUJIIV12903CA8848,Clocks,Coldplay,https://p.scdn.co/mp3-preview/24c7fe858b234e3c...,0BCPKOYdS2jbQ8iyB56Zns,"rock, alternative, indie, pop, alternative_roc...",,2002,307879,0.577,0.749,5,-7.215,0,0.0279,0.599,0.0115,0.183,0.255,130.97,4
14,TRXNTMB12903CA884A,Fix You,Coldplay,https://p.scdn.co/mp3-preview/b1640815319b2df5...,0Kk4dfh9Kq7Okg4T58IBVR,"rock, alternative, indie, pop, alternative_roc...",,2016,295826,0.213,0.416,3,-8.683,1,0.0336,0.164,0.00168,0.11,0.129,138.478,4
56,TRCYFSS128F934406C,Don't Panic,Coldplay,https://p.scdn.co/mp3-preview/18d587e6cdcd4e7f...,0MbCcvzLYu3xq6OgEkSbhC,"rock, alternative, indie, pop, alternative_roc...",,1999,158426,0.62,0.6,0,-10.061,1,0.0493,0.0345,0.514,0.108,0.427,118.016,4


In [73]:
song_input_row = df_features[df_songs_raw["name"] == "The Gauntlet"]

if not song_input_row.empty:
    input_vector = preprocessor.transform(song_input_row)

In [74]:
similarities = cosine_similarity(df_transformed, input_vector)
similarities.shape

(50674, 1)

In [76]:
top_indices = np.argsort(similarities.ravel())[-11:][::-1]
df_songs_raw.iloc[top_indices].head(5)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13983,TRMIEPT128F4297BD7,The Gauntlet,Dropkick Murphys,https://p.scdn.co/mp3-preview/6d27508da8653929...,1RkIYoHyYKR8ZlbFM3FZPx,"punk, punk_rock",,2001,167866,0.2,0.965,2,-4.69,1,0.12,0.000398,8e-06,0.0638,0.642,172.151,4
20210,TRXYOJK12903CAC768,A Few Good Men,Dropkick Murphys,https://p.scdn.co/mp3-preview/5760af29403ad796...,0u3jw4N4HKJrr0mcyrNkO5,"punk, hardcore, punk_rock",,2001,156480,0.301,0.927,2,-4.659,1,0.0516,0.00369,0.0,0.579,0.703,147.802,4
13928,TRXINZE128F92EC8C4,Never Alone,Dropkick Murphys,https://p.scdn.co/mp3-preview/2518cacd3705e458...,1gVQoSBuTgAYiXk427UiYE,"punk, punk_rock",,1998,174171,0.239,0.948,9,-3.466,1,0.0705,6.7e-05,0.0,0.15,0.868,168.425,4
40675,TRUXYQN128F934DC64,Rooftops,Alkaline Trio,https://p.scdn.co/mp3-preview/6f1abcb8d0c3bf31...,0Ul4Q6fUufEYxTUuG4Ovca,"punk, punk_rock, cover",,2007,135253,0.303,0.906,2,-5.408,1,0.0562,0.00516,2e-06,0.114,0.592,156.047,4
13965,TRKKFES128F92EEFF3,Black Velvet Band,Dropkick Murphys,https://p.scdn.co/mp3-preview/90eb4e82658eb136...,2LXjAuHyvlQl426hSjCMnD,"punk, punk_rock, cover",,2003,183306,0.237,0.945,7,-2.825,1,0.0761,0.0242,0.0,0.271,0.567,157.387,4


In [77]:
def recommend(song_name, song_data, feature_matrix, top_k=10):
    match = song_data[song_data["name"] == song_name]
    if match.empty:
        print("Song not found.")
        return
    song_idx = match.index[0]
    input_vec = feature_matrix[song_idx].reshape(1, -1)
    similarity = cosine_similarity(input_vec, feature_matrix)
    top_indices = np.argsort(similarity.ravel())[-(top_k+1):][::-1]
    top_recommendations = song_data.iloc[top_indices][['name', 'artist', 'spotify_preview_url']].reset_index(drop = True)
    return top_recommendations

In [78]:
recommend("The Gauntlet", song_data=df_songs_raw, feature_matrix = df_transformed, top_k = 10)

Unnamed: 0,name,artist,spotify_preview_url
0,Sell Yourself,Cage the Elephant,https://p.scdn.co/mp3-preview/f4cc484e715a22ae...
1,Drones In The Valley,Cage the Elephant,https://p.scdn.co/mp3-preview/1c3129b069e2ff6b...
2,Cash Cow,We Are Scientists,https://p.scdn.co/mp3-preview/c0d35dc1927a5d06...
3,The Holiday Song,Pixies,https://p.scdn.co/mp3-preview/e70b6a03080d9e7f...
4,Free Love,Cage the Elephant,https://p.scdn.co/mp3-preview/c23d2c9f53322561...
5,Sticks And Stones,Alien Ant Farm,https://p.scdn.co/mp3-preview/2d17ce3c54d25aca...
6,Here It Goes Again,OK Go,https://p.scdn.co/mp3-preview/7327e3ac55e46020...
7,Fear Of Falling,Maxïmo Park,https://p.scdn.co/mp3-preview/5df5e8d762b9a43a...
8,Pennsylvania,Bloodhound Gang,https://p.scdn.co/mp3-preview/39030ccfebe6811c...
9,Minority,Green Day,https://p.scdn.co/mp3-preview/eda685fb47cabb7b...
