### Importing data

In [1]:
!which python
!python --version

/Users/anjal/.pyenv/shims/python
Python 3.10.12


In [2]:
import pandas as pd

df = pd.read_csv('SpotifyFeatures.csv')

print(df.size)

print(df.shape)
df.head()

4189050
(232725, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


### EDA

In [3]:
df.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [4]:
null_counts = df.isnull().sum()

# Check for empty strings (only for object/string type columns)
empty_counts = (df.select_dtypes(include='object') == '').sum()

# Combine both results
missing_summary = pd.DataFrame({
    'Null Values': null_counts,
    'Empty Strings': empty_counts
}).fillna(0).astype(int)

# Show only columns with missing data
missing_summary = missing_summary[(missing_summary['Null Values'] > 0) | (missing_summary['Empty Strings'] > 0)]

print(missing_summary)

            Null Values  Empty Strings
track_name            1              0


In [5]:
df['track_name'] = df['track_name'].fillna('N/A')

In [6]:
key_map = {
    'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5,
    'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11
}
df['key'] = df['key'].map(key_map)

In [7]:
df['mode'] = df['mode'].map({'Minor': 0, 'Major': 1})

In [8]:
df['time_signature'] = df['time_signature'].str.extract(r'(\d+)').astype(float)

### Features

In [9]:
feature_columns = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence'
]

# bias recommendations toward popular songs
feature_columns.append('popularity')

df[feature_columns].head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,0.611,0.389,99373,0.91,0.0,1,0.346,-1.828,1,0.0525,166.969,4.0,0.814,0
1,0.246,0.59,137373,0.737,0.0,6,0.151,-5.559,0,0.0868,174.003,4.0,0.816,1
2,0.952,0.663,170267,0.131,0.0,0,0.103,-13.879,0,0.0362,99.488,5.0,0.368,3
3,0.703,0.24,152427,0.326,0.0,1,0.0985,-12.178,1,0.0395,171.758,4.0,0.227,0
4,0.95,0.331,82625,0.225,0.123,5,0.202,-21.15,1,0.0456,140.576,4.0,0.39,4


### Preprocessing Data

In [10]:
from sklearn.preprocessing import StandardScaler

X = df[feature_columns]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Training KNN

In [11]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(X_scaled)

### Training Autoencoder

In [59]:
def get_recommendations(track_ids, top_n=5):
    if isinstance(track_ids, str):
        track_ids = [track_ids]  # handle single string input

    # Filter valid track IDs and get their indices
    valid_ids = [tid for tid in track_ids if tid in df['track_id'].values]
    if not valid_ids:
        print("No valid track IDs found.")
        return []

    indices = df[df['track_id'].isin(valid_ids)].index.tolist()
    query_vectors = X_scaled[indices]

    # Compute distances to all other songs
    distances, indices = knn.kneighbors(query_vectors, n_neighbors=top_n + 10)

    seen_track_ids = set(valid_ids)  # avoid duplicates and self-matches
    recommendations = []

    for row_dists, row_indices in zip(distances, indices):
        for dist, i in zip(row_dists, row_indices):
            song = df.iloc[i]
            song_id = song['track_id']

            if song_id in seen_track_ids:
                continue

            recommendations.append({
                'track_id': song_id,
                'track_name': song['track_name'],
                'artist_name': song['artist_name'],
                'genre': song['genre'],
                'distance': round(float(dist), 4)
            })
            seen_track_ids.add(song_id)

            if len(recommendations) >= top_n:
                break
        if len(recommendations) >= top_n:
            break

    # Compute similarity %
    for r in recommendations:
        r['similarity'] = round((1 - r['distance']) * 100, 1)

    # Sort by distance (ascending)
    recommendations.sort(key=lambda x: x['distance'])

    return recommendations

### Testing the model

In [60]:
sample_id = df['track_id'].iloc[1104]# or any valid track_id from your dataset
sample_id1 = df['track_id'].iloc[109539]# or any valid track_id from your dataset

print("Selected:", df[df['track_id'] == sample_id][['track_name', 'artist_name']].values[0])
print("Selected:", df[df['track_id'] == sample_id1][['track_name', 'artist_name']].values[0])
    

Selected: ['In the End' 'Linkin Park']
Selected: ['Hymn For The Weekend - Seeb Remix' 'Coldplay']


In [61]:
recommendations = get_recommendations([sample_id,sample_id1], top_n=10)

for rec in recommendations:
    print(f"{rec['track_id']}-{rec['track_name']} by {rec['artist_name']} | Genre: {rec['genre']} | Similarity: {rec['similarity']}%")

45OfR7ugJMgbFDuNOVpIq3-Party On The West Coast (feat. Snoop Dogg) by Matoma | Genre: Dance | Similarity: 97.8%
0xq4ZTcmwBfkPGo4RRKmMe-Gotta Go by CHUNG HA | Genre: Pop | Similarity: 97.4%
61uyGDPJ06MkxJtHgPmuyO-Company by Justin Bieber | Genre: Pop | Similarity: 96.1%
0g5EKLgdKvNlln7TNqBByK-Middle by DJ Snake | Genre: Pop | Similarity: 95.4%
4P5KoWXOxwuobLmHXLMobV-Come As You Are by Nirvana | Genre: Rock | Similarity: 95.3%
6x5deYIe42rgRbStECDjYQ-Infra-Red by Three Days Grace | Genre: Rock | Similarity: 95.2%
4vb4mFvYsr2h6enhjJsq9Y-Water Under the Bridge by Adele | Genre: Soul | Similarity: 95.2%
33IOhptvC2Qoy2UhjiHXLV-Flames by David Guetta | Genre: Dance | Similarity: 95.1%
7IWTIkiWGWNQyYfOLdMrGD-Burn Out (feat. Dewain Whitmore) by Martin Garrix | Genre: Pop | Similarity: 94.8%
5J4ZkQpzMUFojo1CtAZYpn-Love Me Harder by Ariana Grande | Genre: Pop | Similarity: 94.3%


In [57]:
import joblib
import numpy as np

joblib.dump(knn, "model/knn_model.pkl")
joblib.dump(scaler, "model/scaler.pkl")
df.to_csv("data/songs.csv", index=False)
np.save("data/X_scaled.npy", X_scaled)