## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

## Load Dataset


In [2]:
from google.colab import drive
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [3]:
root = "/content/gdrive/My Drive/kaggle/spotify-dataset/"
# path = "/content/gdrive/My Drive/genres_v2.csv"
genres = pd.read_csv(root+"genres_v2.csv")
print(genres.shape)
genres.head()

(42305, 22)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name,Unnamed: 0,title
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,audio_features,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde,,
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,audio_features,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology,,
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,audio_features,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote,,
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,audio_features,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote),,
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,147.988,audio_features,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom,,


In [4]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42305 entries, 0 to 42304
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      42305 non-null  float64
 1   energy            42305 non-null  float64
 2   key               42305 non-null  int64  
 3   loudness          42305 non-null  float64
 4   mode              42305 non-null  int64  
 5   speechiness       42305 non-null  float64
 6   acousticness      42305 non-null  float64
 7   instrumentalness  42305 non-null  float64
 8   liveness          42305 non-null  float64
 9   valence           42305 non-null  float64
 10  tempo             42305 non-null  float64
 11  type              42305 non-null  object 
 12  id                42305 non-null  object 
 13  uri               42305 non-null  object 
 14  track_href        42305 non-null  object 
 15  analysis_url      42305 non-null  object 
 16  duration_ms       42305 non-null  int64 

## Drop Duplicate Songs

In [6]:
genres['id'].unique().shape
genres.drop_duplicates(['id'], inplace=True)
print(genres.shape)

(35877, 22)


## Remove unnecessary features
1. **URI :** URI is the unique identifier spotify assigns to every track, artist and album
2. **type :** is filled with only one entry audi_features
3. **track_href** and **analysis_url** are urls and unimportant in analysis
4. **title**
5. **Unnamed: 0**
6. **id**

In [8]:
genres.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'genre', 'song_name', 'Unnamed: 0', 'title'],
      dtype='object')

In [9]:
columns_to_drop = ['type', 'id', 'uri', 'track_href', 'analysis_url', 'Unnamed: 0', 'title']
genres.drop(columns_to_drop, axis=1, inplace=True)
print(genres.shape)

(35877, 15)


In [10]:
genres

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre,song_name
0,0.831,0.814,2,-7.364,1,0.4200,0.059800,0.013400,0.0556,0.3890,156.985,124539,4,Dark Trap,Mercury: Retrograde
1,0.719,0.493,8,-7.230,1,0.0794,0.401000,0.000000,0.1180,0.1240,115.080,224427,4,Dark Trap,Pathology
2,0.850,0.893,5,-4.783,1,0.0623,0.013800,0.000004,0.3720,0.0391,218.050,98821,4,Dark Trap,Symbiote
3,0.476,0.781,0,-4.710,1,0.1030,0.023700,0.000000,0.1140,0.1750,186.948,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote)
4,0.798,0.624,2,-7.668,1,0.2930,0.217000,0.000000,0.1660,0.5910,147.988,123298,4,Dark Trap,Venom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42298,0.502,0.991,5,-4.333,0,0.2400,0.005540,0.000198,0.4700,0.0797,150.029,200000,4,hardstyle,
42300,0.528,0.693,4,-5.148,1,0.0304,0.031500,0.000345,0.1210,0.3940,150.013,269208,4,hardstyle,
42302,0.361,0.821,8,-3.102,1,0.0505,0.026000,0.000242,0.3850,0.1240,154.935,234823,4,hardstyle,
42303,0.477,0.921,6,-4.777,0,0.0392,0.000551,0.029600,0.0575,0.4880,150.042,323200,4,hardstyle,


### Encode Genre
Genre is a categorical feature, we need to encode it to make it numeric

In [13]:
unique_genres = genres['genre'].unique()
print(unique_genres)
n = unique_genres.shape[0]
genres_map = {k:v for k,v in zip(unique_genres, range(n))}
print(genres_map)

['Dark Trap' 'Underground Rap' 'Trap Metal' 'Emo' 'Rap' 'RnB' 'Pop'
 'Hiphop' 'techhouse' 'techno' 'trance' 'psytrance' 'trap' 'dnb'
 'hardstyle']
{'Dark Trap': 0, 'Underground Rap': 1, 'Trap Metal': 2, 'Emo': 3, 'Rap': 4, 'RnB': 5, 'Pop': 6, 'Hiphop': 7, 'techhouse': 8, 'techno': 9, 'trance': 10, 'psytrance': 11, 'trap': 12, 'dnb': 13, 'hardstyle': 14}


In [14]:
data = genres.replace({'genre':genres_map}, inplace=False)
data['genre']

0         0
1         0
2         0
3         0
4         0
         ..
42298    14
42300    14
42302    14
42303    14
42304    14
Name: genre, Length: 35877, dtype: int64

In [15]:
data

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre,song_name
0,0.831,0.814,2,-7.364,1,0.4200,0.059800,0.013400,0.0556,0.3890,156.985,124539,4,0,Mercury: Retrograde
1,0.719,0.493,8,-7.230,1,0.0794,0.401000,0.000000,0.1180,0.1240,115.080,224427,4,0,Pathology
2,0.850,0.893,5,-4.783,1,0.0623,0.013800,0.000004,0.3720,0.0391,218.050,98821,4,0,Symbiote
3,0.476,0.781,0,-4.710,1,0.1030,0.023700,0.000000,0.1140,0.1750,186.948,123661,3,0,ProductOfDrugs (Prod. The Virus and Antidote)
4,0.798,0.624,2,-7.668,1,0.2930,0.217000,0.000000,0.1660,0.5910,147.988,123298,4,0,Venom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42298,0.502,0.991,5,-4.333,0,0.2400,0.005540,0.000198,0.4700,0.0797,150.029,200000,4,14,
42300,0.528,0.693,4,-5.148,1,0.0304,0.031500,0.000345,0.1210,0.3940,150.013,269208,4,14,
42302,0.361,0.821,8,-3.102,1,0.0505,0.026000,0.000242,0.3850,0.1240,154.935,234823,4,14,
42303,0.477,0.921,6,-4.777,0,0.0392,0.000551,0.029600,0.0575,0.4880,150.042,323200,4,14,


## Drop Song name Column
- Data should contain all numerical features

In [17]:
song_name = data['song_name']
print(song_name.shape)
data.drop(['song_name'], axis=1, inplace=True)
print(data.shape)
data.head()

(35877,)
(35877, 14)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,124539,4,0
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,224427,4,0
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,98821,4,0
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,123661,3,0
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,147.988,123298,4,0


In [18]:
song_name

0                                  Mercury: Retrograde
1                                            Pathology
2                                             Symbiote
3        ProductOfDrugs (Prod. The Virus and Antidote)
4                                                Venom
                             ...                      
42298                                              NaN
42300                                              NaN
42302                                              NaN
42303                                              NaN
42304                                              NaN
Name: song_name, Length: 35877, dtype: object

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35877 entries, 0 to 42304
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      35877 non-null  float64
 1   energy            35877 non-null  float64
 2   key               35877 non-null  int64  
 3   loudness          35877 non-null  float64
 4   mode              35877 non-null  int64  
 5   speechiness       35877 non-null  float64
 6   acousticness      35877 non-null  float64
 7   instrumentalness  35877 non-null  float64
 8   liveness          35877 non-null  float64
 9   valence           35877 non-null  float64
 10  tempo             35877 non-null  float64
 11  duration_ms       35877 non-null  int64  
 12  time_signature    35877 non-null  int64  
 13  genre             35877 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 5.4 MB


## Normalize the data
- Check for nulls before normalizing


In [20]:
data.isnull().sum()

danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
genre               0
dtype: int64

In [21]:
scaler = MinMaxScaler()

In [22]:
scaler.fit(data.values)
columns = data.columns
data = pd.DataFrame(scaler.transform(data.values), columns=columns)
print(data.head())
print(data.tail())

   danceability    energy       key  ...  duration_ms  time_signature  genre
0      0.829884  0.813955  0.181818  ...     0.111487            0.75    0.0
1      0.708527  0.492877  0.727273  ...     0.224043            0.75    0.0
2      0.850471  0.892974  0.454545  ...     0.082507            0.75    0.0
3      0.445227  0.780947  0.000000  ...     0.110497            0.50    0.0
4      0.794127  0.623909  0.181818  ...     0.110088            0.75    0.0

[5 rows x 14 columns]
       danceability    energy       key  ...  duration_ms  time_signature  genre
35872      0.473399  0.990998  0.454545  ...     0.196518            0.75    1.0
35873      0.501571  0.692925  0.363636  ...     0.274503            0.75    1.0
35874      0.320620  0.820956  0.727273  ...     0.235757            0.75    1.0
35875      0.446311  0.920981  0.545455  ...     0.335342            0.75    1.0
35876      0.502655  0.944987  0.818182  ...     0.153880            0.75    1.0

[5 rows x 14 columns]


In [24]:
data.shape

(35877, 14)

In [26]:
print(data.min(axis=0))

danceability        0.0
energy              0.0
key                 0.0
loudness            0.0
mode                0.0
speechiness         0.0
acousticness        0.0
instrumentalness    0.0
liveness            0.0
valence             0.0
tempo               0.0
duration_ms         0.0
time_signature      0.0
genre               0.0
dtype: float64


In [27]:
data.max(axis=0)

danceability        1.0
energy              1.0
key                 1.0
loudness            1.0
mode                1.0
speechiness         1.0
acousticness        1.0
instrumentalness    1.0
liveness            1.0
valence             1.0
tempo               1.0
duration_ms         1.0
time_signature      1.0
genre               1.0
dtype: float64

## Recommend songs based on a given song


In [36]:
def recommend_song(query):
  """
  Arguments:
  query: song embedding/ vector based on which similar songs will be recommended.
  Returns:
  None 
  Prints top 10 similar songs
  """
  
  similarity = data.dot(query).nlargest(10)
  for i,ix in enumerate(similarity.index):
    print(f"{i+1}.", song_name[ix], round(similarity[ix],3))

In [40]:
i = 7
print("="*30)
print("Query:", song_name[i])
print("="*30)
query = data.iloc[i,:]
recommend_song(query)

Query: T.R.U. (Totally Rotten Underground)
1. Good Day 4.644
2. Big Drip (feat. Lil Baby & Quavo) - Remix 4.634
3. Falling for You 4.623
4. I Just Had Sex 4.602
5. Big Checks (feat. YG) 4.592
6. Every Step Every Way 4.581
7. IN THIS SONG EVERYONE DIES 4.579
8. Do You Remember 4.576
9. Salute (feat. French Montana) 4.571
10. Why 4.57
