In [1]:
# for data handling
import pandas as pd
import numpy as np

# for visualisation
import plotly
import plotly.express as px
import plotly.io as pio

#Progreebar
from tqdm import tqdm

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# for transformations and predictions
from scipy.optimize import curve_fit
# from yellowbrick.target import FeatureCorrelation
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# For scoring
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score,mean_absolute_error

# For validation
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("spotify14k.csv")

In [3]:
df[0:3]

Unnamed: 0.1,Unnamed: 0,album,track_number,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,artist_name
0,0,SAVAGE MODE II [CHOPPED NOT SLOPPED],1,5O29nkYY8aByZY6X2TD7eO,Purple Savage Mode II Intro [ChopNotSlop Remix],spotify:track:5O29nkYY8aByZY6X2TD7eO,0.243,0.601,0.664,6.6e-05,0.326,-10.427,0.505,123.964,0.2,41,21 Savage
1,1,SAVAGE MODE II [CHOPPED NOT SLOPPED],2,5pMU8APwU6k09iRcmSYNpV,Many Men [ChopNotSlop Remix],spotify:track:5pMU8APwU6k09iRcmSYNpV,0.00171,0.735,0.514,0.28,0.531,-10.561,0.0731,123.755,0.134,43,21 Savage
2,2,SAVAGE MODE II [CHOPPED NOT SLOPPED],3,3anTHzyskaQVkWi8UxSTgj,Runnin [ChopNotSlop Remix],spotify:track:3anTHzyskaQVkWi8UxSTgj,0.0084,0.891,0.386,0.00317,0.159,-10.952,0.291,115.222,0.111,42,21 Savage


In [4]:
df.columns

Index(['Unnamed: 0', 'album', 'track_number', 'id', 'name', 'uri',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity',
       'artist_name'],
      dtype='object')

In [5]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
df[0:3]

Unnamed: 0,album,track_number,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,artist_name
0,SAVAGE MODE II [CHOPPED NOT SLOPPED],1,5O29nkYY8aByZY6X2TD7eO,Purple Savage Mode II Intro [ChopNotSlop Remix],spotify:track:5O29nkYY8aByZY6X2TD7eO,0.243,0.601,0.664,6.6e-05,0.326,-10.427,0.505,123.964,0.2,41,21 Savage
1,SAVAGE MODE II [CHOPPED NOT SLOPPED],2,5pMU8APwU6k09iRcmSYNpV,Many Men [ChopNotSlop Remix],spotify:track:5pMU8APwU6k09iRcmSYNpV,0.00171,0.735,0.514,0.28,0.531,-10.561,0.0731,123.755,0.134,43,21 Savage
2,SAVAGE MODE II [CHOPPED NOT SLOPPED],3,3anTHzyskaQVkWi8UxSTgj,Runnin [ChopNotSlop Remix],spotify:track:3anTHzyskaQVkWi8UxSTgj,0.0084,0.891,0.386,0.00317,0.159,-10.952,0.291,115.222,0.111,42,21 Savage


In [17]:
# visualising every column

In [18]:
# most popular tracks

In [19]:
# most popular artists

In [20]:
# feature selection

In [7]:
# remove the square brackets from the artists

df["artist_name"]=df["artist_name"].str.replace("[", "")
df["artist_name"]=df["artist_name"].str.replace("]", "")
df["artist_name"]=df["artist_name"].str.replace("'", "")

In [8]:
df[400:403]

Unnamed: 0,album,track_number,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,artist_name
400,The Bigger Artist,9,7564ofEBD5mEiIkTclAW1J,Somebody (feat. Don Q),spotify:track:7564ofEBD5mEiIkTclAW1J,0.0982,0.607,0.732,0.0,0.16,-6.779,0.336,100.42,0.687,47,A Boogie Wit Da Hoodie
401,The Bigger Artist,10,1QCWxlNQcpv6LO3gjd11AR,Money Sprung (feat. Don Q),spotify:track:1QCWxlNQcpv6LO3gjd11AR,0.373,0.827,0.556,4e-06,0.0719,-6.237,0.319,130.097,0.34,44,A Boogie Wit Da Hoodie
402,The Bigger Artist,11,6KdA9pz4KS4avq0CUqblAV,If I Gotta Go,spotify:track:6KdA9pz4KS4avq0CUqblAV,0.19,0.664,0.542,0.0,0.324,-6.917,0.27,140.865,0.0852,49,A Boogie Wit Da Hoodie


In [9]:
df.drop("track_number", axis=1, inplace=True)
df[0:3]

Unnamed: 0,album,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,artist_name
0,SAVAGE MODE II [CHOPPED NOT SLOPPED],5O29nkYY8aByZY6X2TD7eO,Purple Savage Mode II Intro [ChopNotSlop Remix],spotify:track:5O29nkYY8aByZY6X2TD7eO,0.243,0.601,0.664,6.6e-05,0.326,-10.427,0.505,123.964,0.2,41,21 Savage
1,SAVAGE MODE II [CHOPPED NOT SLOPPED],5pMU8APwU6k09iRcmSYNpV,Many Men [ChopNotSlop Remix],spotify:track:5pMU8APwU6k09iRcmSYNpV,0.00171,0.735,0.514,0.28,0.531,-10.561,0.0731,123.755,0.134,43,21 Savage
2,SAVAGE MODE II [CHOPPED NOT SLOPPED],3anTHzyskaQVkWi8UxSTgj,Runnin [ChopNotSlop Remix],spotify:track:3anTHzyskaQVkWi8UxSTgj,0.0084,0.891,0.386,0.00317,0.159,-10.952,0.291,115.222,0.111,42,21 Savage


In [10]:
# normalise the columns in the dataframe

def normalize_column(col):
    max_d = df[col].max()
    min_d = df[col].min()
    df[col] = (df[col] - min_d)/(max_d - min_d)

In [11]:
# normalize all of numerical columns so that min value is 0 and max value is 1

num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num = df.select_dtypes(include=num_types)

for col in num.columns:
    normalize_column(col)

In [58]:
#perform Kmeans Clustering

km = KMeans(n_clusters=25)
pred = km.fit_predict(num)
df['pred'] = pred
normalize_column('pred')

In [12]:
# neighbourhood based collborative filterng recommendation system using similarity metrics
# manhattan distance is calculated for all songs and recommend songs that are similar to it, based on any given song

class recommendSongs():
    
    def __init__(self, data):
        self.data_ = data
    
    #function which returns recommendations, we can also choose the amount of songs to be recommended
    def get_recommendations(self, song_name, n_top):
        distances = []
        #choosing the given song_name and dropping it from the data
        song = self.data_[(self.data_.name.str.lower() == song_name.lower())].head(1).values[0]
        remData = self.data_[self.data_.name.str.lower() != song_name.lower()]
        for recSong in tqdm(remData.values):
            dist = 0
            for col in np.arange(len(remData.columns)):
                #indices of non-numerical columns(id, uri, name, artists, album)
                if not col in [0,1,2,3,14]:
                    #calculating the manhettan distances for each numerical feature
                    dist = dist + np.absolute(float(song[col]) - float(recSong[col]))
            distances.append(dist)
        remData['distance'] = distances
        #sorting our data to be ascending by 'distance' feature
        remData = remData.sort_values('distance')
        columns = ['artist_name', 'name']
        return remData[columns][:n_top]

In [13]:
#Instantiate recommender class
recommender = recommendSongs(df)

In [14]:
#Get recommendations 'Locked Out of Heaven' song
recommender.get_recommendations(song_name='Locked Out of Heaven', n_top=5)

100%|█████████████████████████████████████████████████████████████████████████| 14590/14590 [00:00<00:00, 31968.63it/s]


Unnamed: 0,artist_name,name
8541,Kesha,Die Young
8261,Katy Perry,The One That Got Away
3218,Britney Spears,Toxic
3296,Bruno Mars,Treasure
9318,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow)


In [15]:
#Get recommendations 'That's What I Like' song
recommender.get_recommendations(song_name="That's What I Like", n_top=7)

100%|█████████████████████████████████████████████████████████████████████████| 14590/14590 [00:00<00:00, 33382.23it/s]


Unnamed: 0,artist_name,name
12138,Shawn Mendes,Señorita
2216,Bad Bunny,200 Mph
11593,Rihanna,Consideration
12635,Taylor Swift,Paper Rings
2183,Bad Bunny,BENDICIONES
8575,Kesha,Your Love Is My Drug
3548,BTS,Filter


In [16]:
#Get recommendations 'idfc' song
recommender.get_recommendations(song_name="idfc", n_top=3)

100%|█████████████████████████████████████████████████████████████████████████| 14590/14590 [00:00<00:00, 33153.75it/s]


Unnamed: 0,artist_name,name
10365,Maroon 5,Seasons
8622,Khalid,Saturday Nights
14310,Usher,Climax
