In [1]:
# for data handling
import pandas as pd
import numpy as np

# for visualisation
import plotly
import plotly.express as px
import plotly.io as pio

#Progreebar
from tqdm import tqdm

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# for transformations and predictions
from scipy.optimize import curve_fit
# from yellowbrick.target import FeatureCorrelation
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# For scoring
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score,mean_absolute_error

# For validation
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../spotify15k.csv")

In [3]:
df.columns

Index(['Unnamed: 0.1', 'album', 'artist_name', 'track_number', 'id', 'name',
       'uri', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'explicit',
       'mode', 'popularity', 'duration_ms', 'Unnamed: 0'],
      dtype='object')

In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df

Unnamed: 0,Unnamed: 0.1,album,artist_name,track_number,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,explicit,mode,popularity,duration_ms
0,0,El Dorado,24kGoldn,1,38mQZ5tZ6IylQaJCCF90ox,The Top,spotify:track:38mQZ5tZ6IylQaJCCF90ox,0.33600,0.754,0.711,0.000,0.1280,-3.842,0.0720,138.031,0.2700,True,0,59,196373
1,1,Fighting Demons (Deluxe),Juice WRLD,1,1X8E4vVoOM3BpSQlEDSjjM,Burn,spotify:track:1X8E4vVoOM3BpSQlEDSjjM,0.14900,0.354,0.475,0.034,0.2100,-9.333,0.0425,114.975,0.0393,True,0,76,217222
2,2,El Dorado,24kGoldn,2,3JVTsvTldB6arHVlmxmTnM,Company (feat. Future),spotify:track:3JVTsvTldB6arHVlmxmTnM,0.44700,0.831,0.727,0.000,0.1110,-3.657,0.1690,125.068,0.7030,True,0,71,213080
3,3,Fighting Demons (Deluxe),Juice WRLD,2,1hB3M3POeKMLxcEFEvPeqU,Already Dead,spotify:track:1hB3M3POeKMLxcEFEvPeqU,0.02160,0.744,0.495,0.000,0.1110,-6.311,0.0634,83.510,0.1280,True,1,77,231221
4,4,Fighting Demons (Deluxe),Juice WRLD,3,1CfuBY3BDdKpooQ9L5zgUc,Cigarettes,spotify:track:1CfuBY3BDdKpooQ9L5zgUc,0.03060,0.587,0.614,0.000,0.3990,-6.344,0.0435,160.075,0.4510,True,1,79,227527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15073,5229,Whole Thang,UnoTheActivist,15,3V1jkSW6iMxGyBNOGIRv2t,Spiced,spotify:track:3V1jkSW6iMxGyBNOGIRv2t,0.01100,0.766,0.505,0.000,0.0757,-9.546,0.3330,139.882,0.1140,True,1,20,234312
15074,5230,Whole Thang,UnoTheActivist,16,7fXIDvSUmcEWknAHcaTsUf,Giuseppe Swag,spotify:track:7fXIDvSUmcEWknAHcaTsUf,0.03570,0.753,0.639,0.000,0.6440,-4.848,0.0689,124.002,0.3720,True,0,21,220395
15075,5231,Whole Thang,UnoTheActivist,17,4XEcyL5e89TGez5I4RkPal,Unemployed,spotify:track:4XEcyL5e89TGez5I4RkPal,0.47400,0.822,0.702,0.000,0.0921,-5.321,0.1130,130.008,0.2880,True,0,19,185913
15076,5232,Whole Thang,UnoTheActivist,18,1s2xgFFCKjHMi7lOKUBAlk,Every Since,spotify:track:1s2xgFFCKjHMi7lOKUBAlk,0.03400,0.707,0.849,0.000,0.1550,-7.166,0.2210,140.172,0.7530,True,0,39,260598


In [6]:
df.drop('Unnamed: 0.1', axis=1, inplace=True)

In [7]:
# remove the square brackets from the artists

df["artist_name"]=df["artist_name"].str.replace("[", "")
df["artist_name"]=df["artist_name"].str.replace("]", "")
df["artist_name"]=df["artist_name"].str.replace("'", "")

In [8]:
df.drop("track_number", axis=1, inplace=True)

In [9]:
# normalise the columns in the dataframe

def normalize_column(col):
    max_d = df[col].max()
    min_d = df[col].min()
    df[col] = (df[col] - min_d)/(max_d - min_d)

In [10]:
# normalize all of numerical columns so that min value is 0 and max value is 1

num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num = df.select_dtypes(include=num_types)

for col in num.columns:
    normalize_column(col)

In [11]:
#perform Kmeans Clustering

km = KMeans(n_clusters=25)
pred = km.fit_predict(num)
df['pred'] = pred
normalize_column('pred')

In [12]:
# neighbourhood based collborative filterng recommendation system using similarity metrics
# manhattan distance is calculated for all songs and recommend songs that are similar to it, based on any given song

class recommendSongs():
    
    def __init__(self, data):
        self.data_ = data
    
    #function which returns recommendations, we can also choose the amount of songs to be recommended
    def get_recommendations(self, song_name, n_top):
        distances = []
        #choosing the given song_name and dropping it from the data
        song = self.data_[(self.data_.name.str.lower() == song_name.lower())].head(1).values[0]
        remData = self.data_[self.data_.name.str.lower() != song_name.lower()]
        for recSong in tqdm(remData.values):
            dist = 0
            for col in np.arange(len(remData.columns)):
                #indices of non-numerical columns(id, uri, name, artists, album)
                if not col in [0,1,2,3,4,14]:
                    #calculating the manhettan distances for each numerical feature
                    dist = dist + np.absolute(float(song[col]) - float(recSong[col]))
            distances.append(dist)
        remData['distance'] = distances
        #sorting our data to be ascending by 'distance' feature
        remData = remData.sort_values('distance')
        columns = ['artist_name', 'name']
        return remData[columns][:n_top]

In [13]:
#Instantiate recommender class
recommender = recommendSongs(df)

In [14]:
#Get recommendations 'Locked Out of Heaven' song
recommender.get_recommendations(song_name='Locked Out of Heaven', n_top=5)

100%|██████████| 15077/15077 [00:00<00:00, 82211.10it/s]


Unnamed: 0,artist_name,name
13213,Twenty One Pilots,Screen
1769,Maroon 5,Sugar
8287,Katy Perry,Peacock
8850,Shakira,Te Dejo Madrid
13047,Usher,Lil Freak (feat. Nicki Minaj)


In [15]:
#Get recommendations 'That's What I Like' song
recommender.get_recommendations(song_name="That's What I Like", n_top=7)

100%|██████████| 15077/15077 [00:00<00:00, 71937.74it/s]


Unnamed: 0,artist_name,name
4499,Kesha,Grow A Pear
12332,Lucky Daye,Feels Like
5955,Justin Bieber,Intentions (feat. Quavo)
3970,21 Savage,a&t
5942,Justin Bieber,Die For You (feat. Dominic Fike)
14079,Mark Ronson,Uptown Funk (feat. Bruno Mars)
12572,Bad Bunny,200 Mph
