In [5]:
# for data handling
import pandas as pd
import numpy as np

# for visualisation
import plotly
import plotly.express as px
import plotly.io as pio

#Progreebar
from tqdm import tqdm

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# for transformations and predictions
from scipy.optimize import curve_fit
# from yellowbrick.target import FeatureCorrelation
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# For scoring
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score,mean_absolute_error

# For validation
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv("../spotify15k.csv")

In [7]:
df.columns

Index(['Unnamed: 0.1', 'album', 'artist_name', 'track_number', 'id', 'name',
       'uri', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'explicit',
       'mode', 'popularity', 'duration_ms', 'Unnamed: 0'],
      dtype='object')

In [8]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
# remove the square brackets from the artists

df["artist_name"]=df["artist_name"].str.replace("[", "")
df["artist_name"]=df["artist_name"].str.replace("]", "")
df["artist_name"]=df["artist_name"].str.replace("'", "")

In [10]:
df.drop("track_number", axis=1, inplace=True)

In [11]:
# normalise the columns in the dataframe

def normalize_column(col):
    max_d = df[col].max()
    min_d = df[col].min()
    df[col] = (df[col] - min_d)/(max_d - min_d)

In [12]:
# normalize all of numerical columns so that min value is 0 and max value is 1

num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num = df.select_dtypes(include=num_types)

for col in num.columns:
    normalize_column(col)

In [13]:
#perform Kmeans Clustering

km = KMeans(n_clusters=25)
pred = km.fit_predict(num)
df['pred'] = pred
normalize_column('pred')

In [14]:
# neighbourhood based collborative filterng recommendation system using similarity metrics
# manhattan distance is calculated for all songs and recommend songs that are similar to it, based on any given song

class recommendSongs():
    
    def __init__(self, data):
        self.data_ = data
    
    #function which returns recommendations, we can also choose the amount of songs to be recommended
    def get_recommendations(self, song_name, n_top):
        distances = []
        #choosing the given song_name and dropping it from the data
        song = self.data_[(self.data_.name.str.lower() == song_name.lower())].head(1).values[0]
        remData = self.data_[self.data_.name.str.lower() != song_name.lower()]
        for recSong in tqdm(remData.values):
            dist = 0
            for col in np.arange(len(remData.columns)):
                #indices of non-numerical columns(id, uri, name, artists, album)
                if not col in [0,1,2,3,14]:
                    #calculating the manhettan distances for each numerical feature
                    dist = dist + np.absolute(float(song[col]) - float(recSong[col]))
            distances.append(dist)
        remData['distance'] = distances
        #sorting our data to be ascending by 'distance' feature
        remData = remData.sort_values('distance')
        columns = ['artist_name', 'name']
        return remData[columns][:n_top]

In [15]:
#Instantiate recommender class
recommender = recommendSongs(df)

In [16]:
#Get recommendations 'Locked Out of Heaven' song
recommender.get_recommendations(song_name='Locked Out of Heaven', n_top=5)

  0%|          | 0/15077 [00:00<?, ?it/s]


ValueError: could not convert string to float: 'Locked out of Heaven'

In [17]:
#Get recommendations 'That's What I Like' song
recommender.get_recommendations(song_name="That's What I Like", n_top=7)

  0%|          | 0/15077 [00:00<?, ?it/s]


ValueError: could not convert string to float: "That's What I Like"