In [2]:
# pip install spotipy

In [2]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skimage import io
from sklearn.decomposition import PCA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Open the YAML file that contains the Spotify API credentials.
stream= open("/content/drive/MyDrive/Spotify/Spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [None]:
dtypes = {'track_uri': 'object', 'artist_uri': 'object', 'album_uri': 'object', 'danceability': 'float16', 'energy': 'float16', 'key': 'float16',
               'loudness': 'float16', 'mode': 'float16', 'speechiness': 'float16', 'acousticness': 'float16', 'instrumentalness': 'float16',
               'liveness': 'float16', 'valence': 'float16', 'tempo': 'float16', 'duration_ms': 'float32', 'time_signature': 'float16',
               'Track_release_date': 'int8', 'Track_pop': 'int8', 'Artist_pop': 'int8', 'Artist_genres': 'object'}
try:
    df=pd.read_csv('/content/drive/MyDrive/Spotify/data/1M_processed.csv',dtype=dtypes)
except:
    print('Failed to load grow')
    df=pd.read_csv('/content/drive/MyDrive/Spotify/data/1M_processed.csv',dtype=dtypes)


In [None]:
df.shape

(1163321, 20)

# Test

Extract playlist tracks and artist uri

In [None]:
def get_IDs (playlist_id):
    track_ids = []
    artist_id = []
    playlist=sp.playlist (playlist_id)
    for item in playlist['tracks']['items']:
        track=item['track']
        track_ids.append(track['id'])
        artist=item['track']['artists']
        artist_id.append(artist[0]['id'])
    return track_ids,artist_id

In [None]:
 playlist_id = 'spotify:playlist:1VDEf4vANEPRlrXVken86a'



In [None]:
track_ids,artist_id = get_IDs (playlist_id)
print (len(track_ids))
print (len(artist_id))

100
100


getting the unique URI and repeating the extraction features and preprocessing steps for the user's playlist (input)

In [None]:
artist_id_uni=list(set(artist_id))
track_ids_uni=list(set(track_ids))

In [None]:
audio_features=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
     track_feature = sp.audio_features(track_ids_uni[i:i+25])
     track_df = pd.DataFrame(track_feature)
     audio_features=pd.concat([audio_features,track_df],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 4/4 [00:00<00:00, 12.44it/s]

expected string or bytes-like object





In [None]:
track_=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
        track_features = sp.tracks(track_ids_uni[i:i+25])
        for x in range(25):
            track_pop=pd.DataFrame([track_ids_uni[i+x]],columns=['Track_uri'])
            track_pop['Track_release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['Track_pop'] = track_features['tracks'][x]["popularity"]
            track_pop['Artist_uri']=track_features['tracks'][x]['artists'][0]['id']
            track_pop['Album_uri']=track_features['tracks'][x]['album']['id']
            track_=pd.concat([track_,track_pop],axis=0)
    except Exception as e:
        print(e)
        continue

 50%|█████     | 2/4 [00:00<00:00,  2.20it/s]

expected string or bytes-like object


100%|██████████| 4/4 [00:01<00:00,  2.98it/s]


In [None]:
artist_=pd.DataFrame()
for i in tqdm(range(0,len(artist_id_uni),25)):
    try:
        artist_features = sp.artists(artist_id_uni[i:i+25])
        for x in range(25):
            artist_df=pd.DataFrame([artist_id_uni[i+x]],columns=['Artist_uri'])
            artist_pop = artist_features['artists'][x]["popularity"]
            artist_genres = artist_features['artists'][x]["genres"]
            artist_df["Artist_pop"] = artist_pop
            if artist_genres:
                artist_df["genres"] = " ".join([re.sub(' ','_',i) for i in artist_genres])
            else:
                artist_df["genres"] = "unknown"
            artist_=pd.concat([artist_,artist_df],axis=0)
    except Exception as e:
        print(e)
        continue

  0%|          | 0/2 [00:00<?, ?it/s]

expected string or bytes-like object


100%|██████████| 2/2 [00:00<00:00,  6.69it/s]

list index out of range





In [None]:
test=pd.DataFrame(track_,columns=['Track_uri','Artist_uri','Album_uri'])

In [None]:
test.rename(columns = {'Track_uri':'track_uri','Artist_uri':'artist_uri','Album_uri':'album_uri'}, inplace = True)

In [None]:
audio_features.drop(columns=['type','uri','track_href','analysis_url'],axis=1,inplace=True)

In [None]:
test = pd.merge(test,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')
test = pd.merge(test,track_, left_on = "track_uri", right_on= "Track_uri",how = 'outer')
test = pd.merge(test,artist_, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

In [None]:
del audio_features,track_,artist_

In [None]:
test.rename(columns = {'genres':'Artist_genres'}, inplace = True)

In [None]:
test.drop(columns=['Track_uri','Artist_uri_x','Artist_uri_y','Album_uri','id'],axis=1,inplace=True)

In [None]:
test.dropna(axis=0,inplace=True)

In [None]:
test['Track_pop'] = test['Track_pop'].apply(lambda x: int(x/5))
test['Artist_pop'] = test['Artist_pop'].apply(lambda x: int(x/5))
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: x.split('-')[0])
test['Track_release_date']=test['Track_release_date'].astype('int16')
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: int(x/50))

In [None]:
test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo', 'time_signature']]=test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo','time_signature']].astype('float16')
test[['duration_ms']]=test[['duration_ms']].astype('float32')
test[['Track_release_date', 'Track_pop', 'Artist_pop']]=test[['Track_release_date', 'Track_pop', 'Artist_pop']].astype('int8')

In [None]:
currentdf=len(df)
currentdf

1163321

In [None]:
df=pd.concat([df,test],axis=0)

In [None]:
df.drop_duplicates(subset=['track_uri'],inplace=True,keep='last') ## keep last to keep the dataset updated

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
print('{} New Tracks Found'.format(len(df)-currentdf))

12 New Tracks Found


In [None]:
#saving the tracks if they weren't found in the dataset
if len(df)>currentdf:
    df.to_csv('/content/drive/MyDrive/Spotify/data/1M_processed.csv',index=False)
    print('{} New Found'.format(len(df)-currentdf))
    streamlit=df[df.Track_pop >0]             # dropped track with 0 popularity score to save space and ram for the final model
    ##### may need to adjust#####
    streamlit.to_csv('/content/drive/MyDrive/Spotify/data/streamlit.csv',index=False)
    del streamlit

12 New Found


In [None]:
df = df[~df['track_uri'].isin(test['track_uri'].values)]

In [None]:
test['Artist_genres'] = test['Artist_genres'].apply(lambda x: x.split(" "))
tfidf = TfidfVectorizer(max_features=5) #max_features=5
tfidf_matrix = tfidf.fit_transform(test['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [None]:
genre_df=genre_df.astype('float16')
test.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [None]:
test = pd.concat([test.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)

In [None]:
test.isna().sum().sum()

0

In [None]:
df['Artist_genres'] = df['Artist_genres'].apply(lambda x: x.split(" "))
tfidf_matrix = tfidf.transform(df['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [None]:
genre_df=genre_df.astype('float16')
df.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [None]:
df = pd.concat([df.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)

In [None]:
try:
    df.drop(columns=['genre|unknown'],axis=1,inplace=True)
    test.drop(columns=['genre|unknown'],axis=1,inplace=True)
except:
    print('genre|unknown not found')

genre|unknown not found


In [None]:
test.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'Track_release_date', 'Track_pop', 'Artist_pop', 'genre|cantopop',
       'genre|mainland_chinese_pop', 'genre|mandopop',
       'genre|singaporean_mandopop', 'genre|singaporean_pop'],
      dtype='object')

In [None]:
df.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'Track_release_date', 'Track_pop', 'Artist_pop', 'genre|cantopop',
       'genre|mainland_chinese_pop', 'genre|mandopop',
       'genre|singaporean_mandopop', 'genre|singaporean_pop'],
      dtype='object')

In [None]:
sc=MinMaxScaler()
df[df.columns[3:19]] = sc.fit_transform(df.iloc[:,3:19]) #in the saved dataset get all rows, and columns including audio features. note that genre is not included
pickle.dump(sc, open('/content/drive/MyDrive/Spotify/data/sc.sav', 'wb'))

#prepare a new data frame call df_new for PCA analysis
df_new = df

In [None]:
test[test.columns[3:19]] = sc.transform(test.iloc[:,3:19]) #based on input play list, get all rows, and columns including audio features. note that genre is not included

#prepare a new data frame call df_new for PCA analysis
test_new = test

In [None]:
playvec=pd.DataFrame(test.sum(axis=0)).T
playvec

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,genre|cantopop,genre|mainland_chinese_pop,genre|mandopop,genre|singaporean_mandopop,genre|singaporean_pop
0,3QOQ8HlMpJTupsoj5okuof0CvHKdAXglZGyNCtoZ7JCq6L...,1cg0bYpP5e2DNG0RgK2CMN1cg0bYpP5e2DNG0RgK2CMN1c...,4IlbFUwa4Fd5laEAD3H6lQ7nD96CUbgCyzRHxbftQhpK4I...,18.891136,18.148926,12.454546,30.469963,32.0,1.58082,21.723438,...,1.685286,29.799995,37.974998,15.95,19.6,6.539062,6.613281,21.4375,6.699219,6.699219


In [None]:
df['sim']=cosine_similarity(df.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1),playvec.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1)) #find cosine similarity between dataset and playlist in general
df['sim2']=cosine_similarity(df.iloc[:,16:-1],playvec.iloc[:,16:])  #find cosine similarity between dataset and playlist in terms of track & artist genres
df['sim3']=cosine_similarity(df.iloc[:,19:-2],playvec.iloc[:,19:])  #find cosine similarity between dataset and playlist in terms of genres
df['sim4']=(df['sim']+df['sim2'])/2
#sort based on similarity score, high correlated genres will be pioritzed, then artist & track popularity and audio features
df = df.sort_values(['sim3','sim4'],ascending = False,kind='stable')

#get the list of track uris, we are output 20 tracks
qq=df.groupby('artist_uri').head(2).track_uri.head(20)     #to limit recmmendation by same artist

#get recommendation track detail
aa=sp.tracks(qq[0:20])
Fresult=pd.DataFrame()
for i in range(20):
    result=pd.DataFrame([i])
    result['track_name']=aa['tracks'][i]['name']
    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
    #result['url']=aa['tracks'][i]['external_urls']['spotify']
    result['pop'] = aa['tracks'][i]["popularity"]
    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
    Fresult=pd.concat([Fresult,result],axis=0)
Fresult

Unnamed: 0,0,track_name,artist_name,pop
0,0,慢慢等,WeiBird,58
0,1,還是會,WeiBird,55
0,2,帶我走,Rainie Yang,55
0,3,暗號,Jay Chou,55
0,4,愛 請問怎麼走,A-Lin,46
0,5,擱淺,Jay Chou,66
0,6,那是你離開了北京的生活,Joker Xue,45
0,7,以後別做朋友,Eric Chou,64
0,8,木偶人,Joker Xue,43
0,9,一個人想著一個人,Pets Tseng,56


In [None]:
#### Revision here

# Select the numeric features
numeric_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                    'duration_ms', 'time_signature', 'Track_release_date', 'Track_pop', 'Artist_pop']

# Extract the numeric features for PCA
df_new_numeric = df_new[numeric_features]
playvec_new_numeric = playvec[numeric_features]

# Initialize PCA, keep 95% of the variance and Fit PCA on the numeric features
pca = PCA(n_components=0.95)
pca.fit(df_new_numeric)

# Transform both df and df_test numeric features
df_pca = pca.transform(df_new_numeric)
playvec_pca = pca.transform(playvec_new_numeric)

# Convert the PCA components into a DataFrame
df_pca_df = pd.DataFrame(df_pca, index=df_new.index)
playvec_pca_df = pd.DataFrame(playvec_pca, index=playvec.index)

In [None]:
#### Find cosine similairty based on PCA model
df_new['sim_pca'] = cosine_similarity(df_pca_df, playvec_pca_df) # Calculate the cosine similarity
df_new['sim_genres'] = cosine_similarity(df_new.loc[:, df_new.columns.str.startswith('genre')], playvec.loc[:, playvec.columns.str.startswith('genre')]) # Calculate the cosine similarity for genres
df_new['sim_combined'] = (df_new['sim_pca'] + df_new['sim_genres']) / 2  # Combine PCA similarity with genre similarity to get a combined similarity score

#sort based on similarity score, but give more weight/focus on genre first then PCA
df_new = df_new.sort_values(['sim_genres', 'sim_combined'], ascending = False, kind='stable')

#get the list of track uris
qq=df_new.groupby('artist_uri').head(2).track_uri.head(20)

#get recommendation track detail
aa=sp.tracks(qq[0:20])
Fresult=pd.DataFrame()
for i in range(20):
    result=pd.DataFrame([i])
    result['track_name']=aa['tracks'][i]['name']
    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
    #result['url']=aa['tracks'][i]['external_urls']['spotify']
    result['pop'] = aa['tracks'][i]["popularity"]
    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
    Fresult=pd.concat([Fresult,result],axis=0)
Fresult

Unnamed: 0,0,track_name,artist_name,pop
0,0,過程,Soft Lipa,45
0,1,瘋狂世界,Mayday,36
0,2,夜間漫遊,Soft Lipa,26
0,3,我會想念妳,Zhang Zhen Yue,29
0,4,嫁給我吧,玖壹壹,35
0,5,辛德瑞拉,Penny Tai,28
0,6,再見,Zhang Zhen Yue,53
0,7,不哭,Cyndi Wang,0
0,8,就是愛,Jolin Tsai,37
0,9,單眼皮,Rainie Yang,22


In [None]:
Spotifyresult=pd.DataFrame()
for i in range(len(test)-1):
    if len(Spotifyresult)>=20:
        break
    ff=sp.recommendations(seed_tracks=list(test.track_uri[1+i:5+i]),limit=2)
    for z in range(2):
        result=pd.DataFrame([z+(2*i)+1])
        result['track_name']=ff['tracks'][z]['name']
        result['artist_name']=ff['tracks'][z]['artists'][0]['name']
        result['pop'] = ff['tracks'][z]["popularity"]
        #result['uri']=ff['tracks'][z]['id']
        #result['url']=ff['tracks'][z]['external_urls']['spotify']
        #result['image']=ff['tracks'][z]['album']['images'][1]['url']
        Spotifyresult=pd.concat([Spotifyresult,result],axis=0)
Spotifyresult

Unnamed: 0,0,track_name,artist_name,pop
0,1,猜不透,Della,60
0,2,愛不單行,Show Luo,50
0,3,走著走著就散了,Ada Zhuang,49
0,4,國境之南,Fan Yi Chen,53
0,5,親愛的那不是愛情,Angela Chang,58
0,6,失落沙洲,LaLa Hsu,59
0,7,天外來物,Joker Xue,62
0,8,原諒我,Jam Hsiao,49
0,9,I Believe,Fan Yi Chen,62
0,10,"半句再見 - From ""At Café 6"" / Main Theme Song",Stefanie Sun,53


In [None]:
#df['sim']=cosine_similarity(df.iloc[:,3:16],playvec.iloc[:,3:16])  #auido features cosine similaritiy
#df['sim2']=cosine_similarity(df.loc[:, df.columns.str.startswith('T')|df.columns.str.startswith('A')],playvec.loc[:, playvec.columns.str.startswith('T')|playvec.columns.str.startswith('A')])   #artist & track popularity cosine similaritiy
#df['sim3']=cosine_similarity(df.loc[:, df.columns.str.startswith('genre')],playvec.loc[:, playvec.columns.str.startswith('genre')])      #genre cosine similaritiy

##equally consider
#df['sim4']=(df['sim']+df['sim2']+df['sim3'])/3
#df = df.sort_values(['sim4'],ascending = False,kind='stable')

##get the list of track uris, we are output 20 tracks
#qq=df.groupby('artist_uri').head(5).track_uri.head(20)
#aa=sp.tracks(qq[0:20])
#Fresult=pd.DataFrame()
#for i in range(20):
#    result=pd.DataFrame([i])
#    result['track_name']=aa['tracks'][i]['name']
#    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
#    #result['url']=aa['tracks'][i]['external_urls']['spotify']
#    result['pop'] = aa['tracks'][i]["popularity"]
#    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
#    Fresult=pd.concat([Fresult,result],axis=0)
#Fresult

In [None]:
#df['sim']=cosine_similarity(df.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1),playvec.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1)) #find cosine similarity between dataset and playlist in general
#df['sim2']=cosine_similarity(df.iloc[:,16:-1],playvec.iloc[:,16:])  #find cosine similarity between dataset and playlist in terms of track & artist genres
#df['sim3']=cosine_similarity(df.iloc[:,19:-2],playvec.iloc[:,19:])  #find cosine similarity between dataset and playlist in terms of genres

##based on similarity score, high correlated genres will be pioritzed, then artist & track popularity, last the audio features
#df = df.sort_values(['sim3','sim2','sim'],ascending = False,kind='stable')

##get the list of track uris, we are output 20 tracks
#qq=df.groupby('artist_uri').head(5).track_uri.head(20)     #to limit recmmendation by same artist

##get recommendation track detail
#aa=sp.tracks(qq[0:20])
#Fresult=pd.DataFrame()
#for i in range(20):
#    result=pd.DataFrame([i])
#    result['track_name']=aa['tracks'][i]['name']
#    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
#    #result['url']=aa['tracks'][i]['external_urls']['spotify']
#    result['pop'] = aa['tracks'][i]["popularity"]
#    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
#    Fresult=pd.concat([Fresult,result],axis=0)
#Fresult