In [1]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skimage import io
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("./data/1M_processed.csv")

In [3]:
df.head()

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,2THyGFz2KFSnnjilRrTDIM,2wIVse2owClT7go1WT98tk,5eNmebFdSZTcqExwoijGOO,0.713,0.632,4.0,-7.25,0.0,0.114,0.00822,1.7e-05,0.0978,0.474,99.8,210600.0,4.0,40,0,13,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
1,3BgwOI1jT8l6dFeAiESJVM,2wIVse2owClT7go1WT98tk,6UkdyvPElK6JDkyeRClbI2,0.748,0.0887,5.0,-17.66,1.0,0.959,0.778,0.0,0.27,0.643,75.1,24373.0,4.0,39,0,13,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
2,4DZpzJhrt8SG22hsyruUB1,26dSoYclwsYLMAKD3tpOr4,0eXGrEtsH0WVJnqp5imXAs,0.657,0.645,2.0,-6.52,0.0,0.0603,0.00592,0.000199,0.236,0.658,92.0,198640.0,4.0,40,0,15,dance_pop pop
3,3JyhzOBrTyTXHSAFRu0wKV,26dSoYclwsYLMAKD3tpOr4,0eXGrEtsH0WVJnqp5imXAs,0.367,0.652,6.0,-7.17,1.0,0.109,0.0332,6.9e-05,0.0947,0.156,112.4,199987.0,4.0,40,0,15,dance_pop pop
4,36b687iXNP8g84ulUXmPA7,26dSoYclwsYLMAKD3tpOr4,0eXGrEtsH0WVJnqp5imXAs,0.753,0.456,2.0,-5.89,1.0,0.0419,0.221,0.0,0.449,0.523,112.5,235293.0,3.0,40,0,15,dance_pop pop


In [4]:
# pip install spotipy

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

In [6]:
# Open the YAML file that contains the Spotify API credentials.
stream= open("Spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [7]:
sp

<spotipy.client.Spotify at 0x1b273534f10>

# Importing the dataset

In [8]:
dtypes = {'track_uri': 'object', 'artist_uri': 'object', 'album_uri': 'object', 'danceability': 'float16', 'energy': 'float16', 'key': 'float16',
               'loudness': 'float16', 'mode': 'float16', 'speechiness': 'float16', 'acousticness': 'float16', 'instrumentalness': 'float16',
               'liveness': 'float16', 'valence': 'float16', 'tempo': 'float16', 'duration_ms': 'float32', 'time_signature': 'float16',
               'Track_release_date': 'int8', 'Track_pop': 'int8', 'Artist_pop': 'int8', 'Artist_genres': 'object'}
try:
    df=pd.read_csv('./data/1M_processed.csv',dtype=dtypes)
except:
    print('Failed to load grow')
    df=pd.read_csv('./data/1M_processed.csv',dtype=dtypes)

In [9]:
df.shape

(1163393, 20)

# Test

Extract playlist tracks and artist uri

In [10]:
def get_IDs (playlist_id):
    track_ids = []
    artist_id = []
    playlist=sp.playlist (playlist_id)
    for item in playlist['tracks']['items']:
        track=item['track']
        track_ids.append(track['id'])
        artist=item['track']['artists']
        artist_id.append(artist[0]['id'])
    return track_ids,artist_id

In [11]:
 #playlist_id = 'spotify:playlist:1VDEf4vANEPRlrXVken86a'
playlist_id ='spotify:playlist:37i9dQZF1E8NgXcf5gQPXv'

In [12]:
track_ids,artist_id = get_IDs (playlist_id)
print (len(track_ids))
print (len(artist_id))

50
50


getting the unique URI and repeating the extraction features and preprocessing steps for the user's playlist (input)

In [13]:
artist_id_uni=list(set(artist_id))
track_ids_uni=list(set(track_ids))

In [14]:
audio_features=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
     track_feature = sp.audio_features(track_ids_uni[i:i+25])
     track_df = pd.DataFrame(track_feature)
     audio_features=pd.concat([audio_features,track_df],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 2/2 [00:00<00:00,  7.67it/s]


In [15]:
track_=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
        track_features = sp.tracks(track_ids_uni[i:i+25])
        for x in range(25):
            track_pop=pd.DataFrame([track_ids_uni[i+x]],columns=['Track_uri'])
            track_pop['Track_release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['Track_pop'] = track_features['tracks'][x]["popularity"]
            track_pop['Artist_uri']=track_features['tracks'][x]['artists'][0]['id']
            track_pop['Album_uri']=track_features['tracks'][x]['album']['id']
            track_=pd.concat([track_,track_pop],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 2/2 [00:00<00:00,  3.07it/s]


In [16]:
artist_=pd.DataFrame()
for i in tqdm(range(0,len(artist_id_uni),25)):
    try:
        artist_features = sp.artists(artist_id_uni[i:i+25])
        for x in range(25):
            artist_df=pd.DataFrame([artist_id_uni[i+x]],columns=['Artist_uri'])
            artist_pop = artist_features['artists'][x]["popularity"]
            artist_genres = artist_features['artists'][x]["genres"]
            artist_df["Artist_pop"] = artist_pop
            if artist_genres:
                artist_df["genres"] = " ".join([re.sub(' ','_',i) for i in artist_genres])
            else:
                artist_df["genres"] = "unknown"
            artist_=pd.concat([artist_,artist_df],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 2/2 [00:00<00:00,  3.87it/s]

list index out of range





In [17]:
track_.head()

Unnamed: 0,Track_uri,Track_release_date,Track_pop,Artist_uri,Album_uri
0,6Bwu10ojlaFg7BGfHxvTWO,2021-10-01,64,0i5iO6icb7kxg48thi9gBM,4DiqVS0SU6o0jrgRMbfqxi
0,5qjtbs7pH1sgG94DlRcmFJ,2022-07-22,40,6JYq1icPMmdJ9jxyXDOieP,4pbNoIVR1PzWtRs7u7oRQV
0,5XJWVLhYE2tsa5vXlcgT3N,2022-10-13,67,4bw2Am3p9ji3mYsXNXtQcd,6U2Ncrmi1EeBQQz2NNgh1M
0,6Z6FqnImFvNvxg9aHW2HKz,2022-12-16,52,43qxAkuKFB6fMNSeS5dO7Z,731ECu5lhhgFIDPtbgOwe3
0,7dMTCS9BLzBqYTlAuHP8TM,2022-08-12,73,5xSx2FM8mQnrfgM1QsHniB,2SGONYwprYHZruYFhQYiFC


In [18]:
artist_.head()

Unnamed: 0,Artist_uri,Artist_pop,genres
0,0AspLZGQkP38yddNoD0pLn,55,latin_viral_pop mexican_pop reggaeton_mexicano
0,1QgrwYywvDuC43MDtR8cqq,60,unknown
0,7uQ1D2NNHs5cUL3CLKRbia,61,pop_venezolano
0,0haZhu4fFKt0Ag94kZDiz2,66,latin_pop latin_viral_pop mexican_pop reggaeton
0,5xSx2FM8mQnrfgM1QsHniB,69,latin_pop latin_viral_pop mexican_pop


In [19]:
audio_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.719,0.544,0,-10.526,1,0.0292,0.21,2.5e-05,0.122,0.677,127.032,audio_features,6Bwu10ojlaFg7BGfHxvTWO,spotify:track:6Bwu10ojlaFg7BGfHxvTWO,https://api.spotify.com/v1/tracks/6Bwu10ojlaFg...,https://api.spotify.com/v1/audio-analysis/6Bwu...,203973,4
1,0.413,0.789,7,-3.062,1,0.222,0.501,0.0,0.267,0.386,185.534,audio_features,5qjtbs7pH1sgG94DlRcmFJ,spotify:track:5qjtbs7pH1sgG94DlRcmFJ,https://api.spotify.com/v1/tracks/5qjtbs7pH1sg...,https://api.spotify.com/v1/audio-analysis/5qjt...,171614,4
2,0.804,0.793,0,-2.778,1,0.0777,0.133,0.0,0.136,0.454,84.004,audio_features,5XJWVLhYE2tsa5vXlcgT3N,spotify:track:5XJWVLhYE2tsa5vXlcgT3N,https://api.spotify.com/v1/tracks/5XJWVLhYE2ts...,https://api.spotify.com/v1/audio-analysis/5XJW...,209268,4
3,0.778,0.681,2,-5.836,0,0.0301,0.293,0.0,0.101,0.927,148.07,audio_features,6Z6FqnImFvNvxg9aHW2HKz,spotify:track:6Z6FqnImFvNvxg9aHW2HKz,https://api.spotify.com/v1/tracks/6Z6FqnImFvNv...,https://api.spotify.com/v1/audio-analysis/6Z6F...,172515,4
4,0.797,0.677,9,-4.163,0,0.0857,0.101,0.0,0.441,0.919,130.034,audio_features,7dMTCS9BLzBqYTlAuHP8TM,spotify:track:7dMTCS9BLzBqYTlAuHP8TM,https://api.spotify.com/v1/tracks/7dMTCS9BLzBq...,https://api.spotify.com/v1/audio-analysis/7dMT...,219320,4


In [20]:
test=pd.DataFrame(track_,columns=['Track_uri','Artist_uri','Album_uri'])

In [21]:
test.head()

Unnamed: 0,Track_uri,Artist_uri,Album_uri
0,6Bwu10ojlaFg7BGfHxvTWO,0i5iO6icb7kxg48thi9gBM,4DiqVS0SU6o0jrgRMbfqxi
0,5qjtbs7pH1sgG94DlRcmFJ,6JYq1icPMmdJ9jxyXDOieP,4pbNoIVR1PzWtRs7u7oRQV
0,5XJWVLhYE2tsa5vXlcgT3N,4bw2Am3p9ji3mYsXNXtQcd,6U2Ncrmi1EeBQQz2NNgh1M
0,6Z6FqnImFvNvxg9aHW2HKz,43qxAkuKFB6fMNSeS5dO7Z,731ECu5lhhgFIDPtbgOwe3
0,7dMTCS9BLzBqYTlAuHP8TM,5xSx2FM8mQnrfgM1QsHniB,2SGONYwprYHZruYFhQYiFC


In [22]:
test.rename(columns = {'Track_uri':'track_uri','Artist_uri':'artist_uri','Album_uri':'album_uri'}, inplace = True)

In [23]:
audio_features.drop(columns=['type','uri','track_href','analysis_url'],axis=1,inplace=True)

In [24]:
test = pd.merge(test,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')
test = pd.merge(test,track_, left_on = "track_uri", right_on= "Track_uri",how = 'outer')
test = pd.merge(test,artist_, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

In [25]:
test.shape

(50, 25)

In [None]:
# del audio_features,track_,artist_

In [None]:
test.rename(columns = {'genres':'Artist_genres'}, inplace = True)

In [None]:
test.drop(columns=['Track_uri','Artist_uri_x','Artist_uri_y','Album_uri','id'],axis=1,inplace=True)

In [None]:
test.dropna(axis=0,inplace=True)

In [None]:
test['Track_pop'] = test['Track_pop'].apply(lambda x: int(x/5))
test['Artist_pop'] = test['Artist_pop'].apply(lambda x: int(x/5))
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: x.split('-')[0])
test['Track_release_date']=test['Track_release_date'].astype('int16')
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: int(x/50))

In [None]:
test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo', 'time_signature']]=test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo','time_signature']].astype('float16')
test[['duration_ms']]=test[['duration_ms']].astype('float32')
test[['Track_release_date', 'Track_pop', 'Artist_pop']]=test[['Track_release_date', 'Track_pop', 'Artist_pop']].astype('int8')

In [None]:
currentdf=len(df)
currentdf

In [None]:
df=pd.concat([df,test],axis=0)

In [None]:
df.drop_duplicates(subset=['track_uri'],inplace=True,keep='last') ## keep last to keep the dataset updated

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
print('{} New Tracks Found'.format(len(df)-currentdf))

In [None]:
#saving the tracks if they weren't found in the dataset
if len(df)>currentdf:
    df.to_csv('./data/1M_processed.csv',index=False)
    print('{} New Found'.format(len(df)-currentdf))
    streamlit=df[df.Track_pop >0]             # dropped track with 0 popularity score to save space and ram for the final model
    ##### may need to adjust#####
    streamlit.to_csv('./data/streamlit.csv',index=False)
    del streamlit

In [None]:
df = df[~df['track_uri'].isin(test['track_uri'].values)]

In [None]:
test['Artist_genres'] = test['Artist_genres'].apply(lambda x: x.split(" "))
tfidf = TfidfVectorizer(max_features=5) #max_features=5
tfidf_matrix = tfidf.fit_transform(test['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [None]:
genre_df=genre_df.astype('float16')
test.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [None]:
test = pd.concat([test.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)

In [None]:
test.isna().sum().sum()

In [None]:
df['Artist_genres'] = df['Artist_genres'].apply(lambda x: x.split(" "))
tfidf_matrix = tfidf.transform(df['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [None]:
genre_df=genre_df.astype('float16')
df.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [None]:
df = pd.concat([df.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)

In [None]:
try:
    df.drop(columns=['genre|unknown'],axis=1,inplace=True)
    test.drop(columns=['genre|unknown'],axis=1,inplace=True)
except:
    print('genre|unknown not found')

In [None]:
test.columns

In [None]:
df.columns

In [None]:
sc=MinMaxScaler()
df[df.columns[3:19]] = sc.fit_transform(df.iloc[:,3:19]) #in the saved dataset get all rows, and columns including audio features. note that genre is not included
pickle.dump(sc, open('./data/sc.sav', 'wb'))

#prepare a new data frame call df_new for PCA analysis
df_new = df

In [None]:
df.shape

In [None]:
test.shape

In [None]:
test[test.columns[3:19]] = sc.transform(test.iloc[:,3:19]) #based on input play list, get all rows, and columns including audio features. note that genre is not included

#prepare a new data frame call df_new for PCA analysis
test_new = test

In [None]:
test_new.shape

In [None]:
test_new.head()

In [None]:
playvec=pd.DataFrame(test.sum(axis=0)).T
playvec

In [None]:
df.head()

# Recommender System

In [None]:
#### Revision here

# Select the numeric features
numeric_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                    'duration_ms', 'time_signature', 'Track_release_date', 'Track_pop', 'Artist_pop']

# Extract the numeric features for PCA
df_new_numeric = df_new[numeric_features]
playvec_new_numeric = playvec[numeric_features]

# Initialize PCA, keep 95% of the variance and Fit PCA on the numeric features
pca = PCA(n_components=0.95)
pca.fit(df_new_numeric)

# Transform both df and df_test numeric features
df_pca = pca.transform(df_new_numeric)
playvec_pca = pca.transform(playvec_new_numeric)

# Convert the PCA components into a DataFrame
df_pca_df = pd.DataFrame(df_pca, index=df_new.index)
playvec_pca_df = pd.DataFrame(playvec_pca, index=playvec.index)

In [None]:
#### Find cosine similairty based on PCA model
df_new['sim_pca'] = cosine_similarity(df_pca_df, playvec_pca_df) # Calculate the cosine similarity
df_new['sim_genres'] = cosine_similarity(df_new.loc[:, df_new.columns.str.startswith('genre')], playvec.loc[:, playvec.columns.str.startswith('genre')]) # Calculate the cosine similarity for genres
df_new['sim_combined'] = (df_new['sim_pca'] + df_new['sim_genres']) / 2  # Combine PCA similarity with genre similarity to get a combined similarity score

#sort based on similarity score, but give more weight/focus on genre first then PCA
df_new = df_new.sort_values(['sim_genres', 'sim_combined'], ascending = False, kind='stable')

#get the list of track uris
qq=df_new.groupby('artist_uri').head(2).track_uri.head(20)

#get recommendation track detail
aa=sp.tracks(qq[0:20])
Fresult=pd.DataFrame()
for i in range(20):
    result=pd.DataFrame([i])
    result['track_name']=aa['tracks'][i]['name']
    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
    #result['url']=aa['tracks'][i]['external_urls']['spotify']
    result['pop'] = aa['tracks'][i]["popularity"]
    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
    Fresult=pd.concat([Fresult,result],axis=0)
Fresult

In [None]:
Spotifyresult=pd.DataFrame()
for i in range(len(test)-1):
    if len(Spotifyresult)>=20:
        break
    ff=sp.recommendations(seed_tracks=list(test.track_uri[1+i:5+i]),limit=2)
    for z in range(2):
        result=pd.DataFrame([z+(2*i)+1])
        result['track_name']=ff['tracks'][z]['name']
        result['artist_name']=ff['tracks'][z]['artists'][0]['name']
        result['pop'] = ff['tracks'][z]["popularity"]
        #result['uri']=ff['tracks'][z]['id']
        #result['url']=ff['tracks'][z]['external_urls']['spotify']
        #result['image']=ff['tracks'][z]['album']['images'][1]['url']
        Spotifyresult=pd.concat([Spotifyresult,result],axis=0)
Spotifyresult