# Importing the libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
stream= open("spotify/spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [3]:
df=pd.read_csv('Data/1mV3.csv')
artist_features=pd.read_csv('Data/artist_features.csv')
audio_features=pd.read_csv('Data/audio_features.csv')
track_features=pd.read_csv('Data/track_features.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_uri,artist_uri,album_uri
0,0,3uvsVUrAaGQJCTEUR1S3Sx,0ypTT9UqAU5sZpPo5JZmjR,2kIoMj5Ht14l2PnNRa2abC
1,0,0heE5tAAaDQmnGhVDImPl2,0h3YCmvRJ2jqt4jFiR6nGL,3g7TTE6375PGIBsM9Tlk9I
2,0,3omXshBamrREltcf24gYDC,6VDdCwrBM4qQaGxoAyxyJC,2H09itV5a5yUcGyk9u9HwY
3,0,6TYWE19e35N7Bn5heHwyY6,4uNv6RD2YXwoaKgHfJZkkL,38y7zXf95O9Afh7ZXIoyq1
4,0,1xznGGDReH1oQq0xzbwXa3,3TVXtAsR1Inumwj472S9r4,3hARKC8cinq3mZLLAEaBh9


In [5]:
audio_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.523,0.422,3,-10.385,0,0.027,0.783,0.00514,0.0898,0.145,129.876,audio_features,3uvsVUrAaGQJCTEUR1S3Sx,spotify:track:3uvsVUrAaGQJCTEUR1S3Sx,https://api.spotify.com/v1/tracks/3uvsVUrAaGQJ...,https://api.spotify.com/v1/audio-analysis/3uvs...,242060,4
1,0.493,0.594,8,-4.404,0,0.0378,0.256,0.0,0.0759,0.304,123.751,audio_features,0heE5tAAaDQmnGhVDImPl2,spotify:track:0heE5tAAaDQmnGhVDImPl2,https://api.spotify.com/v1/tracks/0heE5tAAaDQm...,https://api.spotify.com/v1/audio-analysis/0heE...,225515,4
2,0.468,0.692,2,-4.015,1,0.0295,0.0202,2.39e-06,0.523,0.561,78.009,audio_features,3omXshBamrREltcf24gYDC,spotify:track:3omXshBamrREltcf24gYDC,https://api.spotify.com/v1/tracks/3omXshBamrRE...,https://api.spotify.com/v1/audio-analysis/3omX...,200360,4
3,0.553,0.564,9,-7.072,0,0.0418,0.039,0.0,0.318,0.31,149.953,audio_features,6TYWE19e35N7Bn5heHwyY6,spotify:track:6TYWE19e35N7Bn5heHwyY6,https://api.spotify.com/v1/tracks/6TYWE19e35N7...,https://api.spotify.com/v1/audio-analysis/6TYW...,232643,4
4,0.791,0.619,1,-5.886,1,0.0532,0.00784,0.00423,0.351,0.371,103.989,audio_features,1xznGGDReH1oQq0xzbwXa3,spotify:track:1xznGGDReH1oQq0xzbwXa3,https://api.spotify.com/v1/tracks/1xznGGDReH1o...,https://api.spotify.com/v1/audio-analysis/1xzn...,173987,4


In [6]:
artist_features

Unnamed: 0,0,artist_pop,genres
0,0ypTT9UqAU5sZpPo5JZmjR,35,british_singer-songwriter
1,0,artist_pop,genres
2,0h3YCmvRJ2jqt4jFiR6nGL,53,norwegian_pop
3,0,artist_pop,genres
4,6VDdCwrBM4qQaGxoAyxyJC,59,indie_rock modern_alternative_rock modern_rock...
...,...,...,...
110752,4mAsWDGLUIEdo6imU77WG6,61,italian_adult_pop italian_pop italian_reggae
110753,0,artist_pop,genres
110754,0dUnhVUif1WspAfS7QPTQm,41,italian_adult_pop italian_pop italian_pop_rock
110755,0,artist_pop,genres


In [7]:
artist_features = artist_features.loc[~(artist_features.index % 2 == 1)]

In [8]:
artist_features.reset_index(drop=True, inplace=True)

In [9]:
artist_features

Unnamed: 0,0,artist_pop,genres
0,0ypTT9UqAU5sZpPo5JZmjR,35,british_singer-songwriter
1,0h3YCmvRJ2jqt4jFiR6nGL,53,norwegian_pop
2,6VDdCwrBM4qQaGxoAyxyJC,59,indie_rock modern_alternative_rock modern_rock...
3,4uNv6RD2YXwoaKgHfJZkkL,61,indie_electropop indie_poptimism
4,3TVXtAsR1Inumwj472S9r4,96,canadian_hip_hop canadian_pop hip_hop rap toro...
...,...,...,...
55374,2BBBoWQhHesxgR48zsCEE6,25,unknown
55375,2nftqfbLohpDYzY8VUlvbm,50,italian_adult_pop italian_pop
55376,4mAsWDGLUIEdo6imU77WG6,61,italian_adult_pop italian_pop italian_reggae
55377,0dUnhVUif1WspAfS7QPTQm,41,italian_adult_pop italian_pop italian_pop_rock


In [10]:
artist_features.rename(columns = {'0':'Artist_uri', 'artist_pop': 'Artist_pop',
                                 'genres':'Artist_genres'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_features.rename(columns = {'0':'Artist_uri', 'artist_pop': 'Artist_pop',


In [13]:
track_features

Unnamed: 0,0,release_date,pop
0,3uvsVUrAaGQJCTEUR1S3Sx,2016-02-22,0
1,0,release_date,pop
2,0heE5tAAaDQmnGhVDImPl2,2016-01-29,63
3,0,release_date,pop
4,3omXshBamrREltcf24gYDC,2014-10-31,68
...,...,...,...
591056,6ecs6f2s14nsMHufZ8JGg7,2005,0
591057,7FKtGleXIAoZdOmlOMWX7m,2015-07-31,0
591058,0fcjpuekovpysHl9dAH3u1,2000-07-17,19
591059,6c3aUrNVj9aZy2UBcAuNBC,1994-02-28,12


In [14]:
track_features = track_features.loc[~(track_features.index % 2 == 1)]

In [15]:
track_features.reset_index(drop=True, inplace=True)

In [16]:
track_features

Unnamed: 0,0,release_date,pop
0,3uvsVUrAaGQJCTEUR1S3Sx,2016-02-22,0
1,0heE5tAAaDQmnGhVDImPl2,2016-01-29,63
2,3omXshBamrREltcf24gYDC,2014-10-31,68
3,6TYWE19e35N7Bn5heHwyY6,2016-04-29,31
4,1xznGGDReH1oQq0xzbwXa3,2016-05-06,1
...,...,...,...
295526,1NfdTbxasqJawyNEIEHCRY,2016-01-01,22
295527,7tFhes1fDmen8DW9AFC8iv,2017-03-10,36
295528,6ecs6f2s14nsMHufZ8JGg7,2005,0
295529,0fcjpuekovpysHl9dAH3u1,2000-07-17,19


In [17]:
track_features.rename(columns = {'0':'Track_uri', 'release_date': 'Track_release_date',
                                'pop':'Track_pop'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  track_features.rename(columns = {'0':'Track_uri', 'release_date': 'Track_release_date',


In [18]:
track_features

Unnamed: 0,Track_uri,Track_release_date,Track_pop
0,3uvsVUrAaGQJCTEUR1S3Sx,2016-02-22,0
1,0heE5tAAaDQmnGhVDImPl2,2016-01-29,63
2,3omXshBamrREltcf24gYDC,2014-10-31,68
3,6TYWE19e35N7Bn5heHwyY6,2016-04-29,31
4,1xznGGDReH1oQq0xzbwXa3,2016-05-06,1
...,...,...,...
295526,1NfdTbxasqJawyNEIEHCRY,2016-01-01,22
295527,7tFhes1fDmen8DW9AFC8iv,2017-03-10,36
295528,6ecs6f2s14nsMHufZ8JGg7,2005,0
295529,0fcjpuekovpysHl9dAH3u1,2000-07-17,19


# Merging all dataframes

In [19]:
df = pd.merge(df,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')

In [20]:
df = pd.merge(df,track_features, left_on = "track_uri", right_on= "Track_uri",how = 'outer')

In [21]:
df = pd.merge(df,artist_features, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

# Handling missing data 

In [22]:
df.isna().sum()

Unnamed: 0            2962
track_uri             2962
artist_uri            2962
album_uri             2962
danceability             9
energy                   9
key                      9
loudness                 9
mode                     9
speechiness              9
acousticness             9
instrumentalness         9
liveness                 9
valence                  9
tempo                    9
type                     9
id                       9
uri                      9
track_href               9
analysis_url             9
duration_ms              9
time_signature           9
Track_uri             3349
Track_release_date    3349
Track_pop             3349
Artist_uri            2962
Artist_pop            2962
Artist_genres         2962
dtype: int64

In [23]:
df.columns

Index(['Unnamed: 0', 'track_uri', 'artist_uri', 'album_uri', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature',
       'Track_uri', 'Track_release_date', 'Track_pop', 'Artist_uri',
       'Artist_pop', 'Artist_genres'],
      dtype='object')

## Handling audio_features missing From extraction

In [24]:
missing_t_uri=df.track_uri[df.id.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

In [25]:
f = open('data/audio_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
     track_feature = sp.audio_features(missing_t_uri[i:i+1])
     track_df = pd.DataFrame(track_feature)
     csv_data = track_df.to_csv(header=False,index=False)
     f.write(csv_data)
    except Exception as e:
        r = open("extract_log0.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

100%|█████████████████████████████████████████████| 4/4 [00:01<00:00,  2.02it/s]


## Handling track_features missing From extraction

In [26]:
missing_t_uri=df.track_uri[df.Track_uri.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

In [27]:
f = open('data/track_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
        track_features = sp.tracks(missing_t_uri[i:i+1])
        for x in range(1):
            track_pop=pd.DataFrame([missing_t_uri[i+x]])
            track_pop['release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['pop'] = track_features['tracks'][x]["popularity"]
            csv_data = track_pop.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as e:
        r = open("extract_log.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

100%|█████████████████████████████████████████| 388/388 [01:42<00:00,  3.78it/s]


# Droping Unwanted Columns Save Space

There were still 101 from audio_features and 576 from track_features extraction that were missing from the soptify api, so I had to drop them.

In [28]:
df.dropna(axis=0,inplace=True)

In [29]:
df.isna().sum().sum()

0

In [30]:
df.columns

Index(['Unnamed: 0', 'track_uri', 'artist_uri', 'album_uri', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature',
       'Track_uri', 'Track_release_date', 'Track_pop', 'Artist_uri',
       'Artist_pop', 'Artist_genres'],
      dtype='object')

In [31]:
df.drop(columns=['Unnamed: 0', 'Track_uri','Artist_uri','type','id','uri','track_href',
                 'analysis_url'],axis=1,inplace=True)

In [32]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,3uvsVUrAaGQJCTEUR1S3Sx,0ypTT9UqAU5sZpPo5JZmjR,2kIoMj5Ht14l2PnNRa2abC,0.523,0.422,3,-10.385,0,0.027,0.783,0.00514,0.0898,0.145,129.876,242060,4,2016-02-22,0,35,british_singer-songwriter


In [33]:
df.shape

(300528, 20)

## Data Preprocessing

Create five point buckets for track and artist popularity .

and 50 point buckets for the track release date.

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300528 entries, 0 to 300917
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   track_uri           300528 non-null  object
 1   artist_uri          300528 non-null  object
 2   album_uri           300528 non-null  object
 3   danceability        300528 non-null  object
 4   energy              300528 non-null  object
 5   key                 300528 non-null  object
 6   loudness            300528 non-null  object
 7   mode                300528 non-null  object
 8   speechiness         300528 non-null  object
 9   acousticness        300528 non-null  object
 10  instrumentalness    300528 non-null  object
 11  liveness            300528 non-null  object
 12  valence             300528 non-null  object
 13  tempo               300528 non-null  object
 14  duration_ms         300528 non-null  object
 15  time_signature      300528 non-null  object
 16  Tr

In [36]:
df['Track_pop'] = df['Track_pop'].apply(lambda x: int(x)/5)
df['Artist_pop'] = df['Artist_pop'].apply(lambda x: int(x)/5)

In [37]:
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: x.split('-')[0])
df['Track_release_date']=df['Track_release_date'].astype('int16')
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: int(x/50))

In [38]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,3uvsVUrAaGQJCTEUR1S3Sx,0ypTT9UqAU5sZpPo5JZmjR,2kIoMj5Ht14l2PnNRa2abC,0.523,0.422,3,-10.385,0,0.027,0.783,0.00514,0.0898,0.145,129.876,242060,4,40,0.0,7.0,british_singer-songwriter


In [39]:
df.shape

(300528, 20)

In [40]:
df.to_csv('data/1M_unique_processed_data.csv',index=False)