In [62]:
# imports
from connection import create_spotify_oauth, get_audio_features, get_token, get_tracks, \
    tracks_to_df, audio_features_to_df, get_artist_info, artist_info_to_df, get_album_info, \
        album_info_to_df, get_similar_artists

from itertools import chain

import json
import re

import pandas as pd
import numpy as np

In [67]:
# token and authentication variables
# Both helpers are imported from the project-local `connection` module; what they
# return is not visible from this notebook.

# NOTE(review): `sp_ouath` looks like a typo for `sp_oauth` and no cell shown here reads it;
# it is left unrenamed in case code outside this view references the name - confirm before changing
sp_ouath = create_spotify_oauth()
# token_info is handed to every API helper called in the cells below
token_info = get_token()

In [3]:
# getting playlist items

# other playlist ids to play around with:
###   5a4unKpRA7mYi2PAgKTuNW
###   7jfRuO7rW49RJnafrxsJeu
###   2YWkEfOW4TG0njIeLqFk4B
###   02dSgnwgoovYRH6RlptgDC
###   5EGXIeHfAZjPh1YeVLyG13
playlist_source = '02dSgnwgoovYRH6RlptgDC' # id of the playlist to analyse

# raw playlist items (get_tracks parses Spotipy.playlist_items())
tracks = get_tracks(playlist_source, token_info)

# tracks_to_df pulls the relevant features out of the messy list; the same song can be
# added from both a single and an album release (or from a deluxe edition), so only the
# first row per track ID is kept
tracks_df = tracks_to_df(tracks).drop_duplicates(subset=['track_id'])


In [4]:
# getting tracks' features

# get_audio_features parses Spotipy.audio_features() and yields the features of one track
# as an iterable; chaining the per-track results builds the same flat list that repeated
# list.extend calls would
tracks_features = list(chain.from_iterable(
    get_audio_features(track_id, token_info) for track_id in tracks_df['track_id']
))

# flatten the nested API data into a simple df
tracks_features_df = audio_features_to_df(tracks_features)


In [71]:
# getting tracks' artists

# One track can list several artists, and the same artist usually appears on many tracks
# of a playlist. Collect every artist id once (dict.fromkeys keeps first-seen order) so
# each artist costs a single API call instead of one call per track it appears on.
unique_artist_ids = list(dict.fromkeys(
    artist_id
    for track_artist_ids in tracks_df['artist_id']
    for artist_id in track_artist_ids
))

tracks_artists = [get_artist_info(artist_id, token_info) for artist_id in unique_artist_ids]

artists_df = artist_info_to_df(tracks_artists) # converts nested data to df

In [6]:
# getting tracks' albums

# one album lookup per row of tracks_df, in row order
tracks_albums = [get_album_info(album_id, token_info) for album_id in tracks_df['album_id']]

# flatten the nested album data into a df
albums_df = album_info_to_df(tracks_albums)

In [72]:
# transforming artist genres

def _split_genres(raw_genres):
    """Return the genres of one artist as a list of clean strings.

    Accepts a list/tuple/array of genres, a string such as "['pop', 'rock']",
    or a missing value (None / NaN). Surrounding whitespace is stripped from
    every genre so 'rock' and ' rock' are not treated as different genres.
    An artist without any genre yields [''] so that the row survives the
    explode below with an empty-string genre.
    """
    if raw_genres is None or (isinstance(raw_genres, float) and np.isnan(raw_genres)):
        genres = []
    elif isinstance(raw_genres, (list, tuple, np.ndarray)):
        # already a sequence: no need to round-trip through its string repr
        genres = [str(genre).strip() for genre in raw_genres]
    else:
        # string form: drop brackets and quotes, then split on the commas
        cleaned = re.sub(r"[\[\]'\"]", '', str(raw_genres))
        genres = [genre.strip() for genre in cleaned.split(',')]

    genres = [genre for genre in genres if genre]
    return genres or ['']

artists_df['artist_genres'] = artists_df['artist_genres'].apply(_split_genres)

# unnest the list of genres into separate rows, rest of features are duplicated
artists_df = artists_df.explode('artist_genres')

In [73]:
# getting genres to impute for artists without genres

# Split the frame in two: rows whose genre is the empty string are imputed from similar
# artists, every other row is kept untouched.
has_no_genre = artists_df['artist_genres'] == ''
artists_missing_genres = artists_df.loc[has_no_genre].copy()
artists_df = artists_df.loc[~has_no_genre]

# Impute each distinct artist once, inside the loop, so every artist gets its own genres
# (not just the last one) and an empty frame simply skips the loop.
imputed_genres = {}
for artist_id in artists_missing_genres['artist_id'].unique():
    similar_artists = get_similar_artists(artist_id, token_info) # API call

    # pool the genres of every similar artist that has any
    pooled_genres = chain.from_iterable(
        similar_artist['genres'] for similar_artist in similar_artists if similar_artist['genres']
    )

    # unique genres in a deterministic order; if the similar artists have no genres
    # either, mark the artist as not imputable instead of leaving an empty list
    # (which would explode into NaN)
    imputed_genres[artist_id] = sorted(set(pooled_genres)) or ['genre not imputable']

# mapping with a callable stores each genre list intact in its cell
artists_missing_genres['artist_genres'] = artists_missing_genres['artist_id'].map(imputed_genres.get)
artists_missing_genres = artists_missing_genres.explode('artist_genres')

# concat the missing artist genres with the original df
artists_df = pd.concat([artists_df, artists_missing_genres], ignore_index=False)

# dropping duplicate records, this occurs if a playlist has multiple tracks from the same artist
artists_df = artists_df.drop_duplicates(subset=['artist_id', 'artist_genres'])
# as before, the previous index labels are kept as a column rather than dropped
artists_df = artists_df.reset_index()

In [9]:
## REUSABLE TESTING CODE 

# with open('all_tracks_albums.json', 'w') as outfile:
#     json.dump(all_tracks_albums, outfile, indent=4)

