In [56]:
# imports
import sys

sys.path.append('../')
from connection import create_spotify_oauth, get_audio_features, get_token, get_tracks, \
    tracks_to_df, audio_features_to_df, get_artist_info, artist_info_to_df, get_album_info, \
        album_info_to_df, get_similar_artists

from itertools import chain

import json
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, Normalizer
from sklearn.feature_extraction.text import FeatureHasher

In [57]:
# token and authentication variables

sp_ouath = create_spotify_oauth()
token_info = get_token()

In [58]:
# getting playlist items

playlist_source = '28wyIcESr22fGn3pgsruag' # playlist id for playlist

# playlists id's to play around with: 
### 5tsD40JmH6fgIrguJdJ8tk ### 1
### 2cyrnsrPJDDgas7fmDIXdf ### 2
### 2YWkEfOW4TG0njIeLqFk4B ### 3
### 37i9dQZF1DX1k0OdTn3ZMX ### 4
### 3RiN6RX7g47wYy9EJm4YUn ### 5
### 65rzvC5i0fhi6J3j1o7SiX ### 6
### 7AXPGPXhV2IvOQsrPCGQMo ### 7
### 7HYfyqo0x2tBkfisNvKL7J ### 8
### 2UhVgdqKVqf9a365y1uh3V ### 9 --> errors out at cell 5 (artists_to_df, index out range for artist image url)
### 0fHyPDc90zfYysAWUklSaO ### 10 --> same with this one
### 5EGXIeHfAZjPh1YeVLyG13 ### 11
### 7jfRuO7rW49RJnafrxsJeu ### 12 --> also errors out
### 7p0dA82cE8Rt5sEyRuE8Zy ### 13 
### 3SCpXIcMkI8uelOBQ8kvNC ### 14
### 02dSgnwgoovYRH6RlptgDC ### 15
### 34uqcr7M0AKT1yfyhnZuUf ### 16
### 2G1i9U0ZY1kY8UpgPOv8f7 ### 17 --> sigh again :/
### 28wyIcESr22fGn3pgsruag ### 18


tracks = get_tracks(playlist_source, token_info) # parses Spotipy.playlist_items() to fetch playlist items
tracks_df = tracks_to_df(tracks) # extracts relevant features from messy list into a df

# drops duplicate songs based on track ID, this is useful if a song gets added from single and 
# album release; or in the case of deluxe versions of albums, etc.
tracks_df.drop_duplicates(subset=['track_id'], inplace=True)

In [59]:
# getting tracks' features

tracks_features = []

for track in tracks_df['track_id']:
    track_af = get_audio_features(track, token_info) # parses Spotipy.audio_features() to fetch track features
    tracks_features.extend(track_af) # extends each track's features into the empty list init'd before
    
tracks_features_df = audio_features_to_df(tracks_features) # converts nested data from API call to simple df
tracks_df = pd.merge(tracks_df, tracks_features_df, how='left')


In [60]:
# getting tracks' artists

tracks_artists = []

for artist_id in tracks_df['artist_id']:
    for i in artist_id: # since one track can have multiple artists, loop over the list of artists to pull data for each one
        artist_info = get_artist_info(i, token_info)
        tracks_artists.append(artist_info)
        print(artist_id)

artists_df = artist_info_to_df(tracks_artists) # converts nested data to df

['0NbQe5CNgh4YApOCDuHSjb']
['2CmaKO2zEGJ1NWpS1yfVGz']
['4NiJW4q9ichVqL1aUsgGAN']
['3Uobr6LgQpBbk6k4QGAb3V']
['3Uobr6LgQpBbk6k4QGAb3V']
['6bu7CtcOMWcS0BMq7snHW6']
['2vd2HnNh4pdYa9gDVHFjEu']
['5ZvwJikDgdP1PFU4PkAPVG']
['2vd2HnNh4pdYa9gDVHFjEu']
['0OgdRTPItr9dw4XYp4JJUx']
['6bu7CtcOMWcS0BMq7snHW6']
['0v2e0oznnTaIXKLGKHphfX']
['6vXYoy8ouRVib302zxaxFF']
['6MwPCCR936cYfM1dLsGVnl', '6ZLcdPSxw1fTYJlz3DLrSZ']
['6MwPCCR936cYfM1dLsGVnl', '6ZLcdPSxw1fTYJlz3DLrSZ']
['6MwPCCR936cYfM1dLsGVnl']
['4UgQ3EFa8fEeaIEg54uV5b']
['0uNj4RxFjG0iVPlZS753en']
['3et9upNERQI5IYt5jEDTxM']
['0FZcPgWI3BsFQl4rOAGSHT']
['6HZr7Fs2VfV1PYHIwo8Ylc']
['3et9upNERQI5IYt5jEDTxM']
['2vd2HnNh4pdYa9gDVHFjEu']
['0uNj4RxFjG0iVPlZS753en']
['0OgdRTPItr9dw4XYp4JJUx']
['0OgdRTPItr9dw4XYp4JJUx', '0hFOSb4sdsDLsAhwwSheWH']
['0OgdRTPItr9dw4XYp4JJUx', '0hFOSb4sdsDLsAhwwSheWH']
['0OgdRTPItr9dw4XYp4JJUx']
['3Ri4H12KFyu98LMjSoij5V']
['72iCiKwu6nu6Qq9emIwzYv', '3Ri4H12KFyu98LMjSoij5V']
['72iCiKwu6nu6Qq9emIwzYv', '3Ri4H12KFyu98LMjSoij5V']
['3Ri4H

In [61]:
# with open('all_tracks_albums.json', 'w') as outfile:
#     json.dump(tracks_artists, outfile, indent=4)

In [62]:
# getting tracks' albums

tracks_albums = []

for album_id in tracks_df['album_id']:
    album_info = get_album_info(album_id, token_info)
    tracks_albums.append(album_info)

albums_df = album_info_to_df(tracks_albums) # converts nested data to df

In [63]:
tracks_df.to_csv('../../data/raw/p18_tracks.csv')
artists_df.to_csv('../../data/raw/p18_artists.csv')
albums_df.to_csv('../../data/raw/p18_albums.csv')

In [64]:
## REUSABLE TESTING CODE 

# with open('all_tracks_albums.json', 'w') as outfile:
#     json.dump(all_tracks_albums, outfile, indent=4)



# Data Documentation (to be reworked into seperate file)

"""
    Master dataframe documentation
    
    `master_df` is the master dataframe that contains all the tracks, audio features, artists, and albums data
    post-processing.
    
    Columns:
    - track_id              identifier for each track
    - track_name            name of each track
    - album_id              identifier for each album
    - artist_id             identifier for each artist
    - artist_number         number of artists that perform the track
    - explicit              True (1) or False (0) for explicit rating
    - song_popularity       popularity of a song, value between 0 and 100
    - preview_url           url link for 30 second preview of song
    - danceability          danceability measure based on tempo, rhythm, etc. value between 0.0 and 1.0
    - energy                measure of intentisty and activity, value between 0.0 and 1.0
    - key                   key that the track is in, value ranges from -1 (no key detected) to 11
    - loudness              overall loudness of the track in decibals, value typically betwene -60 and 0 db
    - mode                  binary measure of modality of the track, major is 1 and minor is 0
    - speechiness           presence of spoken words in the track, value ranges between 0.0 and 1.0
                                [0.66, 1.0] = track probably entirely spoken words
                                [0.33, 0.66) = track may contain both music and speech
                                [0.0, 0.33) = track probably is music and other non-speech items
    - acousticness          confidence measure whether track is acoustic, value ranges between 0.0 and 1.0
    - instrumentalness      predicts whether a track contains no vocals, value ranges [0.0, 1.0]. value above 0.5
                                meant to represent instrumental tracks, confidence is higher as value approaches 1.0 
    - liveness              detects presence of audience. value > 0.8 is strong likelihood track is live; range [0.0, 1.0]
    - valence               measure of musical positiveness in track, value ranges between 0.0 and 1.0
    - tempo                 overall estimated tempo of the track (BPM)
    - duration_ms           length of the track in milliseconds (ms) 
    - time_signature        estimated time signature of the track, value ranges between 3 and 7
    - index column          index column (not reset, so values not unique)
    - artist_name           name of artists performing the track
    - artist_followers      total number of followers of the artists
    - artist_popularity     popularity measure of artist, value [0, 100]. calcualted using popularity of all artist's tracks
    - artist_genres         genres associated with the artists
    - artist_img_300        300x300 image url of artist profile
    - album_name            name of album
    - album_type            type of album: single, album, compilation
    - album_label           label that own the rights to the album
    - album_popularity      popularity measure of album, value between 0 and 100
    - album_release_date    release date of album (YYYY-MM-DD), precision may vary by track, only year (YYYY) info is retained
    - album_tracks          number of tracks on the album
    - album_cover_640       640x640 image url of album cover
    - album_cover_300       300x300 image url of album cover
    - album_cover_64        64x64 image url of album cover      
    
    Notes:
    
    Tracks may have multiple artists associated with them. In such cases, the row is exploded so each row represents
    a single artist for the track, rest of row is duplicated and one track may be duplicated across rows.
    
    Artists may have multiple genres associated with them. In such cases, the rows are exploded are with the remainder 
    of values being duplicated, so each row will represent one artist and one genre. That's why some tracks will have several
    rows for exploded artists and genres.
    
    In the event that an artist has no genre associated (too new or too small), similar artists' genres are fetched and are
    used to impute the missing genres for the artists. In the event that even similar artists' genre imputing does not work
    (similar artists are also too small/too new), it is marked off as 'genre not imputable'.
"""