# Track Data Collection Notebook

In [65]:
from os import getenv
from sqlalchemy import create_engine, update
from sqlalchemy.ext.declarative import declarative_base
from dotenv import load_dotenv
from sqlalchemy.orm import sessionmaker
import spotipy
from spotipy.client import SpotifyException
import pandas as pd
from collections import Counter
from gensim.utils import tokenize
import time
import tqdm
import json

# Load the variables from the local .env file into the process environment
# before any of them are read below.
load_dotenv()

SPOTIFY_CLIENT_ID = getenv('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = getenv('SPOTIFY_CLIENT_SECRET')
uri = getenv('uri')  # redirect URI; must match the one in the Spotify app dashboard

# OAuth scopes requested for this session.
scope = 'playlist-modify-public user-library-read user-top-read'

# No cache_path is passed to the auth manager; the previously used value was
# '../.user_cache' and can be added to the kwargs below if needed again.
oauth_kwargs = dict(
    username='',
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
    scope=scope,
    redirect_uri=uri,
)
spot_cc = spotipy.oauth2.SpotifyOAuth(**oauth_kwargs)

# Authenticated client used by every API call in the rest of the notebook.
spot = spotipy.Spotify(auth_manager=spot_cc)

## Gathering user playlists

In [7]:
# Collect the ID of every playlist owned by each of the users below, following
# the API's pagination until a page reports no 'next' link.

playlist_ids = []
users_lst = ['spotify', '37t3cvb5u3o97hin4bsj40abw', 'dlanguren', 'gabriela_ayala19', 'rueics5ld3iok5kotetlsly8h']
for user in users_lst:
    page = spot.user_playlists(user)
    while page:
        playlist_ids.extend(entry['id'] for entry in page['items'])
        # Move on to the next page, or stop the loop for this user.
        page = spot.next(page) if page['next'] else None

In [8]:
# Total number of playlist IDs collected across every user in users_lst.
len(playlist_ids)

1454

## Getting tracks from the playlists

In [9]:
# Walk every playlist page by page and collect the track IDs it contains.
# Entries whose track (or track id) is missing are skipped.
trx = []
for playlist_id in playlist_ids:
    offset = 0
    while True:
        page = spot.playlist_tracks(playlist_id,
                                    offset=offset,
                                    fields='items.track.id')
        entries = page['items']
        if not entries:
            # An empty page means the playlist has been fully read.
            break
        offset += len(entries)
        trx.extend(
            entry['track']['id']
            for entry in entries
            if entry['track'] is not None and entry['track']['id'] is not None
        )

In [10]:
# Remove None track IDs from the trx list, reporting the position of each one.
#
# The positions are reported first and the list is then filtered in a single
# pass. Popping from the list while enumerating it shifts the remaining items
# left, so the element right after every removed entry is never inspected and
# consecutive None values would be left behind.

print(len(trx))
for k, trk in enumerate(trx):
    if trk is None:
        print(k, trk)
# Slice assignment keeps the same list object, as the in-place pop did.
trx[:] = [trk for trk in trx if trk is not None]
len(trx)

102495


102495

## Gathering track audio features

In [11]:
import sys
# Put the ../api/ directory at the front of the module search path so that the
# spotify_users module (presumably project-local) can be imported from here.
sys.path.insert(0, '../api/')
from spotify_users import UserData

In [14]:
# Request audio features for every collected track ID through the UserData
# helper, reusing the authenticated spot client.
# NOTE(review): the next cell treats audio_fe as an iterable of dicts with
# None placeholders — confirm against UserData.get_audio_features.
le_test = UserData()
audio_fe = le_test.get_audio_features(track_ids=trx, spot_session=spot)

In [15]:
# Flatten the audio-feature records into row lists (lst_val_lst) plus the
# matching column names (col_lst) for building a DataFrame.
#
# col_lst is initialised once, outside the loop, and only overwritten by a
# real record. Resetting it at the top of every iteration meant a trailing
# None record left it empty (and an empty audio_fe left it undefined), which
# breaks the DataFrame construction that follows.
lst_val_lst = []
col_lst = []
for vals in audio_fe:
    if vals is None:
        # No features were returned for this track.
        continue
    col_lst = list(vals.keys())
    lst_val_lst.append(list(vals.values()))
    

In [16]:
# Build the audio-feature table and discard the columns listed in drop_col.
drop_col = ['uri', 'analysis_url', 'track_href', 'type']
test_df = pd.DataFrame(lst_val_lst, columns=col_lst).drop(drop_col, axis=1)
test_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,0.721,0.745,7,-3.508,0,0.0366,0.151,1e-06,0.372,0.699,90.988,6FuGJB290AQMAHTfjOYVaK,192745,4
1,0.736,0.802,0,-4.759,1,0.0864,0.468,0.0,0.094,0.675,144.005,7igeByaBM0MgGsgXtNxDJ7,172325,4
2,0.702,0.825,6,-3.787,0,0.0601,0.00883,0.0,0.0674,0.915,102.977,463CkQjx2Zk1yXoBuierM9,203064,4
3,0.709,0.548,10,-8.493,1,0.353,0.65,2e-06,0.133,0.543,83.995,1tkg4EHVoqnhR6iFEXb60y,160000,4
4,0.631,0.239,11,-7.071,0,0.0398,0.864,0.0,0.116,0.0927,79.859,4y4spB9m0Q6026KfkAvy9Q,149297,4


In [17]:
# Exporting dataframe of +100k songs to csv file
# test_df.to_csv(r'/Users/flanuer/Downloads/Lambda/Course_material/misc_datasets/11_10_100k_song_aud_feat.csv')

## Importing and combining data

In [99]:
# Directory holding the previously exported audio-feature CSV files.
DATA_DIR = '/Users/flanuer/Downloads/Lambda/Course_material/misc_datasets'


def _load_audio_features(filename, drop_columns):
    """Read DATA_DIR/filename as CSV and return it without drop_columns.

    'id' is kept as a regular column (not the index) because the later
    de-duplication step selects rows with subset='id'.
    """
    return pd.read_csv(DATA_DIR + '/' + filename).drop(drop_columns, axis=1)


drop_col = ['uri', 'Unnamed: 0']
drop_cols = ['track name', 'artist', 'uri', 'Unnamed: 0']

# NOTE(review): latest_df and full_df are loaded from the same file, so they
# hold identical rows. If latest_df was meant to come from a newer export,
# point it at that file; the duplicate rows are otherwise removed by the
# later drop_duplicates step.
latest_df = _load_audio_features('100k_song_aud_feat.csv', drop_col)
full_df = _load_audio_features('100k_song_aud_feat.csv', drop_col)
songs_100_df = _load_audio_features('songs100k.csv', drop_cols)

# Preview of the last frame loaded (the only output this cell displays).
songs_100_df.head(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,0.743,0.339,1,-7.678,1,0.409,0.00582,0.0,0.0812,0.118,203.927,2RM4jf1Xa9zPgMGRDiht8O,238373,4
1,0.846,0.557,8,-7.259,1,0.457,0.0244,0.0,0.286,0.371,159.009,1tHDG53xJNGsItRA3vfVgs,214800,4
2,0.603,0.723,9,-5.89,0,0.0454,0.025,0.0,0.0824,0.382,114.966,6Wosx2euFPMT14UXiWudMy,138913,4


In [100]:
# Stack all three frames row-wise; songs that appear in more than one frame
# are still duplicated at this point. The cell shows the shape of the combined
# frame followed by the shape of each input.
frames = [full_df, songs_100_df, latest_df]
first_df = pd.concat(frames)
(first_df.shape, *(frame.shape for frame in frames))

((338951, 14), (104144, 14), (130663, 14))

In [101]:
# Keep one row per track id; first_df itself is left unmodified.
df = first_df.drop_duplicates(subset='id')
df.shape

(210124, 14)

In [105]:
# Exporting the de-duplicated dataframe (~210k songs) to a csv file

# df.to_csv(r'/Users/flanuer/Downloads/Lambda/Course_material/misc_datasets/non_dup_200k_song_aud_feat.csv')