<center> <h1> FP2 - Creating the Dataset </h1> </center>

<center> <h3> Glenn Billman, Shreya Yalamanchili, Sam Zlota </h3></center>

In the code below, we first create a dataframe of 10,000 random songs using a random search function. Using the Spotify song IDs of those songs, we then query the API for the features our project is based on. Finally, we add all of these features and the song information to a final dataframe and write it to a CSV file.

In [None]:
# adapted from https://stmorse.github.io/journal/spotify-api.html and 
# https://github.com/ZipBomb/spotify-song-suggestion/blob/master/random_song.py

In [None]:
# imports the required libraries
import base64
import json
import os
import random
import urllib

import pandas as pd
import requests
from tqdm import tqdm

# Spotify API client id and secret id
# NOTE(security): these credentials are committed in plain text, so anyone who can
# read this file can use them. Set SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET in the
# environment to override the committed defaults, and rotate the committed pair.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID', '6480491e23264b429963178e13b36b1e')
CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET', 'cbd284788bf846b6bf7fd2b870fa1da5')

# Spotify API URIs
SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"
SPOTIFY_API_BASE_URL = "https://api.spotify.com"
API_VERSION = "v1"
SPOTIFY_API_URL = "{}/{}".format(SPOTIFY_API_BASE_URL, API_VERSION)

# word list used as the pool of random search terms (first column of the file).
# dtype=str / keep_default_na=False keep every entry a plain string: with the pandas
# defaults, words such as "null" or "NA" are read as NaN and numeric-looking words
# are read as numbers.
words_txt = pd.read_csv('word_list.txt', header=None, dtype=str, keep_default_na=False)
word_list = words_txt[0].to_list()

# gets a token to access the API with our client id and secret id
def get_token():
    """Request an access token using the client-credentials grant.

    The client id and secret are sent as an HTTP Basic ``Authorization`` header
    to ``SPOTIFY_TOKEN_URL``.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: if the token endpoint answers with an error status.
        requests.RequestException: on connection failure or timeout.
        KeyError: if the response body has no ``access_token`` field.
    """
    credentials = "{}:{}".format(CLIENT_ID, CLIENT_SECRET).encode('UTF-8')
    client_token = base64.b64encode(credentials).decode('ascii')
    headers = {"Authorization": "Basic {}".format(client_token)}
    payload = {"grant_type": "client_credentials"}

    # requests has no default timeout; without one a stalled connection hangs forever
    token_request = requests.post(SPOTIFY_TOKEN_URL, data=payload, headers=headers, timeout=30)
    # fail with the real HTTP error instead of an opaque KeyError on the next line
    token_request.raise_for_status()
    return token_request.json()["access_token"]

# requests a random valid song from Spotify
def request_valid_song(access_token, iterations):
    """Collect randomly chosen tracks through the search endpoint.

    For each song a random word from ``word_list`` is searched at a random
    offset (0-200) and one track is picked at random from the returned items.
    A search term gets up to 10 attempts; if none yields a track, the term is
    dropped and a new one is drawn, so every returned row is a track that was
    actually found for its search term.

    Args:
        access_token: bearer token used for the search requests.
        iterations: number of songs to collect.

    Returns:
        A DataFrame with columns ``track``, ``artist``, ``id`` and
        ``search term``. It has ``iterations`` rows unless the search kept
        failing (see below), in which case it holds the songs found so far.
    """
    columns = ['track', 'artist', 'id', 'search term']
    authorization_header = {"Authorization": "Bearer {}".format(access_token)}
    max_attempts = 10       # cap the max number of requests per search term
    max_failed_terms = 25   # consecutive dead search terms tolerated before giving up
    rows = []               # collected as plain lists; DataFrame.append is gone in pandas 2.0
    failed_terms = 0

    # the context manager closes the progress bar even if an error escapes
    with tqdm(total=iterations, position=0, leave=True) as pbar:
        while len(rows) < iterations:
            # search term for random search
            search_term = random.choice(word_list)

            for _ in range(max_attempts):
                try:
                    # params= lets requests URL-encode the search term
                    song_request = requests.get(
                        '{}/search'.format(SPOTIFY_API_URL),
                        params={
                            'q': search_term,
                            'type': 'track',
                            'offset': random.randint(0, 200),
                        },
                        headers=authorization_header,
                        timeout=30,
                    )
                    song_info = random.choice(song_request.json()['tracks']['items'])
                    row = [
                        song_info['name'],
                        song_info['artists'][0]['name'],
                        song_info['id'],
                        search_term,
                    ]
                    break
                except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
                    # not all words will bring up a song (empty item list, error
                    # payload, network hiccup), so just try again
                    continue
            else:
                # every attempt failed: do not record a stale or undefined song
                failed_terms += 1
                if failed_terms >= max_failed_terms:
                    # e.g. an expired token makes every search fail; stop instead
                    # of looping forever and keep what was collected
                    print('stopping early: {} consecutive search terms returned no song'.format(
                        failed_terms))
                    break
                continue

            failed_terms = 0
            rows.append(row)
            pbar.update(1)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# requests a fresh access token and collects a 1,000-song random sample
songs_1000 = request_valid_song(get_token(),1000) # get 1000 songs

In [None]:
# requests a fresh access token and collects the 10,000-song random sample
songs_10000 = request_valid_song(get_token(),10000) # get 10000 songs

In [None]:
# write both song dataframes to csv files in the working directory
# (index=False is not passed, so the dataframe index is written as the first column)
songs_10000.to_csv('10000_songs.csv')
songs_1000.to_csv('1000_songs.csv')

In [None]:
# gets the song traits (attributes) of the song ids given a dataframe with the ids
def get_traits(df):
    """Fetch the audio features of every song in ``df``.

    One ``audio-features/<id>`` request is made per row. Songs whose features
    cannot be retrieved are skipped and their id is printed.

    Args:
        df: DataFrame with ``artist``, ``track`` and ``id`` columns.

    Returns:
        A DataFrame with columns ``artist``, ``track``, ``id`` followed by the
        audio features, one row per song whose features were found.

    Raises:
        KeyError: if ``df`` lacks one of the required columns.
    """
    feature_names = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'duration_ms']
    columns = ['artist', 'track', 'id'] + feature_names

    # reuse the shared token helper instead of repeating the auth request here
    headers = {'Authorization': 'Bearer {token}'.format(token=get_token())}

    # base URL of all Spotify API endpoints
    base_url = 'https://api.spotify.com/v1/'

    rows = []  # collected as plain lists; DataFrame.append is gone in pandas 2.0

    with tqdm(total=len(df), position=0, leave=True) as pbar:
        # iterate over the values themselves: positional, so it works whatever
        # the dataframe's index labels are
        for artist, track, track_id in zip(df['artist'], df['track'], df['id']):
            try:
                response = requests.get(base_url + 'audio-features/' + track_id,
                                        headers=headers, timeout=30)
                features = response.json()
                rows.append([artist, track, track_id]
                            + [features[name] for name in feature_names])
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # sometimes attributes will not be found: print the song id and continue on
                print(track_id)
            finally:
                pbar.update(1)

    return pd.DataFrame(rows, columns=columns)


# gets the popularity index of the song ids given a dataframe with the ids
def get_popularity(df):
    """Fetch the popularity index of every song in ``df``.

    One ``tracks/<id>`` request is made per row. Songs whose popularity cannot
    be retrieved are skipped and their id is printed.

    Args:
        df: DataFrame with an ``id`` column.

    Returns:
        A DataFrame with columns ``id`` and ``popularity``, one row per song
        whose popularity was found.

    Raises:
        KeyError: if ``df`` has no ``id`` column.
    """
    columns = ['id', 'popularity']

    # reuse the shared token helper instead of repeating the auth request here
    headers = {'Authorization': 'Bearer {token}'.format(token=get_token())}

    # base URL of all Spotify API endpoints
    base_url = 'https://api.spotify.com/v1/'

    rows = []  # collected as plain lists; DataFrame.append is gone in pandas 2.0

    with tqdm(total=len(df), position=0, leave=True) as pbar:
        # iterate over the values themselves: positional, so it works whatever
        # the dataframe's index labels are
        for track_id in df['id']:
            try:
                response = requests.get(base_url + 'tracks/' + track_id,
                                        headers=headers, timeout=30)
                rows.append([track_id, response.json()['popularity']])
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # sometimes the index will not be found: print the song id and continue on
                print(track_id)
            finally:
                pbar.update(1)

    return pd.DataFrame(rows, columns=columns)

In [None]:
traits_10000 = get_traits(songs_10000) # gets the attributes of the 10,000 songs

In [None]:
traits_10000.to_csv('traits_10000.csv') # writes the attributes dataframe to a csv file

In [None]:
pop_10000 = get_popularity(songs_10000) # gets the popularity index of each of the 10,000 songs

In [None]:
# merges the two dataframes on the song id, keeping every row of traits_10000.
# The random search can return the same song more than once; ids repeated on both
# sides would make the merge many-to-many and multiply those rows, so the
# popularity frame is reduced to one row per id first.
both_songs_10000 = pd.merge(traits_10000, pop_10000.drop_duplicates(subset='id'), how='left', on='id')

In [None]:
both_songs_10000.to_csv('song_info_10000.csv') # writes the final merged dataframe to a csv file