In [1]:
import pandas as pd
import requests
import io
from datetime import date, timedelta
import base64
import os
from collections import OrderedDict

In [2]:
start_date = date(2022, 1, 1)
end_date = date(2024, 8, 8)

current_date = start_date
billboard_dfs = []
while(current_date <= end_date):
    billboard_data = requests.get(f"https://raw.githubusercontent.com/utdata/rwd-billboard-data/main/data-scraped/hot-100/{current_date.year}/{current_date.strftime('%Y-%m-%d')}.csv").content
    current_billboard_df = pd.read_csv(io.StringIO(billboard_data.decode("utf-8"))).loc[:9]
    billboard_dfs.append(current_billboard_df)
    current_date += timedelta(days=7)

billboard_df = pd.concat(billboard_dfs).reset_index()[["chart_week", "current_week", "title", "performer"]]
billboard_df

Unnamed: 0,chart_week,current_week,title,performer
0,2022-01-01,1,All I Want For Christmas Is You,Mariah Carey
1,2022-01-01,2,Rockin' Around The Christmas Tree,Brenda Lee
2,2022-01-01,3,Jingle Bell Rock,Bobby Helms
3,2022-01-01,4,A Holly Jolly Christmas,Burl Ives
4,2022-01-01,5,Easy On Me,Adele
...,...,...,...,...
1355,2024-08-03,6,Too Sweet,Hozier
1356,2024-08-03,7,Please Please Please,Sabrina Carpenter
1357,2024-08-03,8,Lose Control,Teddy Swims
1358,2024-08-03,9,Beautiful Things,Benson Boone


In [3]:
client_id = "3cdad55fbddf484aa218ad245bad2c7c"
client_secret = "853507ba61bb42e2aae13ed935b548e4"
credentials = f"{client_id}:{client_secret}"
base64_credentials = base64.b64encode(credentials.encode()).decode()

auth_options = {
    "headers": {"Authorization": f"Basic {base64_credentials}"},
    "data": {"grant_type": "client_credentials"}
}
auth_response = requests.post("https://accounts.spotify.com/api/token", headers=auth_options["headers"], data=auth_options["data"]).json()
access_token = auth_response["access_token"]
access_token

'BQDoKR3wFB4aMUt7Y0hpnhZGX_wtQXpsZmqykPKYSntsHhqE9MiXUmlrWUf-D03aMCASCoA0K-VrXq04tnldlVy-J8BwnVXOo8EU-YkUxEht_qfHysY'

In [6]:
track_info = OrderedDict()
def find_track_id(song):
    name = song["title"]
    artist = song["performer"].split("X")[0].split("&")[0].strip()

    if((name, artist) not in track_info):
        search_response = requests.get(
            "https://api.spotify.com/v1/search",
            headers={"Authorization": f"Bearer {access_token}"},
            params={
                "q":  f"track:{name} artist:{artist}",
                "type": "track",
                "limit": 1
            }
        ).json()
        track_info[(name, artist)] = {"id": search_response["tracks"]["items"][0]["id"]}

billboard_df.loc[:100].apply(find_track_id, axis=1) # remove .loc[:100] in prod
len(track_info)

24

In [7]:
track_ids = [x["id"] for x in track_info.values()]
audio_features = []
for i in range(0, len(track_ids), 100):
    chunk = track_ids[i:i + 100]
    
    audio_features_response = requests.get(
        "https://api.spotify.com/v1/audio-features",
        headers={"Authorization": f"Bearer {access_token}"},
        params={"ids": ",".join(chunk)}
    ).json()

    audio_features += audio_features_response["audio_features"]

for i, k in enumerate(track_info.keys()):
    track_info[k] = audio_features[i]

track_info

OrderedDict([(('All I Want For Christmas Is You', 'Mariah Carey'),
              {'danceability': 0.336,
               'energy': 0.627,
               'key': 7,
               'loudness': -7.463,
               'mode': 1,
               'speechiness': 0.0384,
               'acousticness': 0.164,
               'instrumentalness': 0,
               'liveness': 0.0708,
               'valence': 0.35,
               'tempo': 150.273,
               'type': 'audio_features',
               'id': '0bYg9bo50gSsH3LtXe2SQn',
               'uri': 'spotify:track:0bYg9bo50gSsH3LtXe2SQn',
               'track_href': 'https://api.spotify.com/v1/tracks/0bYg9bo50gSsH3LtXe2SQn',
               'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0bYg9bo50gSsH3LtXe2SQn',
               'duration_ms': 241107,
               'time_signature': 4}),
             (("Rockin' Around The Christmas Tree", 'Brenda Lee'),
              {'danceability': 0.598,
               'energy': 0.47,
             

In [11]:
def add_feature(song, feature):
    name = song["title"]
    artist = song["performer"].split("X")[0].split("&")[0].strip()
    return track_info[(name, artist)][feature] if (name, artist) in track_info else None

billboard_df["danceability"] = billboard_df.apply(add_feature, args=("danceability",), axis=1)
billboard_df["energy"] = billboard_df.apply(add_feature, args=("energy",), axis=1)
billboard_df["loudness"] = billboard_df.apply(add_feature, args=("loudness",), axis=1)
billboard_df["speechiness"] = billboard_df.apply(add_feature, args=("speechiness",), axis=1)
billboard_df["acousticness"] = billboard_df.apply(add_feature, args=("acousticness",), axis=1)
billboard_df["liveness"] = billboard_df.apply(add_feature, args=("liveness",), axis=1)
billboard_df["tempo"] = billboard_df.apply(add_feature, args=("tempo",), axis=1)

billboard_df

Unnamed: 0,chart_week,current_week,title,performer,danceability,energy,loudness,speechiness,acousticness,liveness,tempo
0,2022-01-01,1,All I Want For Christmas Is You,Mariah Carey,0.336,0.627,-7.463,0.0384,0.164,0.0708,150.273
1,2022-01-01,2,Rockin' Around The Christmas Tree,Brenda Lee,0.598,0.470,-8.744,0.0496,0.617,0.5050,67.086
2,2022-01-01,3,Jingle Bell Rock,Bobby Helms,0.754,0.424,-8.463,0.0363,0.643,0.0652,119.705
3,2022-01-01,4,A Holly Jolly Christmas,Burl Ives,0.682,0.375,-13.056,0.0303,0.579,0.0760,140.453
4,2022-01-01,5,Easy On Me,Adele,0.604,0.366,-7.519,0.0282,0.578,0.1330,141.981
...,...,...,...,...,...,...,...,...,...,...,...
1355,2024-08-03,6,Too Sweet,Hozier,,,,,,,
1356,2024-08-03,7,Please Please Please,Sabrina Carpenter,,,,,,,
1357,2024-08-03,8,Lose Control,Teddy Swims,,,,,,,
1358,2024-08-03,9,Beautiful Things,Benson Boone,,,,,,,


In [13]:
gpr_data = requests.get("https://www.matteoiacoviello.com/gpr_files/data_gpr_daily_recent.xls").content

gpr_df = pd.DataFrame(pd.read_excel(io.BytesIO(gpr_data), index_col="date", parse_dates=True)) 
cleaned_gpr_df = gpr_df.loc["2022-01-01":][["GPRD", "GPRD_THREAT", "GPRD_ACT"]]
cleaned_gpr_df

Unnamed: 0_level_0,GPRD,GPRD_THREAT,GPRD_ACT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,63.096531,91.816795,31.451157
2022-01-02,37.857918,62.960087,0.000000
2022-01-03,52.343060,74.614098,14.909126
2022-01-04,88.963112,139.248215,10.434032
2022-01-05,96.482552,137.534164,82.444771
...,...,...,...
2024-08-01,139.343185,168.535599,164.171432
2024-08-02,195.215775,192.764542,279.758881
2024-08-03,106.857018,71.083969,142.037476
2024-08-04,74.719574,69.035179,99.319443
