# Preparing the "Basic" Billboard Dataset
#### 11/01/23
In this notebook, I use the previously created JSON file (`all_playlist_data.json`) to build the basic dataset.

## Reading in the data

In [1]:
import pandas as pd
import json

In [3]:
def load_from_json(file_name):
    """Read a JSON file and return its parsed contents.

    Parameters
    ----------
    file_name : str or path-like
        Path of the JSON file to read. The file is decoded as UTF-8.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Open the path supplied by the caller rather than a fixed file name, so
    # the function can load any JSON file it is pointed at.
    with open(file_name, 'r', encoding='utf-8') as f:
        return json.load(f)

# Parse the playlist JSON produced earlier into an in-memory Python object.
all_playlist_data = load_from_json(file_name='all_playlist_data.json')

In [4]:
def get_track_info(track_data):
    """Flatten one track dictionary into a tuple of basic fields.

    Parameters
    ----------
    track_data : dict
        A track record providing the keys ``id``, ``name``, ``artists``
        (a non-empty list of dicts with ``name``), ``album`` (a dict with
        ``name`` and ``release_date``), ``popularity``, ``duration_ms`` and
        ``explicit``.

    Returns
    -------
    tuple
        ``(track_id, title, artist, album, release_date, popularity,
        duration_ms, explicit)``. Only the first listed artist is kept.

    Raises
    ------
    KeyError
        If any of the expected keys is missing.
    IndexError
        If ``artists`` is an empty list.
    """
    # Fields are looked up left to right, in the same order as the returned
    # tuple, so the first missing key is the one reported.
    return (
        track_data['id'],
        track_data['name'],
        track_data['artists'][0]['name'],  # first credited artist only
        track_data['album']['name'],
        track_data['album']['release_date'],
        track_data['popularity'],
        track_data['duration_ms'],
        track_data['explicit'],
    )

In [5]:
def get_playlist_info(playlist_items):
    """Extract the basic track fields for every item of one playlist.

    Parameters
    ----------
    playlist_items : list
        Playlist entries; each entry is a dict whose ``'track'`` value is the
        track dictionary passed to ``get_track_info``.

    Returns
    -------
    list
        One ``get_track_info`` tuple per entry, in playlist order.
    """
    track_rows = []
    for entry in playlist_items:
        track_rows.append(get_track_info(entry['track']))
    return track_rows

In [6]:
# Collect the track rows of the first 23 playlists into one flat list.
# NOTE(review): 23 presumably means one playlist per chart year 2000-2022
# (the year column is later derived with a 2000 offset) - confirm.
all_songs_data = []
for playlist_idx in range(23):
    playlist_items = all_playlist_data[playlist_idx]['items']
    all_songs_data.extend(get_playlist_info(playlist_items))

In [7]:
# One row per collected track tuple; columns are still positional (0-7) here.
df = pd.DataFrame(data=all_songs_data)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3y4LxiYMgDl4RethdzpmNe,Breathe,Faith Hill,Breathe,1999-11-09,69,250546,False
1,0n2SEXB2qoRQg171q7XqeW,Smooth (feat. Rob Thomas),Santana,Supernatural (Remastered),1999-06-15,73,294986,False
2,3XKIUb7HzIF1Vu9usunMzc,Maria Maria (feat. The Product G&B),Santana,Supernatural (Remastered),1999-06-15,79,261973,False
3,1m2xMsxbtxv21Brome189p,I Wanna Know,Joe,My Name Is Joe,2000-04-18,69,296693,False
4,4cKGldbhGJniI8BrB3K6tb,Everything You Want,Vertical Horizon,Everything You Want,1999-06-14,64,257066,False


## Pre-processing
All we have to do now is name the columns and add the `year` and `ranking` columns.

In [8]:
# Give the positional columns names, in the order get_track_info returns them.
df.columns = [
    'track_id',
    'title',
    'artist',
    'album',
    'release_date',
    'popularity',
    'duration_ms',
    'explicit',
]
df.head(n=5)

Unnamed: 0,track_id,title,artist,album,release_date,popularity,duration_ms,explicit
0,3y4LxiYMgDl4RethdzpmNe,Breathe,Faith Hill,Breathe,1999-11-09,69,250546,False
1,0n2SEXB2qoRQg171q7XqeW,Smooth (feat. Rob Thomas),Santana,Supernatural (Remastered),1999-06-15,73,294986,False
2,3XKIUb7HzIF1Vu9usunMzc,Maria Maria (feat. The Product G&B),Santana,Supernatural (Remastered),1999-06-15,79,261973,False
3,1m2xMsxbtxv21Brome189p,I Wanna Know,Joe,My Name Is Joe,2000-04-18,69,296693,False
4,4cKGldbhGJniI8BrB3K6tb,Everything You Want,Vertical Horizon,Everything You Want,1999-06-14,64,257066,False


In [9]:
# Label each row with the chart year of its playlist and its position inside
# that playlist.
#
# The labels are derived from the real playlist sizes rather than from
# `index // 100` and `index % 100`: that arithmetic is only right when every
# playlist holds exactly 100 tracks, and the frame has 2299 rows (last index
# 2298) instead of 2300, so at least one playlist is shorter. With index
# arithmetic, every row after a short playlist could get the wrong year and
# ranking.
#
# NOTE(review): assumes the playlists are stored in chart-year order starting
# at 2000, and that the position inside a playlist equals the chart ranking.
# A track missing from a playlist still shifts the later ranks of that one
# year - confirm against the source charts.
years, rankings = [], []
for chart_year, playlist in enumerate(all_playlist_data, start=2000):
    if len(years) >= len(df):
        break  # playlists that were not loaded into df get no labels
    n_tracks = len(playlist['items'])
    years.extend([chart_year] * n_tracks)
    rankings.extend(range(1, n_tracks + 1))

# Fail loudly rather than attach misaligned labels.
if len(years) != len(df):
    raise ValueError(
        f"playlist sizes cover {len(years)} rows but df has {len(df)} rows"
    )

df['year'] = years
df['ranking'] = rankings
df

Unnamed: 0,track_id,title,artist,album,release_date,popularity,duration_ms,explicit,year,ranking
0,3y4LxiYMgDl4RethdzpmNe,Breathe,Faith Hill,Breathe,1999-11-09,69,250546,False,2000,1
1,0n2SEXB2qoRQg171q7XqeW,Smooth (feat. Rob Thomas),Santana,Supernatural (Remastered),1999-06-15,73,294986,False,2000,2
2,3XKIUb7HzIF1Vu9usunMzc,Maria Maria (feat. The Product G&B),Santana,Supernatural (Remastered),1999-06-15,79,261973,False,2000,3
3,1m2xMsxbtxv21Brome189p,I Wanna Know,Joe,My Name Is Joe,2000-04-18,69,296693,False,2000,4
4,4cKGldbhGJniI8BrB3K6tb,Everything You Want,Vertical Horizon,Everything You Want,1999-06-14,64,257066,False,2000,5
...,...,...,...,...,...,...,...,...,...,...
2295,2ccuOtUjIyx3tPcsnpeBzJ,Flower Shops (feat. Morgan Wallen),ERNEST,Flower Shops (feat. Morgan Wallen),2021-12-31,72,214405,False,2022,96
2296,5vUnjhBzRJJIAOJPde6zDx,TO THE MOON,JNR CHOI,TO THE MOON,2021-11-06,72,152137,True,2022,97
2297,3nqQXoyQOWXiESFLlDF1hG,Unholy (feat. Kim Petras),Sam Smith,Unholy (feat. Kim Petras),2022-09-22,86,156943,False,2022,98
2298,5ekA7j4MPQa3NZbZQSpRfF,One Mississippi,Kane Brown,Different Man,2022-09-09,62,214293,False,2022,99


In [71]:
# Persist the basic dataset as CSV; the row index is not written out.
df.to_csv(path_or_buf='billboard_basic.csv', index=False)

## Next steps

Now we have the basic dataset. Next, we'll make the advanced dataset, which will include audio features.