In [1]:
# imports
import pandas as pd

In [2]:
# load the Billboard Hot 100 weekly chart file; the preview below shows rows
# dating back to the 1960s, so this is the full history, not a 2018-2020 extract
# (the restriction to recent weeks happens in a later cell)
path = '../data/billboardSongsWeekly.csv'
billboard_df = pd.read_csv(filepath_or_buffer=path)
billboard_df.head(n=5)

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
0,http://www.billboard.com/charts/hot-100/1963-0...,6/1/1963,11,Still,Bill Anderson,StillBill Anderson,1.0,17.0,11.0,8.0
1,http://www.billboard.com/charts/hot-100/1967-0...,1/7/1967,11,Coming Home Soldier,Bobby Vinton,Coming Home SoldierBobby Vinton,1.0,17.0,11.0,8.0
2,http://www.billboard.com/charts/hot-100/1971-0...,7/3/1971,11,She's Not Just Another Woman,The 8th Day,She's Not Just Another WomanThe 8th Day,1.0,17.0,11.0,8.0
3,http://www.billboard.com/charts/hot-100/1975-1...,11/29/1975,11,Saturday Night,Bay City Rollers,Saturday NightBay City Rollers,1.0,17.0,11.0,8.0
4,http://www.billboard.com/charts/hot-100/1979-0...,9/29/1979,11,Pop Muzik,M,Pop MuzikM,1.0,17.0,11.0,8.0


In [3]:
# count missing values per column (one total for each column of the frame)
billboard_df.isnull().sum(axis=0)

url                           0
WeekID                        0
Week Position                 0
Song                          0
Performer                     0
SongID                        0
Instance                   5200
Previous Week Position    35984
Peak Position              5200
Weeks on Chart             5200
dtype: int64

In [4]:
# list the distinct values present in the Instance column (NaN included)
billboard_df.loc[:, 'Instance'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8., nan,  9.])

In [5]:
# replace missing numeric chart stats with 0
# Only the four numeric columns that actually contain nulls are filled. A blanket
# fillna(0) would also write a numeric 0 into text/date columns (url, WeekID,
# Song, ...) if they ever had gaps, hiding the problem from the null re-check.
# NOTE(review): 0 is not a real chart position; it acts as a "no value" sentinel
# here (presumably e.g. a song's first week has no previous position) - confirm
# downstream analysis treats it that way.
billboard_df = billboard_df.fillna({
    "Instance": 0,
    "Previous Week Position": 0,
    "Peak Position": 0,
    "Weeks on Chart": 0,
})

In [6]:
# confirm that no column has missing values left after the fill
billboard_df.isnull().sum(axis='index')

url                       0
WeekID                    0
Week Position             0
Song                      0
Performer                 0
SongID                    0
Instance                  0
Previous Week Position    0
Peak Position             0
Weeks on Chart            0
dtype: int64

In [7]:
# convert the WeekID column from text to datetime
# The raw values are month/day/year strings (e.g. '6/1/1963'), so the format is
# stated explicitly: parsing no longer depends on format inference, and any
# value that does not match raises instead of being guessed at.
billboard_df["WeekID"] = pd.to_datetime(billboard_df["WeekID"], format="%m/%d/%Y")

In [8]:
# cast the chart-statistic columns from float to int
# (they were loaded as floats; the NaNs they contained were filled in an earlier cell)
billboard_df = billboard_df.astype(
    dict.fromkeys(
        ["Instance", "Previous Week Position", "Peak Position", "Weeks on Chart"],
        int,
    )
)

In [9]:
# checking data types: WeekID should now be datetime64 and the four
# chart-statistic columns int64 after the conversions in the previous cells
billboard_df.dtypes

url                               object
WeekID                    datetime64[ns]
Week Position                      int64
Song                              object
Performer                         object
SongID                            object
Instance                           int64
Previous Week Position             int64
Peak Position                      int64
Weeks on Chart                     int64
dtype: object

In [10]:
# keep only chart weeks dated 2017-12-31 or later (i.e. drop pre-2018 data)
# The datetime column is compared directly; pandas parses the cutoff string as
# a timestamp. This avoids converting every row to a Python date and then to
# text for a lexicographic comparison, which is slow and would let a missing
# date through (it would be compared as the text 'NaT').
# NOTE(review): the cutoff is inclusive of 2017-12-31 itself - confirm that is
# intended if a chart week can fall on that day.
bb_subset = billboard_df[billboard_df['WeekID'] >= '2017-12-31']
bb_subset.head()

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
17,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,11,Dance Monkey,Tones And I,Dance MonkeyTones And I,1,19,11,8
28,https://www.billboard.com/charts/hot-100/2018-...,2018-04-07,12,Pray For Me,The Weeknd & Kendrick Lamar,Pray For MeThe Weeknd & Kendrick Lamar,1,10,7,8
322,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,55,Into The Unknown,Idina Menzel & AURORA,Into The UnknownIdina Menzel & AURORA,1,0,55,1
373,https://www.billboard.com/charts/hot-100/2018-...,2018-02-03,56,Mine,Bazzi,MineBazzi,1,0,56,1
374,https://www.billboard.com/charts/hot-100/2018-...,2018-03-24,56,Indica Badu,Logic Featuring Wiz Khalifa,Indica BaduLogic Featuring Wiz Khalifa,1,0,56,1


In [11]:
# checking shape of df: (rows, columns) of the date-filtered subset
bb_subset.shape

(15600, 10)

In [12]:
# renumber the rows 0..n-1; the old labels left over from the unfiltered frame
# are discarded rather than kept as a column
bb_subset = bb_subset.reset_index(level=None, drop=True)
bb_subset.head(n=5)

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
0,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,11,Dance Monkey,Tones And I,Dance MonkeyTones And I,1,19,11,8
1,https://www.billboard.com/charts/hot-100/2018-...,2018-04-07,12,Pray For Me,The Weeknd & Kendrick Lamar,Pray For MeThe Weeknd & Kendrick Lamar,1,10,7,8
2,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,55,Into The Unknown,Idina Menzel & AURORA,Into The UnknownIdina Menzel & AURORA,1,0,55,1
3,https://www.billboard.com/charts/hot-100/2018-...,2018-02-03,56,Mine,Bazzi,MineBazzi,1,0,56,1
4,https://www.billboard.com/charts/hot-100/2018-...,2018-03-24,56,Indica Badu,Logic Featuring Wiz Khalifa,Indica BaduLogic Featuring Wiz Khalifa,1,0,56,1


In [14]:
# rename columns to short snake_case names
# An explicit old -> new mapping is used instead of assigning a positional list
# to .columns, so a change in the source column order cannot silently attach
# the wrong label to a column. 'url' already has its final name and is left out.
# Re-running this cell is a no-op because names not found are ignored.
bb_subset = bb_subset.rename(columns={
    'WeekID': 'date',
    'Week Position': 'chart_position',
    'Song': 'song',
    'Performer': 'performer',
    'SongID': 'songID',
    'Instance': 'instance',
    'Previous Week Position': 'previous_position',
    'Peak Position': 'peak',
    'Weeks on Chart': 'weeks_on_chart',
})
bb_subset.head()

Unnamed: 0,url,date,chart_position,song,performer,songID,instance,previous_position,peak,weeks_on_chart
0,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,11,Dance Monkey,Tones And I,Dance MonkeyTones And I,1,19,11,8
1,https://www.billboard.com/charts/hot-100/2018-...,2018-04-07,12,Pray For Me,The Weeknd & Kendrick Lamar,Pray For MeThe Weeknd & Kendrick Lamar,1,10,7,8
2,https://www.billboard.com/charts/hot-100/2019-...,2019-12-07,55,Into The Unknown,Idina Menzel & AURORA,Into The UnknownIdina Menzel & AURORA,1,0,55,1
3,https://www.billboard.com/charts/hot-100/2018-...,2018-02-03,56,Mine,Bazzi,MineBazzi,1,0,56,1
4,https://www.billboard.com/charts/hot-100/2018-...,2018-03-24,56,Indica Badu,Logic Featuring Wiz Khalifa,Indica BaduLogic Featuring Wiz Khalifa,1,0,56,1


In [16]:
# put the descriptive columns (song, performer, date) first, then the chart
# statistics, and the identifier/url columns last
bb_subset = bb_subset.loc[
    :,
    [
        'song', 'performer', 'date',
        'chart_position', 'previous_position', 'peak', 'weeks_on_chart',
        'instance', 'songID', 'url',
    ],
]
bb_subset.head(n=5)

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,instance,songID,url
0,Dance Monkey,Tones And I,2019-12-07,11,19,11,8,1,Dance MonkeyTones And I,https://www.billboard.com/charts/hot-100/2019-...
1,Pray For Me,The Weeknd & Kendrick Lamar,2018-04-07,12,10,7,8,1,Pray For MeThe Weeknd & Kendrick Lamar,https://www.billboard.com/charts/hot-100/2018-...
2,Into The Unknown,Idina Menzel & AURORA,2019-12-07,55,0,55,1,1,Into The UnknownIdina Menzel & AURORA,https://www.billboard.com/charts/hot-100/2019-...
3,Mine,Bazzi,2018-02-03,56,0,56,1,1,MineBazzi,https://www.billboard.com/charts/hot-100/2018-...
4,Indica Badu,Logic Featuring Wiz Khalifa,2018-03-24,56,0,56,1,1,Indica BaduLogic Featuring Wiz Khalifa,https://www.billboard.com/charts/hot-100/2018-...


In [18]:
# load the track-level dataset (audio features, popularity, release date per track)
path2 = '../data/tracks20102021.csv'
tracks = pd.read_csv(filepath_or_buffer=path2)
tracks.head(n=5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['The Toys'],['6lH5PpuiMa5SpfjoIOlwCS'],3/13/2020,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.269,0.129,7,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,3
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,['Joni Mitchell'],['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.644,0.212,11,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,3
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,['Joni Mitchell'],['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.627,0.184,1,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,4
