In [2]:
# imports
import pandas as pd
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
# reading in 2021 csv
bb2021 = pd.read_csv('../data/newbb21.csv')
bb2021.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF
0,Kings & Queens,Ava Max,2021-01-02,41,30,15,8,1
1,Champagne Night,Lady A,2021-01-02,80,0,33,6,1
2,Cry Baby,Megan Thee Stallion Featuring DaBaby,2021-01-02,67,95,28,18,1
3,Body,Megan Thee Stallion,2021-01-02,35,22,16,16,1
4,Last Christmas,Wham!,2021-01-02,9,14,9,1,1


In [4]:
# preparing spotify credentials
load_dotenv(dotenv_path='spotifyCred.env')
CLIENT_ID = os.getenv('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.getenv('SPOTIPY_CLIENT_SECRET')

In [5]:
# connecting to spotipy
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
)

In [6]:
# from the 2021 dset df
# getting only the song titles
song_titles_bb = bb2021[['song', 'performer']]
song_titles_bb.loc[:, 'performer'] = song_titles_bb.loc[:, 'performer'].apply(lambda x: x.split('Featuring')[0])
song_titles_bb.loc[:, 'performer'] = song_titles_bb.loc[:, 'performer'].apply(lambda x: x.split('&')[0])
song_titles_bb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,song,performer
0,Kings & Queens,Ava Max
1,Champagne Night,Lady A
2,Cry Baby,Megan Thee Stallion
3,Body,Megan Thee Stallion
4,Last Christmas,Wham!
...,...,...
1795,Gone,Dierks Bentley
1796,Track Star,Mooski
1797,Go!,Moneybagg Yo
1798,Therefore I Am,Billie Eilish


In [7]:
# function to get spotify uris for each 2021 song
def get_song_uri(df, col_with_song_name):
    song_list = {
        'artist': [],
        'song': [],
        'uri': []
    }

    for i, row in df.iterrows():
        song = row[f'{col_with_song_name}']
    #     artist_from_df = row['performer'].lower()
        results = spotify.search(q=song, limit=5)['tracks']['items']
        for track in results:
            song_list['artist'].append(track['artists'][0]['name'])
            song_list['song'].append(track['name'])
            song_list['uri'].append(track['uri'])
            
    return song_list

In [8]:
# applying uri funct to 2021
song_uri_df = pd.DataFrame(get_song_uri(song_titles_bb, 'song'))

In [9]:
# examining 2021 df w/ uris
song_uri_df.head()

Unnamed: 0,artist,song,uri
0,Ava Max,Kings & Queens,spotify:track:7a53HqqArd4b9NF4XAmlbI
1,Outkast,Ms. Jackson,spotify:track:0I3q5fE6wg7LIfHGngUTnV
2,Kidz Bop Kids,Kings & Queens,spotify:track:4wbEBmAhxOchkSdfltzavr
3,Sam Tinnesz,Play with Fire (feat. Yacht Money),spotify:track:7vguMCv8uVuZLiQJ156u3Z
4,Mat Kearney,Kings & Queens,spotify:track:0vy1K9FhCK8woHW7MKEcBG


In [10]:
# saving uri 2021 to csv
song_uri_df.to_csv('../data/bb_uri2021.csv')

In [11]:
# examining df shape
song_uri_df.shape

(8993, 3)

In [12]:
# getting features for each song
features = {
    'song': [],
    'artist': [],
    'features': []
}
# let code run till a break
for i, row in song_uri_df.iterrows():
    uri = row['uri']
    audio_features = spotify.audio_features(uri)[0]
    features['song'].append(row['song'])
    features['artist'].append(row['artist'])
    features['features'].append(audio_features)

In [14]:
len(features['song'])

8993

In [17]:
m = pd.DataFrame(features)
m.tail()

Unnamed: 0,song,artist,features
8988,pov,Ariana Grande,"{'danceability': 0.487, 'energy': 0.534, 'key'..."
8989,Pov,June3rd,"{'danceability': 0.871, 'energy': 0.667, 'key'..."
8990,pov,Ariana Grande,"{'danceability': 0.474, 'energy': 0.533, 'key'..."
8991,POV (feat. Rubi Rose),Wiz Khalifa,"{'danceability': 0.857, 'energy': 0.588, 'key'..."
8992,pov,Ariana Grande,"{'danceability': 0.487, 'energy': 0.534, 'key'..."


In [16]:
m.shape

(8993, 3)

In [53]:
# following code cells in event of first attempt at getting audio features exceeding api limits
# saving the first output before error to another variable
# first_bit = features

In [None]:
# following code cells in event of first attempt at getting audio features exceeding api limits
# getting features for each song, II
# features2 = {
#     'song': [],
#     'artist': [],
#     'features': []
# }

# for i, row in song_uri_df[5315:].iterrows():
#     uri = row['uri']
#     audio_features = spotify.audio_features(uri)[0]
#     features2['song'].append(row['song'])
#     features2['artist'].append(row['artist'])
#     features2['features'].append(audio_features)

In [58]:
# following code cells in event of first attempt at getting audio features exceeding api limits
# saving second output to another variable
# second_bit = features2 

In [62]:
# following code cells in event of first attempt at getting audio features exceeding api limits
# saving the chunks to indiv dfs for concatenation
# f = pd.DataFrame(first_bit)
# s = pd.DataFrame(second_bit)

In [None]:
# following code cells in event of first attempt at getting audio features exceeding api limits
# concatting the 2 dfs to get all the features for the 2021 dataset
# bb_features = pd.concat([f,s])

In [18]:
# renaming features output, renaming cols to fit with other dfs
bb_features = m
bb_features = bb_features.rename(columns={'artist': 'performer'})
bb_features

Unnamed: 0,song,performer,features
0,Kings & Queens,Ava Max,"{'danceability': 0.637, 'energy': 0.69, 'key':..."
1,Ms. Jackson,Outkast,"{'danceability': 0.843, 'energy': 0.806, 'key'..."
2,Kings & Queens,Kidz Bop Kids,"{'danceability': 0.746, 'energy': 0.793, 'key'..."
3,Play with Fire (feat. Yacht Money),Sam Tinnesz,"{'danceability': 0.573, 'energy': 0.539, 'key'..."
4,Kings & Queens,Mat Kearney,"{'danceability': 0.621, 'energy': 0.61, 'key':..."
...,...,...,...
8988,pov,Ariana Grande,"{'danceability': 0.487, 'energy': 0.534, 'key'..."
8989,Pov,June3rd,"{'danceability': 0.871, 'energy': 0.667, 'key'..."
8990,pov,Ariana Grande,"{'danceability': 0.474, 'energy': 0.533, 'key'..."
8991,POV (feat. Rubi Rose),Wiz Khalifa,"{'danceability': 0.857, 'energy': 0.588, 'key'..."


In [19]:
# expanding the features col into separate cols
feats = bb_features['features'].apply(pd.Series)

In [20]:
# recombining the features into the 2021 features df
bb_features_df = pd.concat([bb_features, feats], axis=1).drop('features', axis=1)

In [21]:
bb_features_df.head()

Unnamed: 0,song,performer,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,audio_features,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4
1,Ms. Jackson,Outkast,0.843,0.806,4,-5.946,0,0.269,0.143,0.0,0.0771,0.613,94.948,audio_features,0I3q5fE6wg7LIfHGngUTnV,spotify:track:0I3q5fE6wg7LIfHGngUTnV,https://api.spotify.com/v1/tracks/0I3q5fE6wg7L...,https://api.spotify.com/v1/audio-analysis/0I3q...,270507,4
2,Kings & Queens,Kidz Bop Kids,0.746,0.793,6,-4.105,0,0.0468,0.0138,0.0,0.0761,0.748,129.972,audio_features,4wbEBmAhxOchkSdfltzavr,spotify:track:4wbEBmAhxOchkSdfltzavr,https://api.spotify.com/v1/tracks/4wbEBmAhxOch...,https://api.spotify.com/v1/audio-analysis/4wbE...,146233,4
3,Play with Fire (feat. Yacht Money),Sam Tinnesz,0.573,0.539,2,-6.091,0,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,audio_features,7vguMCv8uVuZLiQJ156u3Z,spotify:track:7vguMCv8uVuZLiQJ156u3Z,https://api.spotify.com/v1/tracks/7vguMCv8uVuZ...,https://api.spotify.com/v1/audio-analysis/7vgu...,180691,4
4,Kings & Queens,Mat Kearney,0.621,0.61,5,-8.455,1,0.0454,0.336,0.0,0.157,0.424,139.001,audio_features,0vy1K9FhCK8woHW7MKEcBG,spotify:track:0vy1K9FhCK8woHW7MKEcBG,https://api.spotify.com/v1/tracks/0vy1K9FhCK8w...,https://api.spotify.com/v1/audio-analysis/0vy1...,189987,4


In [22]:
# renaming cols in the uri 2021 df
song_uri_df = song_uri_df.rename(columns={'artist': 'performer'})

In [23]:
song_uri_df

Unnamed: 0,performer,song,uri
0,Ava Max,Kings & Queens,spotify:track:7a53HqqArd4b9NF4XAmlbI
1,Outkast,Ms. Jackson,spotify:track:0I3q5fE6wg7LIfHGngUTnV
2,Kidz Bop Kids,Kings & Queens,spotify:track:4wbEBmAhxOchkSdfltzavr
3,Sam Tinnesz,Play with Fire (feat. Yacht Money),spotify:track:7vguMCv8uVuZLiQJ156u3Z
4,Mat Kearney,Kings & Queens,spotify:track:0vy1K9FhCK8woHW7MKEcBG
...,...,...,...
8988,Ariana Grande,pov,spotify:track:3UoULw70kMsiVXxW0L3A33
8989,June3rd,Pov,spotify:track:6HtcV6gW2qOLR4GjJQGcBf
8990,Ariana Grande,pov,spotify:track:1bj8x3ERN9gSc2NfJIpc76
8991,Wiz Khalifa,POV (feat. Rubi Rose),spotify:track:3zjxHcCNv4mw0eoWq8Cumt


In [24]:
# making a copy of the original 2021 df
hits_test = bb2021.copy()
hits_test.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF
0,Kings & Queens,Ava Max,2021-01-02,41,30,15,8,1
1,Champagne Night,Lady A,2021-01-02,80,0,33,6,1
2,Cry Baby,Megan Thee Stallion Featuring DaBaby,2021-01-02,67,95,28,18,1
3,Body,Megan Thee Stallion,2021-01-02,35,22,16,16,1
4,Last Christmas,Wham!,2021-01-02,9,14,9,1,1


In [25]:
# merging the original 2021 df with the newly obtained track features
test_merge = pd.merge(bb_features_df, hits_test)
test_merge.head()

Unnamed: 0,song,performer,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,track_href,analysis_url,duration_ms,time_signature,date,chart_position,previous_position,peak,weeks_on_chart,hitTF
0,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,...,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4,2021-01-02,41,30,15,8,1
1,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,...,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4,2021-01-09,16,41,15,8,1
2,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,...,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4,2021-01-16,15,16,15,8,1
3,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,...,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4,2021-01-23,28,15,15,8,1
4,Kings & Queens,Ava Max,0.637,0.69,1,-4.057,0,0.0405,0.00786,0.0,...,https://api.spotify.com/v1/tracks/7a53HqqArd4b...,https://api.spotify.com/v1/audio-analysis/7a53...,162399,4,2021-01-30,29,28,15,8,1


In [26]:
# removing and moving around cols
test_merge = test_merge[['song', 'performer', 'date', 'chart_position',
       'previous_position', 'peak', 'weeks_on_chart', 'hitTF', 
       'id', 'uri','danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 
       'duration_ms', 'time_signature']]

In [27]:
test_merge.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,uri,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Kings & Queens,Ava Max,2021-01-02,41,30,15,8,1,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,...,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,162399,4
1,Kings & Queens,Ava Max,2021-01-09,16,41,15,8,1,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,...,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,162399,4
2,Kings & Queens,Ava Max,2021-01-16,15,16,15,8,1,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,...,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,162399,4
3,Kings & Queens,Ava Max,2021-01-23,28,15,15,8,1,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,...,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,162399,4
4,Kings & Queens,Ava Max,2021-01-30,29,28,15,8,1,7a53HqqArd4b9NF4XAmlbI,spotify:track:7a53HqqArd4b9NF4XAmlbI,...,-4.057,0,0.0405,0.00786,0.0,0.124,0.457,129.857,162399,4


In [28]:
# sorting for aid in removing duplicates
test_merge_sorted = test_merge.sort_values(['song','date'])
test_merge_sorted.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,uri,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
18622,2 Much,Justin Bieber,2021-04-03,68,0,68,1,1,0oaY19dUwZimIgzn3ZZLZO,spotify:track:0oaY19dUwZimIgzn3ZZLZO,...,-8.601,1,0.0456,0.593,0.0,0.353,0.167,119.59,152796,4
6002,34+35,Ariana Grande,2021-01-02,40,35,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6020,34+35,Ariana Grande,2021-01-02,40,35,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6038,34+35,Ariana Grande,2021-01-02,40,35,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6056,34+35,Ariana Grande,2021-01-02,40,35,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4


In [29]:
# removing true duplicates (every bit of info the same)
test_merge_drops = test_merge_sorted.drop_duplicates(subset=['song', 'date'], keep='first')
test_merge_drops.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,uri,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
18622,2 Much,Justin Bieber,2021-04-03,68,0,68,1,1,0oaY19dUwZimIgzn3ZZLZO,spotify:track:0oaY19dUwZimIgzn3ZZLZO,...,-8.601,1,0.0456,0.593,0.0,0.353,0.167,119.59,152796,4
6002,34+35,Ariana Grande,2021-01-02,40,35,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6003,34+35,Ariana Grande,2021-01-09,19,40,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6004,34+35,Ariana Grande,2021-01-16,13,19,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
6005,34+35,Ariana Grande,2021-01-23,11,13,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4


In [30]:
# removing track duplicates (same song on different dates)
test_merge_doubledrops = test_merge_drops.drop_duplicates(subset=['song', 'performer'], keep='last')
test_merge_doubledrops.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,uri,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
18622,2 Much,Justin Bieber,2021-04-03,68,0,68,1,1,0oaY19dUwZimIgzn3ZZLZO,spotify:track:0oaY19dUwZimIgzn3ZZLZO,...,-8.601,1,0.0456,0.593,0.0,0.353,0.167,119.59,152796,4
6019,34+35,Ariana Grande,2021-05-01,26,21,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
3048,7 Summers,Morgan Wallen,2021-02-13,35,23,18,7,1,4obHzpwGrjoTuZh2DItEMZ,spotify:track:4obHzpwGrjoTuZh2DItEMZ,...,-5.13,1,0.0328,0.417,6.1e-05,0.125,0.571,203.903,210507,4
14558,865,Morgan Wallen,2021-02-20,76,77,46,5,1,0sKEilrw1GrBlG6qsTCrP4,spotify:track:0sKEilrw1GrBlG6qsTCrP4,...,-5.254,1,0.0248,0.723,0.0,0.101,0.367,88.003,190680,4
4219,Adderall (Corvette Corvette),Popp Hunna,2021-02-06,99,90,74,6,1,3aAmQOxRPeKDL0HMWFA5qn,spotify:track:3aAmQOxRPeKDL0HMWFA5qn,...,-11.158,0,0.368,0.272,0.0,0.0666,0.713,141.05,136224,4


In [32]:
# resetting index
test_merge_doubledrops = test_merge_doubledrops.reset_index(drop=True)
test_merge_doubledrops.head()

Unnamed: 0,song,performer,date,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,uri,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2 Much,Justin Bieber,2021-04-03,68,0,68,1,1,0oaY19dUwZimIgzn3ZZLZO,spotify:track:0oaY19dUwZimIgzn3ZZLZO,...,-8.601,1,0.0456,0.593,0.0,0.353,0.167,119.59,152796,4
1,34+35,Ariana Grande,2021-05-01,26,21,2,18,1,6Im9k8u9iIzKMrmV7BWtlF,spotify:track:6Im9k8u9iIzKMrmV7BWtlF,...,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4
2,7 Summers,Morgan Wallen,2021-02-13,35,23,18,7,1,4obHzpwGrjoTuZh2DItEMZ,spotify:track:4obHzpwGrjoTuZh2DItEMZ,...,-5.13,1,0.0328,0.417,6.1e-05,0.125,0.571,203.903,210507,4
3,865,Morgan Wallen,2021-02-20,76,77,46,5,1,0sKEilrw1GrBlG6qsTCrP4,spotify:track:0sKEilrw1GrBlG6qsTCrP4,...,-5.254,1,0.0248,0.723,0.0,0.101,0.367,88.003,190680,4
4,Adderall (Corvette Corvette),Popp Hunna,2021-02-06,99,90,74,6,1,3aAmQOxRPeKDL0HMWFA5qn,spotify:track:3aAmQOxRPeKDL0HMWFA5qn,...,-11.158,0,0.368,0.272,0.0,0.0666,0.713,141.05,136224,4


In [33]:
# saving result of the 2021 fully processed dataset
test_merge_doubledrops.to_csv('../data/billboardAndFeatures2021.csv', index=False)

In [34]:
# reading in the final bb df, cleaning it more
testFinal = pd.read_csv('../data/BB_FINAL.CSV')
testFinal.head()

Unnamed: 0,song_x,performer,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,weeks_on_chart_x,peak_x,chart_position_y,previous_position_y,weeks_on_chart_y,peak_y,chart_position,previous_position,weeks_on_chart,peak
0,Dance Monkey,Tones And I,0.824,0.588,6.0,-6.4,0.0,0.0924,0.692,0.000104,...,8.0,11.0,11.0,19.0,8.0,11.0,11.0,19.0,8.0,11.0
46656,Dance Monkey,Kidz Bop Kids,0.794,0.679,6.0,-5.395,0.0,0.104,0.166,2e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46657,Dance Monkey - Stripped Back,Tones And I,0.664,0.212,6.0,-7.762,0.0,0.046,0.931,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46658,I Bet You Look Good On The Dancefloor,Arctic Monkeys,0.535,0.948,6.0,-4.19,0.0,0.0356,0.00225,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46659,Pray For Me,The Weeknd,0.732,0.678,2.0,-4.977,1.0,0.0886,0.0867,2.4e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
testFinal.columns

Index(['song_x', 'performer', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri_x', 'track_href', 'analysis_url',
       'duration_ms', 'time_signature', 'uri_y', 'hitTF_x', 'hitTF_x.1',
       'chart_position_x', 'previous_position_x', 'weeks_on_chart_x', 'peak_x',
       'chart_position_y', 'previous_position_y', 'weeks_on_chart_y', 'peak_y',
       'chart_position', 'previous_position', 'weeks_on_chart', 'peak'],
      dtype='object')

In [36]:
# removing duplicate cols from the testFinal df
testFinal = testFinal[['song_x', 'performer', 'chart_position', 'previous_position', 'peak', 'weeks_on_chart',
       'hitTF_x', 'id', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature']]
testFinal.head()

Unnamed: 0,song_x,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF_x,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
46656,Dance Monkey,Kidz Bop Kids,0.0,0.0,0.0,0.0,0.0,6iCX17mrswsT0QgOclgqy3,0.794,0.679,...,-5.395,0.0,0.104,0.166,2e-06,0.134,0.547,98.16,200933.0,4.0
46657,Dance Monkey - Stripped Back,Tones And I,0.0,0.0,0.0,0.0,0.0,1ooBIqsmV1ocRldBZrvLPD,0.664,0.212,...,-7.762,0.0,0.046,0.931,0.0,0.102,0.574,83.245,171461.0,4.0
46658,I Bet You Look Good On The Dancefloor,Arctic Monkeys,0.0,0.0,0.0,0.0,0.0,29EkMZmUNz1WsuzaMtVo1i,0.535,0.948,...,-4.19,0.0,0.0356,0.00225,0.0,0.376,0.778,103.183,173680.0,4.0
46659,Pray For Me,The Weeknd,0.0,0.0,0.0,0.0,0.0,6huNf4dutXRjJyGn7f5BPS,0.732,0.678,...,-4.977,1.0,0.0886,0.0867,2.4e-05,0.112,0.196,100.626,211421.0,4.0


In [37]:
testFinal.columns

Index(['song_x', 'performer', 'chart_position', 'previous_position', 'peak',
       'weeks_on_chart', 'hitTF_x', 'id', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

In [38]:
# renaming cols
testFinal.columns = ['song', 'performer', 'chart_position', 'previous_position', 'peak',
       'weeks_on_chart', 'hitTF', 'id', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature']
testFinal.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
46656,Dance Monkey,Kidz Bop Kids,0.0,0.0,0.0,0.0,0.0,6iCX17mrswsT0QgOclgqy3,0.794,0.679,...,-5.395,0.0,0.104,0.166,2e-06,0.134,0.547,98.16,200933.0,4.0
46657,Dance Monkey - Stripped Back,Tones And I,0.0,0.0,0.0,0.0,0.0,1ooBIqsmV1ocRldBZrvLPD,0.664,0.212,...,-7.762,0.0,0.046,0.931,0.0,0.102,0.574,83.245,171461.0,4.0
46658,I Bet You Look Good On The Dancefloor,Arctic Monkeys,0.0,0.0,0.0,0.0,0.0,29EkMZmUNz1WsuzaMtVo1i,0.535,0.948,...,-4.19,0.0,0.0356,0.00225,0.0,0.376,0.778,103.183,173680.0,4.0
46659,Pray For Me,The Weeknd,0.0,0.0,0.0,0.0,0.0,6huNf4dutXRjJyGn7f5BPS,0.732,0.678,...,-4.977,1.0,0.0886,0.0867,2.4e-05,0.112,0.196,100.626,211421.0,4.0


In [39]:
# keeping only hits from that dataset
finalTo2020 = testFinal[testFinal['hitTF'] == 1]
finalTo2020.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
46669,Mine,Bazzi,56.0,0.0,56.0,1.0,1.0,7uzmGiiJyRfuViKKK3lVmR,0.71,0.789,...,-3.874,1.0,0.0722,0.0161,3e-06,0.451,0.717,142.929,131064.0,4.0
85978,Final Fantasy,Drake,56.0,0.0,56.0,1.0,1.0,44Du2IM1bGY7dicmLfXbUs,0.5,0.449,...,-10.977,1.0,0.442,0.422,8e-06,0.115,0.104,144.206,219960.0,1.0
85991,Hear Me Calling,Juice WRLD,56.0,0.0,56.0,1.0,1.0,13ZyrkCDmRz5xY3seuAWYk,0.699,0.687,...,-3.997,0.0,0.106,0.308,3.6e-05,0.121,0.499,88.932,189977.0,4.0
86059,Liar,Camila Cabello,56.0,0.0,56.0,1.0,1.0,7LzouaWGFCy4tkXDOOnEyM,0.74,0.498,...,-6.684,0.0,0.0456,0.0169,0.00282,0.319,0.652,98.016,207039.0,4.0


In [40]:
# removing the date col from the 2021 dataset as it is no longer needed
processed2021_nodate = test_merge_doubledrops[['song', 'performer', 'chart_position', 'previous_position',
       'peak', 'weeks_on_chart', 'hitTF', 'id', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature']]

In [41]:
# combining the two hits datasets together
fullHits = pd.concat([finalTo2020, processed2021_nodate])
fullHits.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
46669,Mine,Bazzi,56.0,0.0,56.0,1.0,1.0,7uzmGiiJyRfuViKKK3lVmR,0.71,0.789,...,-3.874,1.0,0.0722,0.0161,3e-06,0.451,0.717,142.929,131064.0,4.0
85978,Final Fantasy,Drake,56.0,0.0,56.0,1.0,1.0,44Du2IM1bGY7dicmLfXbUs,0.5,0.449,...,-10.977,1.0,0.442,0.422,8e-06,0.115,0.104,144.206,219960.0,1.0
85991,Hear Me Calling,Juice WRLD,56.0,0.0,56.0,1.0,1.0,13ZyrkCDmRz5xY3seuAWYk,0.699,0.687,...,-3.997,0.0,0.106,0.308,3.6e-05,0.121,0.499,88.932,189977.0,4.0
86059,Liar,Camila Cabello,56.0,0.0,56.0,1.0,1.0,7LzouaWGFCy4tkXDOOnEyM,0.74,0.498,...,-6.684,0.0,0.0456,0.0169,0.00282,0.319,0.652,98.016,207039.0,4.0


In [42]:
# resetting fullHits index
fullHits = fullHits.reset_index(drop=True)
fullHits.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
1,Mine,Bazzi,56.0,0.0,56.0,1.0,1.0,7uzmGiiJyRfuViKKK3lVmR,0.71,0.789,...,-3.874,1.0,0.0722,0.0161,3e-06,0.451,0.717,142.929,131064.0,4.0
2,Final Fantasy,Drake,56.0,0.0,56.0,1.0,1.0,44Du2IM1bGY7dicmLfXbUs,0.5,0.449,...,-10.977,1.0,0.442,0.422,8e-06,0.115,0.104,144.206,219960.0,1.0
3,Hear Me Calling,Juice WRLD,56.0,0.0,56.0,1.0,1.0,13ZyrkCDmRz5xY3seuAWYk,0.699,0.687,...,-3.997,0.0,0.106,0.308,3.6e-05,0.121,0.499,88.932,189977.0,4.0
4,Liar,Camila Cabello,56.0,0.0,56.0,1.0,1.0,7LzouaWGFCy4tkXDOOnEyM,0.74,0.498,...,-6.684,0.0,0.0456,0.0169,0.00282,0.319,0.652,98.016,207039.0,4.0


In [67]:
# saving the hits 2018 to 2021 (ie full) to a csv
fullHits.to_csv('../data/billboardAndFeatures2018to2021.csv', index=False)

In [43]:
# reading in tracks csv
justTracks = pd.read_csv('../data/tracks20102021.csv')
justTracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['The Toys'],['6lH5PpuiMa5SpfjoIOlwCS'],3/13/2020,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.269,0.129,7,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,3
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,['Joni Mitchell'],['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.644,0.212,11,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,3
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,['Joni Mitchell'],['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.627,0.184,1,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,4


In [45]:
# removing the excess brackets and quotes
justTracks.loc[:, 'artists'] = justTracks['artists'].apply(lambda x: x.strip("['']"))
justTracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,The Toys,['6lH5PpuiMa5SpfjoIOlwCS'],3/13/2020,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,Frank Sinatra,['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,Frank Sinatra,['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.269,0.129,7,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,3
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,Joni Mitchell,['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.644,0.212,11,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,3
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,Joni Mitchell,['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.627,0.184,1,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,4


In [46]:
# adding on placeholder cols
justTracks[['chart_position', 'previous_position', 'peak', 'weeks_on_chart', 'hitTF']] = 0
justTracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,chart_position,previous_position,peak,weeks_on_chart,hitTF
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,The Toys,['6lH5PpuiMa5SpfjoIOlwCS'],3/13/2020,0.671,0.867,...,0.0,0.139,0.839,120.689,4,0,0,0,0,0
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,Frank Sinatra,['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.319,0.201,...,0.0,0.904,0.239,117.153,3,0,0,0,0,0
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,Frank Sinatra,['1Mxqyy3pSjf8kZZL4QVxS0'],5/4/2018,0.269,0.129,...,5e-06,0.683,0.16,82.332,3,0,0,0,0,0
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,Joni Mitchell,['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.644,0.212,...,2.2e-05,0.798,0.441,117.072,3,0,0,0,0,0
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,Joni Mitchell,['5hW4L92KnC6dX9t7tYM4Ve'],10/30/2020,0.627,0.184,...,0.000162,0.0986,0.299,115.864,4,0,0,0,0,0


In [47]:
# renaming cols
justTracks.columns = ['id', 'song', 'popularity', 'duration_ms', 'explicit', 'performer',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'chart_position',
       'previous_position', 'peak', 'weeks_on_chart', 'hitTF']

In [48]:
# removing/rearranging cols
justTracks = justTracks[['song', 'performer', 'chart_position', 'previous_position',
       'peak', 'weeks_on_chart', 'hitTF', 'id', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature']]
justTracks.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,A Lover's Concerto,The Toys,0,0,0,0,0,6Pkt6qVikqPBt9bEQy8iTz,0.671,0.867,...,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,159560,4
1,The September Of My Years - Live At The Sands ...,Frank Sinatra,0,0,0,0,0,1hx7X9cMXHWJjknb9O6Ava,0.319,0.201,...,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,187333,3
2,It Was A Very Good Year - Live At The Sands Ho...,Frank Sinatra,0,0,0,0,0,19oquvXf3bc65GSqtPYA5S,0.269,0.129,...,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,236800,3
3,"The Circle Game - Live at The 2nd Fret, Philad...",Joni Mitchell,0,0,0,0,0,55qyghODi24yaDgKBI6lx0,0.644,0.212,...,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,313093,3
4,"Urge For Going - Live at The 2nd Fret, Philade...",Joni Mitchell,0,0,0,0,0,00xemFYjQNRpOlPhVaLAHa,0.627,0.184,...,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,295093,4


In [49]:
# changing default 0s in hitTFs to 1s if they match a song in the hits df
# casting a broad net to hit even songs with same name and artist, even if the spotify ID does not match exactly
hitSongTitles = [x for x in fullHits['song']] 
hitSongPerformers = [x for x in fullHits['performer']]
updatedhitTF = []
for x in range(0, len(justTracks)):
#     if the row matches a song and artist in the lists...
    if justTracks.iloc[x, 0] in hitSongTitles and justTracks.iloc[x, 1] in hitSongPerformers:
#     if justTracks.iloc[x, 7] in hitIDs:
#         ...append a 1
        updatedhitTF.append(1)       
    else:
        updatedhitTF.append(0)

justTracks['hitTF'] = updatedhitTF
justTracks.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,A Lover's Concerto,The Toys,0,0,0,0,0,6Pkt6qVikqPBt9bEQy8iTz,0.671,0.867,...,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,159560,4
1,The September Of My Years - Live At The Sands ...,Frank Sinatra,0,0,0,0,0,1hx7X9cMXHWJjknb9O6Ava,0.319,0.201,...,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,187333,3
2,It Was A Very Good Year - Live At The Sands Ho...,Frank Sinatra,0,0,0,0,0,19oquvXf3bc65GSqtPYA5S,0.269,0.129,...,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,236800,3
3,"The Circle Game - Live at The 2nd Fret, Philad...",Joni Mitchell,0,0,0,0,0,55qyghODi24yaDgKBI6lx0,0.644,0.212,...,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,313093,3
4,"Urge For Going - Live at The 2nd Fret, Philade...",Joni Mitchell,0,0,0,0,0,00xemFYjQNRpOlPhVaLAHa,0.627,0.184,...,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,295093,4


In [50]:
# checking to see if songs were actually caught by above function
justTracks['hitTF'].value_counts()

0    120612
1       814
Name: hitTF, dtype: int64

In [51]:
# subsetting tracks to get just non-hit songs
nohits = justTracks[justTracks['hitTF'] == 0]
nohits.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,A Lover's Concerto,The Toys,0,0,0,0,0,6Pkt6qVikqPBt9bEQy8iTz,0.671,0.867,...,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,159560,4
1,The September Of My Years - Live At The Sands ...,Frank Sinatra,0,0,0,0,0,1hx7X9cMXHWJjknb9O6Ava,0.319,0.201,...,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,187333,3
2,It Was A Very Good Year - Live At The Sands Ho...,Frank Sinatra,0,0,0,0,0,19oquvXf3bc65GSqtPYA5S,0.269,0.129,...,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,236800,3
3,"The Circle Game - Live at The 2nd Fret, Philad...",Joni Mitchell,0,0,0,0,0,55qyghODi24yaDgKBI6lx0,0.644,0.212,...,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,313093,3
4,"Urge For Going - Live at The 2nd Fret, Philade...",Joni Mitchell,0,0,0,0,0,00xemFYjQNRpOlPhVaLAHa,0.627,0.184,...,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,295093,4


In [66]:
# saving the non hits to a csv
nohits.to_csv('../data/nohits.csv', index=False)

In [53]:
# checking percentage of hit to non hit songs
# percentage of hits should be a bit larger - should take a subset of the non-hits
len(fullHits)/len(nohits) * 100

0.5322853447418168

In [54]:
# want at least 15% hits --> len(fullHits)/0.15 = num of non hits we should have
sampleNum = int(len(fullHits)/0.15)
noHitsForCombo = nohits.sample(n=sampleNum, random_state=1)

In [55]:
# resetting index for no hits subset
noHitsForCombo = noHitsForCombo.reset_index(drop=True)
noHitsForCombo.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Leve Com Você - Ao Vivo,Natiruts,0,0,0,0,0,1wmwhVbst9h3MgCP3ijMN7,0.51,0.776,...,-6.645,0,0.044,0.462,8e-06,0.75,0.508,116.92,203307,4
1,Remember The Youthful Years - Mohamed Ragab Remix,"Pizz@dox', 'Suncatcher",0,0,0,0,0,0VUp3l4R3WBKUFRCTPlvRj,0.497,0.991,...,-4.96,1,0.0558,0.000785,0.855,0.0491,0.595,139.995,500571,4
2,Si Supieras,Pedro Conga,0,0,0,0,0,5x9EdBaPYL2i974nkVImwJ,0.743,0.445,...,-12.516,0,0.041,0.484,1e-06,0.277,0.842,91.488,310933,4
3,Ta Gelia Ton Xamenon,Tzamal,0,0,0,0,0,6zlleBKGeX67kAh7gBDMXw,0.519,0.807,...,-9.072,1,0.362,0.63,0.0,0.122,0.888,81.122,207857,4
4,She's a Rainbow,Lola Marsh,0,0,0,0,0,7JMy73Hh4pNIVuudpUt9u0,0.246,0.693,...,-5.032,0,0.0346,0.00722,0.245,0.323,0.141,173.861,224200,3


In [56]:
# checking for nulls in both dfs before combination
fullHits.isnull().sum()

song                 0
performer            0
chart_position       0
previous_position    0
peak                 0
weeks_on_chart       0
hitTF                0
id                   0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [57]:
# checking for nulls in both dfs before combination
noHitsForCombo.isnull().sum()

song                 0
performer            0
chart_position       0
previous_position    0
peak                 0
weeks_on_chart       0
hitTF                0
id                   0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [58]:
# combining the hits and nonhits datasets
finalDataSet = pd.concat([fullHits, noHitsForCombo])
finalDataSet.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
1,Mine,Bazzi,56.0,0.0,56.0,1.0,1.0,7uzmGiiJyRfuViKKK3lVmR,0.71,0.789,...,-3.874,1.0,0.0722,0.0161,3e-06,0.451,0.717,142.929,131064.0,4.0
2,Final Fantasy,Drake,56.0,0.0,56.0,1.0,1.0,44Du2IM1bGY7dicmLfXbUs,0.5,0.449,...,-10.977,1.0,0.442,0.422,8e-06,0.115,0.104,144.206,219960.0,1.0
3,Hear Me Calling,Juice WRLD,56.0,0.0,56.0,1.0,1.0,13ZyrkCDmRz5xY3seuAWYk,0.699,0.687,...,-3.997,0.0,0.106,0.308,3.6e-05,0.121,0.499,88.932,189977.0,4.0
4,Liar,Camila Cabello,56.0,0.0,56.0,1.0,1.0,7LzouaWGFCy4tkXDOOnEyM,0.74,0.498,...,-6.684,0.0,0.0456,0.0169,0.00282,0.319,0.652,98.016,207039.0,4.0


In [59]:
finalDataSet.columns

Index(['song', 'performer', 'chart_position', 'previous_position', 'peak',
       'weeks_on_chart', 'hitTF', 'id', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

In [60]:
# making sure there are no null values
finalDataSet.isnull().sum()

song                 0
performer            0
chart_position       0
previous_position    0
peak                 0
weeks_on_chart       0
hitTF                0
id                   0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [61]:
finalDataSet.shape

(4922, 21)

In [65]:
# exporting final dataset to csv
finalDataSet.to_csv('../data/finalDataSet2018to2021.csv', index=False)