In [1]:
import pandas as pd

In [2]:
# !pip3 install billboard.py
import billboard

# Database

This is the first step to get the database. Through the Billboard API all top artists and songs of all genres will be retrieved, in order to create a initial database and define the parameters and features to classify an artist into famous or not.

# Functions

### Clean Artist's names 

In [309]:
def clean_artist_name(df:pd.DataFrame, chart_col:str, track_col:str='None')-> pd.DataFrame:
    '''
    This function cleans the artist's name column of a dataframe and 
    returns a new dataframe containing all the other (featuring / collaboration) artists' names. 
    It also drops duplicated aritsts' names.
    
    it needs the input dataframe chart's and track's column names.

    example:
    - chart_col = 'chart'
    - track_col = 'track'
    '''
    
    import re
    
    other_artists = []
    charts = []
    tracks = []
    
    df = df.reset_index(drop=True)

    for i in range(df.shape[0]):
        string = df.loc[i,'artist']
        pattern ='(?:Feat(?:\. |uring )|, |\/|\+|&| [Xx] | \(|\)| [Ww]ith | And | y )'
        artists = re.split(pattern, string)
        df.loc[i, 'artist'] = artists[0]
        
        for j in range(1, len(artists)):
            if artists[j] != '':
                
                if track_col == 'None':
                    other_artists.append(artists[j])
                    charts.append(df.loc[i, chart_col])
                else:
                    other_artists.append(artists[j])
                    tracks.append(df.loc[i, track_col])
                    charts.append(df.loc[i, chart_col])
            else:
                pass
 
    if track_col == 'None':
        other_artists_df = pd.DataFrame({'artist': other_artists,
                                         'chart': charts})
        final_df = pd.concat([df, other_artists_df], axis=0).drop_duplicates(subset='artist').reset_index(drop=True)
    else: 
        other_artists_df = pd.DataFrame({'track': tracks,
                                         'artist': other_artists,
                                         'chart': charts})
        final_df = pd.concat([df, other_artists_df], axis=0).drop_duplicates(subset='track').reset_index(drop=True)    


    return final_df

# Billboard API

The first step is to create a database containing the top ranked billboard artists and songs, in order to find features for each one of them (through the Spotify API).

## Data Gathering

A chart entry (typically a single track) is of type ChartEntry. Each of these track instances have the following attributes:

- `title` – The title of the track.
- `artist` – The name of the artist, as formatted on Billboard.com.
- `image` – The URL of the image for the track.
- `peakPos` – The track's peak position on the chart as of the chart date, as an int (or None if the chart does not include this information).
- `lastPos` – The track's position on the previous week's chart, as an int (or None if the chart does not include this information). This value is 0 if the track was not on the previous week's chart.
- `weeks` – The number of weeks the track has been or was on the chart, including future dates (up until the present time).
- `rank` – The track's current position on the chart.
- `isNew` – Whether the track is new to the chart.

**For this dataset only 4 of them will be used:** 
*'title', 'artist', 'image', 'weeks'*

In [3]:
chart_types = billboard.charts()
chart_types

['hot-100',
 'billboard-200',
 'artist-100',
 'social-50',
 'streaming-songs',
 'radio-songs',
 'digital-song-sales',
 'on-demand-songs',
 'top-album-sales',
 'current-albums',
 'catalog-albums',
 'independent-albums',
 'soundtracks',
 'vinyl-albums',
 'greatest-billboard-200-albums',
 'greatest-billboard-200-artists',
 'greatest-hot-100-singles',
 'greatest-hot-100-artists',
 'greatest-hot-100-songs-by-women',
 'greatest-hot-100-women-artists',
 'greatest-billboard-200-albums-by-women',
 'greatest-billboard-200-women-artists',
 'greatest-billboards-top-songs-80s',
 'greatest-billboards-top-songs-90s',
 'greatest-of-all-time-pop-songs',
 'greatest-of-all-time-pop-songs-artists',
 'greatest-adult-pop-songs',
 'greatest-adult-pop-artists',
 'greatest-country-songs',
 'greatest-country-albums',
 'greatest-country-artists',
 'greatest-hot-latin-songs',
 'greatest-hot-latin-songs-artists',
 'greatest-top-dance-club-artists',
 'greatest-r-b-hip-hop-songs',
 'greatest-r-b-hip-hop-albums',
 'g

In [226]:
# Creating a dataset with all the artists from all charts from billboard
df=pd.DataFrame()
title_list=[]
artist_list=[]
image_list=[]
weeks_list=[]
chart_name=[]

for chart_type in chart_types:
    chart = billboard.ChartData(chart_type)
    for song in chart:
        title_list.append(song.title)
        artist_list.append(song.artist)
        image_list.append(song.image)
        weeks_list.append(song.weeks)
        chart_name.append(chart.name)
        
df = pd.DataFrame({'title':title_list, 
                   'artist':artist_list, 
                   'image':image_list,
                   'weeks':weeks_list,
                   'chart':chart_name})

## Data Cleaning

In [228]:
df.shape

(35621020, 5)

In [229]:
df.head()

Unnamed: 0,title,artist,image,weeks,chart
0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
1,Savage,Megan Thee Stallion Featuring Beyonce,,8.0,hot-100
0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
1,Savage,Megan Thee Stallion Featuring Beyonce,,8.0,hot-100


In [234]:
df=df.drop_duplicates()
df.isna().sum()

In [236]:
df.shape

(8440, 5)

In [238]:
df.to_csv('billboard_artists.csv')

In [3]:
bb = pd.read_csv('data/raw/billboard_artists.csv')

In [4]:
bb.head()

Unnamed: 0.1,Unnamed: 0,title,artist,image,weeks,chart
0,0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
1,1,Savage,Megan Thee Stallion Featuring Beyonce,,8.0,hot-100
2,2,Blinding Lights,The Weeknd,,23.0,hot-100
3,3,Toosie Slide,Drake,,5.0,hot-100
4,4,The Box,Roddy Ricch,,22.0,hot-100


#### Artists' Data base

In [10]:
# Separating into 2 datasets: TOP ARTISTS LIST & TOP SONGS

In [11]:
bb_artist = bb.copy()

In [13]:
bb_artist=bb_artist.drop_duplicates(subset='artist')

In [14]:
bb_artist

Unnamed: 0,title,artist,image,weeks,chart
0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
1,Savage,Megan Thee Stallion Featuring Beyonce,,8.0,hot-100
2,Blinding Lights,The Weeknd,,23.0,hot-100
3,Toosie Slide,Drake,,5.0,hot-100
4,The Box,Roddy Ricch,,22.0,hot-100
...,...,...,...,...,...
8426,Bella Ciao,Manu Pilas,https://www.billboard.com/assets/1588965272/im...,5.0,world-digital-song-sales
8433,Nominate,Stonebwoy & Keri Hilson,https://charts-static.billboard.com/img/1840/1...,1.0,world-digital-song-sales
8434,Du Hast,Rammstein,https://charts-static.billboard.com/img/2015/1...,215.0,world-digital-song-sales
8435,Oh Nanana,dj 6rb & Bonde R300 Featuring XANG & Mayklove,https://www.billboard.com/assets/1588965272/im...,9.0,world-digital-song-sales


In [16]:
bb_artist.isna().sum()

title      281
artist       0
image      204
weeks     1162
chart        0
dtype: int64

In [18]:
bb_artist=bb_artist.drop(columns=['weeks','image','title'])
bb_artist

Unnamed: 0,artist,chart
0,Doja Cat Featuring Nicki Minaj,hot-100
1,Megan Thee Stallion Featuring Beyonce,hot-100
2,The Weeknd,hot-100
3,Drake,hot-100
4,Roddy Ricch,hot-100
...,...,...
8426,Manu Pilas,world-digital-song-sales
8433,Stonebwoy & Keri Hilson,world-digital-song-sales
8434,Rammstein,world-digital-song-sales
8435,dj 6rb & Bonde R300 Featuring XANG & Mayklove,world-digital-song-sales


In [29]:
bb_artist=bb_artist.reset_index(drop=True)

In [38]:
bb_artist.to_csv('data/processed/bb_artists_unique.csv', index=False)

In [208]:
bb_artist = pd.read_csv('data/processed/bb_artists_unique.csv')

In [109]:
bb_artist.shape, bb_artist.head(10)

((2607, 2),
                                   artist    chart
 0         Doja Cat Featuring Nicki Minaj  hot-100
 1  Megan Thee Stallion Featuring Beyonce  hot-100
 2                             The Weeknd  hot-100
 3                                  Drake  hot-100
 4                            Roddy Ricch  hot-100
 5                               Dua Lipa  hot-100
 6          Drake Featuring Playboi Carti  hot-100
 7                            Post Malone  hot-100
 8           DaBaby Featuring Roddy Ricch  hot-100
 9          Justin Bieber Featuring Quavo  hot-100)

In [None]:
# clean dataset with clean_artist_name function 

In [315]:
bb_artist_clean=clean_artist_name(bb_artist, 'chart')

In [316]:
bb_artist_clean.shape

(3021, 2)

In [317]:
bb_artist_clean.head()

Unnamed: 0,artist,chart
0,Doja Cat,hot-100
1,Megan Thee Stallion,hot-100
2,The Weeknd,hot-100
3,Drake,hot-100
4,Roddy Ricch,hot-100


In [318]:
bb_artist_clean.to_csv('data/processed/bb_artists_unique_clean.csv')

#### Songs dataset

In [33]:
# The Top songs dataset

In [211]:
bb_songs = bb.copy()

In [212]:
bb_songs=bb_songs.drop_duplicates(subset='title')

In [213]:
bb_songs.head()

Unnamed: 0,title,artist,image,weeks,chart
0,Say So,Doja Cat Featuring Nicki Minaj,,18.0,hot-100
1,Savage,Megan Thee Stallion Featuring Beyonce,,8.0,hot-100
2,Blinding Lights,The Weeknd,,23.0,hot-100
3,Toosie Slide,Drake,,5.0,hot-100
4,The Box,Roddy Ricch,,22.0,hot-100


In [214]:
bb_songs.isna().sum()

title        1
artist       0
image      292
weeks     1963
chart        0
dtype: int64

In [215]:
bb_songs=bb_songs.drop(columns=['image','weeks'])

In [216]:
bb_songs.isna().sum()

title     1
artist    0
chart     0
dtype: int64

In [217]:
bb_songs.loc[bb_songs.title.isna(),:]

Unnamed: 0,title,artist,chart
300,,Kenny Chesney,artist-100


In [218]:
bb_songs=bb_songs.dropna().reset_index(drop=True)

In [219]:
bb_songs.head(), bb_songs.shape

(             title                                 artist    chart
 0           Say So         Doja Cat Featuring Nicki Minaj  hot-100
 1           Savage  Megan Thee Stallion Featuring Beyonce  hot-100
 2  Blinding Lights                             The Weeknd  hot-100
 3     Toosie Slide                                  Drake  hot-100
 4          The Box                            Roddy Ricch  hot-100,
 (4082, 3))

In [220]:
bb_songs.columns = ['track','artist','chart']

In [221]:
bb_songs.head()

Unnamed: 0,track,artist,chart
0,Say So,Doja Cat Featuring Nicki Minaj,hot-100
1,Savage,Megan Thee Stallion Featuring Beyonce,hot-100
2,Blinding Lights,The Weeknd,hot-100
3,Toosie Slide,Drake,hot-100
4,The Box,Roddy Ricch,hot-100


In [139]:
bb_songs.to_csv('data/processed/bb_songs_unique.csv')

In [319]:
bb_songs = pd.read_csv('data/processed/bb_songs_unique.csv').drop(columns='Unnamed: 0')

In [321]:
bb_songs_unique_clean = clean_artist_name(bb_songs, 'chart', 'track')

In [322]:
bb_songs_unique_clean.to_csv('data/processed/bb_songs_unique_clean.csv')