In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import timedelta, date
import time

In [2]:
def scrape_billboard_chart(url, chart_date):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # chart_date = soup.find_all("p", class_="c-tagline a-font-primary-medium-xs u-font-size-11@mobile-max u-letter-spacing-0106 u-letter-spacing-0089@mobile-max lrv-u-line-height-copy lrv-u-text-transform-uppercase lrv-u-margin-a-00 lrv-u-padding-l-075 lrv-u-padding-l-00@mobile-max")
    # chart_date = chart_date[0].text.strip()
    
    top1_song_name = soup.find_all("h3", id="title-of-a-story", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
    top1_artist_name = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet")
    
    song_names = soup.find_all("h3", id="title-of-a-story", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only")
    artist_names = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only")
    # print(song_names)
    # print(artist_names)
    
    chart_data = []
    for i in range(len(song_names)):
        chart_data.append({
            "song_name": song_names[i].text.strip(),
            "artist_name": artist_names[i].text.strip(),
            "chart_date": chart_date
        })
    
    chart_data.append({
        "song_name": top1_song_name[0].text.strip(),
        "artist_name": top1_artist_name[0].text.strip(),
        "chart_date": chart_date    
    })
    
    return chart_data

In [3]:
def scrape_billboard_charts(start_date, end_date):
    billboard_charts = []
    delta = timedelta(days=7)
    while start_date <= end_date:
        year = start_date.strftime("%Y")
        month = start_date.strftime("%m")
        day = start_date.strftime("%d")
        url = f"https://www.billboard.com/charts/hot-100/{year}-{month}-{day}"
        chart_date = f"{year}-{month}-{day}"
        print(chart_date)
        chart_data = scrape_billboard_chart(url, chart_date)
        billboard_charts.extend(chart_data)
        start_date += delta
    return pd.DataFrame(billboard_charts)

In [35]:
start_date = date(2022, 5, 4) # first Saturday of 2010
end_date = date(2022, 11, 30) # current date

In [36]:
billboard_df = scrape_billboard_charts(start_date, end_date)
billboard_df = billboard_df.drop_duplicates(['song_name', 'artist_name']).reset_index(drop=True)

2022-05-04
2022-05-11
2022-05-18
2022-05-25
2022-06-01
2022-06-08
2022-06-15
2022-06-22
2022-06-29
2022-07-06
2022-07-13
2022-07-20
2022-07-27
2022-08-03
2022-08-10
2022-08-17
2022-08-24
2022-08-31
2022-09-07
2022-09-14
2022-09-21
2022-09-28
2022-10-05
2022-10-12
2022-10-19
2022-10-26
2022-11-02
2022-11-09
2022-11-16
2022-11-23
2022-11-30


In [37]:
print(len(billboard_df))
print(billboard_df.head())

498
     song_name                    artist_name  chart_date
0  First Class                    Jack Harlow  2022-05-04
1   Heat Waves                  Glass Animals  2022-05-04
2   Big Energy                          Latto  2022-05-04
3        Enemy          Imagine Dragons X JID  2022-05-04
4         Stay  The Kid LAROI & Justin Bieber  2022-05-04


In [38]:
# azlyric version

def scrape_lyrics(song_name, artist_name):
    
    base_url = 'https://www.azlyrics.com/lyrics/'
    
    if artist_name == "The Weekend":
        artist_name = 'weeknd'
        
    if artist_name == "The Kid Laroi":
        artist_name = 'kidlaroi'
        
    if artist_name == "Tyler, the Creator":
        artist_name = 'tylerthe creator'
    
    artist_name=artist_name.split(" Featuring ")[0]
    artist_name=artist_name.split(" X ")[0]
    artist_name=artist_name.split(" x ")[0]
    artist_name=artist_name.split(", ")[0]
    artist_name=artist_name.split(" & ")[0]
    artist_name=artist_name.split(" + ")[0]
    artist_name=artist_name.replace("-", "")
    artist_name=artist_name.replace("!", "")
    artist_name=artist_name.replace(".", "")
    
    song_name = song_name.replace("'", "")
    song_name = song_name.replace("-", "")
    song_name = song_name.replace(".", "")
    song_name = song_name.replace("!", "")
    song_name = song_name.replace("&", "")
    song_name = song_name.replace("?", "") 
    
    artist_name = artist_name.lower().replace(" ", "")
    song_name = song_name.lower().replace(" ", "")
    
    url = f'{base_url}{artist_name}/{song_name}.html'

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    lyrics_div = soup.find_all("div", attrs={"class": None, "id": None})
    lyrics = [lyric.getText() for lyric in lyrics_div]
    lyrics = " ".join(lyrics)
    time.sleep(5)
    
    return lyrics

In [39]:
all_lyric = []
for i in range(len(billboard_df)):
    try:
        all_lyric.append(scrape_lyrics(billboard_df['song_name'][i], billboard_df['artist_name'][i])) 
        
        if scrape_lyrics(billboard_df['song_name'][i], billboard_df['artist_name'][i]) == '': 
            print(billboard_df['artist_name'][i], billboard_df['song_name'][i], "-failed to find song-", i)
    except:
        if scrape_lyrics(billboard_df['song_name'][i], billboard_df['artist_name'][i]) != '':
            all_lyric.append('')
        print(billboard_df['artist_name'][i], billboard_df['song_name'][i], "-failed to load-")

Latto Big Energy -failed to find song- 2
The Kid LAROI & Justin Bieber Stay -failed to find song- 4
Elton John & Dua Lipa Cold Heart (PNAU Remix) -failed to find song- 12
The Kid LAROI Thousand Miles -failed to find song- 13
The Weeknd & Ariana Grande Save Your Tears -failed to find song- 14
Lil Nas X & Jack Harlow Industry Baby -failed to find song- 20
SZA I Hate U -failed to find song- 37
Carolina Gaitan, Mauro Castillo, Adassa, Rhenzy Feliz, Diane Guerrero, Stephanie Beatriz & Encanto Cast We Don't Talk About Bruno -failed to find song- 38
Cole Swindell / Lainey Wilson Never Say Never -failed to find song- 39
Southside, Travis Scott & Future Hold That Heat -failed to find song- 55
Doja Cat Get Into It (Yuh) -failed to find song- 71
Jessica Darrow Surface Pressure -failed to find song- 73
Maren Morris Circles Around This Town -failed to find song- 80
The Walters I Love You So -failed to find song- 84
The Weeknd Out Of Time -failed to find song- 90
Gunna Banking On Me -failed to find 

In [40]:
len(all_lyric)


498

In [41]:
billboard_df['lyrics'] = all_lyric

In [42]:
%pip install openpyxl
import openpyxl
billboard_df.to_excel('billboard_lyrics2022.xlsx', index=False)

Note: you may need to restart the kernel to use updated packages.
