In [6]:
import requests 
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
from IPython.display import clear_output
import time

def get_artist(decade, soup, artist_list):
    """Collect the artists listed in a parsed tag page.

    Walks every ``<li>`` inside every ``<ol>`` of ``soup`` and, for each item
    that contains an ``<a class="link-block-target">``, appends
    ``[decade, artist_name]`` to ``artist_list`` unless that exact row is
    already present.

    Parameters
    ----------
    decade : the decade label stored in the first column of each row.
    soup : a parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``).
    artist_list : list of ``[decade, artist]`` rows; mutated in place.

    Returns
    -------
    None. Results are delivered through ``artist_list``.
    """
    for o in soup.find_all("ol"):
        for l in o.find_all("li"):
            if (a := l.find("a", class_="link-block-target")):
                row = [decade, a.getText()]
                # artist_list stores [decade, artist] rows, so the duplicate
                # check has to compare rows; a bare artist name never equals
                # a row and would let every duplicate through.
                if row not in artist_list:
                    artist_list.append(row)

# Range of decade tags to scrape (inclusive) and how many result pages to
# request for each of them.
begin_year = 1900
end_year = 2020
num_pages = 10

# Accumulate every [decade, artist] row in memory (first row is the header)
# so the website only has to be scraped once.
artist_list = [["Decade", "Artist"]]
for decade in range(begin_year, end_year+10, 10):
    print(f"Decade: {decade}")
    # For each page for each decade
    for page in range(1, num_pages+1):
        # The first page is addressed without a ?page= query parameter
        if (page == 1):
            url = f"https://www.last.fm/tag/{decade}s/artists"
        else:
            url = f"https://www.last.fm/tag/{decade}s/artists?page={page}"

        # Get content
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            request = requests.get(url, timeout=30)
            # requests does not raise for 4xx/5xx responses on its own; without
            # this call a missing page would be parsed like a normal one.
            request.raise_for_status()
        except requests.RequestException as err:
            # If page exceeds the number of pages available (or the request
            # failed for another reason) - stop paging through this decade
            print(f"Page Not Found: {url} ({err})")
            break

        # Parsing happens outside the try block so that real defects (for
        # example a missing parser) surface instead of being reported as a
        # missing page.
        soup = BeautifulSoup(request.text, 'lxml')
        get_artist(decade, soup, artist_list)

        # Wait 3 seconds as to not overwhelm the website
        time.sleep(3)
    clear_output(wait=True)


Decade: 2020


In [16]:
import csv 

# Write data to a file
# No need to scrape repeatedly
# - "utf-8" spells the codec name out in full (the original read "utf-").
# - newline="" is what the csv module asks for on files passed to csv.writer;
#   without it some platforms write an extra blank line after every row.
# - The with-statement closes the file, so no explicit close() is needed.
with open('data/artists.csv', "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(artist_list)

In [19]:
import os
import re
import time

import numpy as np
import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from IPython.display import clear_output
from spotipy.oauth2 import SpotifyClientCredentials

# Read the artist table back from the CSV file on disk.
artists = pd.read_csv(filepath_or_buffer="data/artists.csv")
# A bare trailing expression makes the notebook render the frame as the cell output.
artists

Unnamed: 0,Decade,Artist
0,1900,Luisa Tetrazzini
1,1900,Victor Herbert
2,1900,Walter Van Brunt
3,1900,Jelly Roll Morton
4,1900,Scott Joplin
...,...,...
2366,2020,Antony Milton
2367,2020,Trace Mountains
2368,2020,Adult Mom
2369,2020,Surf Curse


In [61]:
# Settings to use Spotify API
# Credentials are read from the environment (the variable names spotipy
# documents for client-credentials auth) instead of being typed into the
# notebook, so secrets never end up in a saved or shared copy.
client_id = os.environ.get("SPOTIPY_CLIENT_ID", "")
client_secret_id = os.environ.get("SPOTIPY_CLIENT_SECRET", "")
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id, client_secret_id))

# Output columns, in the order they appear in the CSV
keys = ["id", "name", "decade", "acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "key",
       "liveness", "loudness", "mode", "release_date", "speechiness", "tempo", "time_signature", "valence", "date_match"]

# Columns filled in by this script; every other column is copied from the
# audio-features payload returned by Spotify.
derived_keys = {"id", "name", "decade", "release_date", "date_match"}
feature_keys = [key for key in keys if key not in derived_keys]

# Initialize dictionary with empty list
song_data = {key:[] for key in keys}

# (artist, reason) pairs for artists that had to be skipped, so one bad
# lookup neither aborts the run nor goes unnoticed.
failed_artists = []

num_artists = len(artists)
for i in range(num_artists):
    name = artists["Artist"][i]
    decade = artists["Decade"][i]

    # Print progress
    percent = int(i/num_artists*100)+1
    load = percent * '*'
    print(f"Finished: [{load.ljust(100)}] {percent}%")

    if not isinstance(name, str):
        # pd.read_csv turns cells such as "NA" or "null" into NaN (a float),
        # which cannot be concatenated into the search query below.
        failed_artists.append((name, "artist name is not a string"))
        clear_output(wait=True)
        continue

    try:
        # Get spotify results for artist
        results = spotify.search(q="artist:" + name, type="artist")
        items = results["artists"]["items"]
        if (len(items) > 0):
            # Get uri of the artists and the top tracks
            uri = items[0]["uri"]
            results = spotify.artist_top_tracks(uri)["tracks"]
            # For each track get audio features
            for track in results[:10]:
                ID = track["id"]
                features = spotify.audio_features(ID)[0]
                if features is None:
                    continue

                # Top-track objects normally already carry their album, so the
                # extra per-track lookup is only made when that is missing.
                date = (track.get("album") or {}).get("release_date")
                if not date:
                    meta = spotify.track(ID)
                    if meta is None:
                        continue
                    date = meta["album"]["release_date"]

                # Check if the release date matches to the decade
                year = int(date.split("-")[0])

                # Assemble the complete row first and append it in one go, so
                # every column list always has the same length even when a
                # feature is absent from the payload (it is stored as None).
                row = {key: features.get(key) for key in feature_keys}
                row["id"] = ID
                row["name"] = track["name"]
                row["decade"] = decade
                row["release_date"] = date
                row["date_match"] = 1 if (year - (year % 10)) == decade else 0
                for key in keys:
                    song_data[key].append(row[key])
    except (spotipy.SpotifyException, requests.RequestException) as err:
        # Keep what was collected so far and move on to the next artist.
        failed_artists.append((name, str(err)))

    clear_output(wait=True)

df = pd.DataFrame(data=song_data)
df.to_csv('data/music_data.csv', index=False)

if failed_artists:
    print(f"Skipped {len(failed_artists)} artist(s); inspect failed_artists for details")

Finished: [****************************************************************************************************] 100%
