# Spotify Charts Web Scraper

This code scrapes the Spotify Charts website, gets the necessary data from the Top 200 list (songs, artists, listen counts, song IDs, and ranks in each country at each date), and creates a separate data file for each country for which the data is available. Based on: https://gist.github.com/hktosun/d4f98488cb8f005214acd12296506f48 and https://medium.com/the-innovation/how-to-scrape-the-most-popular-songs-on-spotify-using-python-8a8979fa6b06.

In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta, date
from tqdm.notebook import tqdm # added for progress bars
import time

It generates a list of dates between Jan 1, 2017 and today in YYYY-MM-DD format.

In [2]:
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

It creates the list of page links we will get the data from.

In [3]:
base_url = 'https://spotifycharts.com/regional/'

def create_links(country):
    start_date = date(2017, 1, 6) # adjusting the start date
    end_date = datetime.today().date()
    links = []
    dates = list(daterange(start_date, end_date))[::7] # adjusted for weekly rather than daily
    for start, end in zip(dates, dates[1:]):
        links.append(base_url + country + '/weekly/' + start.strftime('%Y-%m-%d') + '--' + end.strftime('%Y-%m-%d')) # add end date to create interval
    return(links, dates)

It reads the webpage.

In [4]:
def get_webpage(link):
    start = time.time()
    page = requests.get(link)
    soup = bs(page.content, 'html.parser')
    return(soup)

In [5]:
# Update: 2021-05-2: unfortunately, Spotify seem to have revoked permission for querying the source code of their webpage,
# or at least through the "requests" library. Moreover, Spotify also have changed the URL, whereas previously
# the country was indicated by its full name, it's now indicated by the Alpha-2 country codes.

requests.get("https://spotifycharts.com/regional/de/weekly/2017-01-06--2017-01-13")

<Response [403]>

It collects the data for each country, and write them in a list. The entries are (in order): Song, Artist, Date, Song ID, Play Count, Rank.

In [6]:
def get_data(country):
    [links, dates] = create_links(country);
    rows = []
    for (link, date) in zip(links, tqdm(dates)): # zip added to generate series of tuples
        start = time.time()
        soup = get_webpage(link)
        entries = soup.find_all('td', class_ = 'chart-table-track')
        streams = soup.find_all('td', class_ = 'chart-table-streams')
        url = soup.find_all('td', class_ = 'chart-table-image') # add url to get song ID
        for i, (entry, stream, url) in enumerate(zip(entries, streams, url)): # add url
            song = entry.find('strong').get_text()
            artist = entry.find('span').get_text()[3:]
            songid = url.find('a').get('href') # get url 
            # songid = songid.split('track/')[1] # split url and extract ID
            play_count = stream.get_text()
            rows.append([song, artist, date, songid, play_count, i + 1])

    return(rows)

Due to same connection error the scraping had to start over a couple of times. To not start from stratch a list was build containing the concluded countries. Names have to be transformed to ISO-2 codes.

In [7]:
# due to connection issues, the scraping had to start several times over. this tuple indicted the scraped countries
# such that starting from scratch was not neccessary.

done = [] # insert country names when done

import country_converter as coco
iso2_codes = coco.convert(names = done, to = 'ISO2', not_found = None)
iso2_codes_lower = [x.lower() for x in iso2_codes]

# Spotify offers the global charts as well which are obviously not found by the country_converted. warning can be ignored.

It exports the data for each country in a csv format. The column names are Song, Artist, Song ID, Date, Streams, Rank.

In [8]:
file_path = 'data/1_charts_per_country/'

def save_data(country):
    if not os.path.exists('data'):
        os.makedirs('data')
    file_name = file_path + country[1].replace(' ', '_').lower() + '.csv'
    if country[0] not in iso2_codes_lower:
        print(country[0])
        data = get_data(country[0])
        if(len(data) != 0):
            data = pd.DataFrame(data, columns = ['Song', 'Artist', 'Date', 'Song ID', 'Streams', 'Rank']) # add 'Song ID'
            data.to_csv(file_name, sep = ',', float_format = '%s', index = False)

It generates a list of countries for which the data is provided.

In [9]:
def get_countries():
    page = requests.get(base_url)
    soup = bs(page.content, 'html.parser')
    countries = []
    ctys = soup.find('ul').findAll('li')
    for cty in ctys:
        countries.append([cty["data-value"], cty.get_text()])
    return(countries)

It runs the function save_data for each country. In other words, it creates the .csv data files for each country.

In [10]:
def scrape_data():
    countries = get_countries()
    for country in tqdm(countries):
        save_data(country)

scrape_data()

AttributeError: 'NoneType' object has no attribute 'findAll'