# Install External Packages

In [1]:
!pip install cloudscraper

Collecting cloudscraper
  Downloading cloudscraper-1.2.60-py2.py3-none-any.whl (97 kB)
[?25l[K     |███▍                            | 10 kB 22.2 MB/s eta 0:00:01[K     |██████▊                         | 20 kB 13.0 MB/s eta 0:00:01[K     |██████████▏                     | 30 kB 11.1 MB/s eta 0:00:01[K     |█████████████▌                  | 40 kB 10.9 MB/s eta 0:00:01[K     |████████████████▉               | 51 kB 10.8 MB/s eta 0:00:01[K     |████████████████████▎           | 61 kB 12.4 MB/s eta 0:00:01[K     |███████████████████████▋        | 71 kB 11.7 MB/s eta 0:00:01[K     |███████████████████████████     | 81 kB 10.6 MB/s eta 0:00:01[K     |██████████████████████████████▍ | 92 kB 11.6 MB/s eta 0:00:01[K     |████████████████████████████████| 97 kB 2.1 MB/s 
Collecting requests-toolbelt>=0.9.1
  Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 923 kB/s 
Installing collected packages: requests-toolbe

# Imports

In [7]:

import cloudscraper
import pandas as pd
import os
from bs4 import BeautifulSoup as bs
from datetime import timedelta, date, datetime

# Scraper

In [10]:
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Nothing is yielded when ``end_date`` is not after ``start_date``.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

def get_weekly_date_pairs(start_date, end_date):
    """Return consecutive ``(week_start, week_end)`` pairs spaced seven days apart.

    Week boundaries begin at ``start_date`` and advance one week at a time for
    as long as they do not pass ``end_date``; a boundary equal to ``end_date``
    is kept. An empty list is returned when fewer than two boundaries fit.
    """
    one_week = timedelta(days=7)

    # start_date is always a boundary, even when it already lies past end_date.
    boundaries = [start_date]
    while boundaries[-1] + one_week <= end_date:
        boundaries.append(boundaries[-1] + one_week)

    return [(boundaries[i], boundaries[i + 1]) for i in range(len(boundaries) - 1)]

def create_links(country, start_date, end_date):
    """Build the weekly chart URL for every week between two dates.

    Returns a ``(links, week_start_dates)`` tuple of two parallel lists: the
    spotifycharts.com weekly URL for ``country`` and the date on which each
    of those weeks starts.
    """
    date_format = "%Y-%m-%d"
    week_pairs = get_weekly_date_pairs(start_date, end_date)

    week_start_dates = [week_start for week_start, _ in week_pairs]
    links = [
        'https://spotifycharts.com/regional/' + country + '/weekly/'
        + week_start.strftime(date_format) + '--' + week_end.strftime(date_format)
        for week_start, week_end in week_pairs
    ]
    return (links, week_start_dates)

def get_webpage(scraper, link):
    """Fetch ``link`` with ``scraper`` and return the response parsed as HTML.

    The raw response bytes are handed to BeautifulSoup's ``html.parser`` with
    UTF-8 given as the source encoding.
    """
    response = scraper.get(link)
    return bs(response.content, 'html.parser', from_encoding="utf-8")

def get_data(scraper, country, start_date, end_date):
    """Collect the first-listed chart entry of every week for one region.

    ``country`` is indexed as ``country[0]`` (the code placed in the chart URL)
    and ``country[1]`` (the label stored in each row). Returns a list of
    ``[song, song_id, artist, play_count, week_start_date, region]`` rows;
    weeks whose page lacks any of the expected table cells are skipped.
    """
    links, week_start_dates = create_links(country[0], start_date, end_date)
    top_song = []

    for link, week_start_date in zip(links, week_start_dates):
        soup = get_webpage(scraper, link)
        track_cells = soup.find_all("td", class_="chart-table-track")
        stream_cells = soup.find_all("td", class_="chart-table-streams")
        image_cells = soup.find_all("td", class_="chart-table-image")

        # Skip weeks for which the page carries no chart table cells.
        if not (track_cells and stream_cells and image_cells):
            continue

        first_track = track_cells[0]
        song = first_track.find('strong').get_text()
        song_id = image_cells[0].find("a").get("href").split("track/")[1]
        # Drops the first three characters of the artist text
        # (presumably a "by " prefix - confirm against the page markup).
        artist = first_track.find('span').get_text()[3:]
        play_count = stream_cells[0].get_text()
        top_song.append([song, song_id, artist, play_count, week_start_date, country[1]])

    return top_song

def get_countries(scraper):
    """Return the regions listed on the Spotify Charts landing page.

    Each entry is a ``[code, label]`` pair read from an ``<li>`` of the first
    ``<ul>`` on the page: ``code`` is the item's ``data-value`` attribute and
    ``label`` is its text. The 'Global' item is left out.
    """
    page = scraper.get('https://spotifycharts.com/regional')
    soup = bs(page.content, 'html.parser')
    countries = []
    for cty in soup.find('ul').findAll("li"):
        label = cty.get_text()
        if label != 'Global':
            # Append only the [code, label] pair. A bare label string in this
            # list would be indexed by get_data as country[0] / country[1],
            # i.e. its first two characters, causing a full run of requests
            # for a bogus one-letter region code.
            countries.append([cty["data-value"], label])
    return countries

def scrape_data(start_date, end_date, file_name = '',  countries = None):
    """Scrape the weekly top song for each region and write the rows to a CSV.

    Args:
        start_date: First week boundary to scrape.
        end_date: Date past which no further week boundaries are generated.
        file_name: Path of the CSV file to write.
        countries: Optional list of ``[code, label]`` pairs. When ``None``,
            the list is fetched with ``get_countries``.
    """
    columns = ['Song', 'Song Id', 'Artist', 'Streams', 'Week', 'Region']
    scraper = cloudscraper.create_scraper()
    if countries is None:
        countries = get_countries(scraper)

    # Accumulate plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for country in countries:
        country_data = get_data(scraper, country, start_date, end_date)
        if len(country_data) != 0:
            print(f'Extracted data for region: {country_data[0][5]}')
            rows.extend(data_row[:6] for data_row in country_data)

    charts_data = pd.DataFrame(rows, columns=columns)
    charts_data.to_csv(file_name, sep=',', float_format='%s', index=False)

# Fetch and Save Data

In [11]:
# Date range of weekly charts to scrape.
start_date = date(2017, 1, 6)
end_date = date(2022, 4, 8)

# No countries argument: the region list is read from the site.
scrape_data(start_date, end_date, 'spotify_charts_data_all_regions.csv')
# Every region must be a [code, label] pair. A bare 'Global' string in this
# list would be scraped as a second region with code 'G' (country[0] of the
# string), issuing one useless request per week.
scrape_data(start_date, end_date, 'spotify_charts_data_global.csv', [['global', 'Global']])

Extracted data for region: United States
Extracted data for region: United Kingdom
Extracted data for region: United Arab Emirates
Extracted data for region: Argentina
Extracted data for region: Austria
Extracted data for region: Australia
Extracted data for region: Belgium
Extracted data for region: Bulgaria
Extracted data for region: Bolivia
Extracted data for region: Brazil
Extracted data for region: Belarus
Extracted data for region: Canada
Extracted data for region: Switzerland
Extracted data for region: Chile
Extracted data for region: Colombia
Extracted data for region: Costa Rica
Extracted data for region: Cyprus
Extracted data for region: Czech Republic
Extracted data for region: Germany
Extracted data for region: Denmark
Extracted data for region: Dominican Republic
Extracted data for region: Ecuador
Extracted data for region: Estonia
Extracted data for region: Egypt
Extracted data for region: Spain
Extracted data for region: Finland
Extracted data for region: France
Extracte

# Data Pre-Processing


In [19]:
region_data = pd.read_csv('spotify_charts_data_all_regions.csv')
# Strip the thousands separators before converting to numbers. The cleaned
# column is assigned back: calling replace(inplace=True) on a column selection
# is chained assignment, which does not update the frame under pandas
# Copy-on-Write and would leave the commas in place for to_numeric.
region_data['Streams'] = pd.to_numeric(region_data['Streams'].replace(',', '', regex=True))
# Keep only the text before the first comma of the artist field.
region_data['Artist'] = region_data['Artist'].str.split(',').str[0]
# Only the year of each chart week is kept.
region_data['Week'] = pd.to_datetime(region_data.Week, format='%Y-%m-%d')
region_data['Year'] = region_data['Week'].dt.year
region_data = region_data.drop(columns=['Week'])
region_data.head()

Unnamed: 0,Song,Song Id,Artist,Streams,Region,Year
0,Shape of You,7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,10969080,United States,2017
1,Bad and Boujee (feat. Lil Uzi Vert),4Km5HrUvYTaSUfiSGPJeQR,Migos,10639198,United States,2017
2,Bad and Boujee (feat. Lil Uzi Vert),4Km5HrUvYTaSUfiSGPJeQR,Migos,9968813,United States,2017
3,Bad and Boujee (feat. Lil Uzi Vert),4Km5HrUvYTaSUfiSGPJeQR,Migos,11504142,United States,2017
4,Shape of You,7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,10456635,United States,2017


In [20]:
# Sum the streams of every song per (Year, Region).
tranformed_region_data = (
    region_data
    .groupby(['Year', 'Region', 'Song Id'], as_index=False)['Streams']
    .sum()
    .rename(columns={"Streams": "Total Streams"})
)
# Keep, for each (Year, Region), the row(s) whose total equals the group
# maximum. The aggregation is named by string: passing the builtin `max` to
# transform is deprecated in recent pandas releases.
is_group_max = (
    tranformed_region_data.groupby(['Year', 'Region'])['Total Streams'].transform('max')
    == tranformed_region_data['Total Streams']
)
tranformed_region_data = tranformed_region_data[is_group_max].sort_values('Year', ascending=False)
tranformed_region_data.head()

Unnamed: 0,Year,Region,Song Id,Total Streams
5275,2022,Viet Nam,5QDLhrAOJJdNAmCTJ8xMyW,3944443
5034,2022,Egypt,5CeSq5JBv2wCeIHZA0PLtT,2292052
5048,2022,Finland,55qPuI6NekVyNPM6kh5DN4,4700378
5052,2022,France,6HTJgY7iWWlVLSDHsfHePf,17955831
5055,2022,Germany,1W1N3usYjgI3XGQA0iX9Vq,18225632


In [21]:
# Look up one Song / Artist per Song Id (first occurrence wins) and attach it
# to the aggregated rows with a left join.
song_lookup = region_data[['Song Id', 'Song', 'Artist']].drop_duplicates('Song Id')
region_data = tranformed_region_data.merge(song_lookup, how="left", on=['Song Id'])
region_data.head()

Unnamed: 0,Year,Region,Song Id,Total Streams,Song,Artist
0,2022,Viet Nam,5QDLhrAOJJdNAmCTJ8xMyW,3944443,Dynamite,BTS
1,2022,Egypt,5CeSq5JBv2wCeIHZA0PLtT,2292052,البخت,Wegz
2,2022,Finland,55qPuI6NekVyNPM6kh5DN4,4700378,Ram pam pam,BESS
3,2022,France,6HTJgY7iWWlVLSDHsfHePf,17955831,Jefe,Ninho
4,2022,Germany,1W1N3usYjgI3XGQA0iX9Vq,18225632,Sehnsucht,Miksu / Macloud


In [22]:
# Write the merged per-year, per-region table to disk without the row index.
region_data.to_csv(path_or_buf='spotify_charts_data_by_region_eda.csv', index=False)

In [23]:
# (A bare region_data.sort_values(by='Region') used to precede this step; its
# result was discarded, so it had no effect and has been dropped.)
all_years_region_data = region_data.drop(columns=['Year'])
# Keep, per region, the row(s) holding that region's largest 'Total Streams'.
# The aggregation is named by string: passing the builtin `max` to transform
# is deprecated in recent pandas releases.
is_region_max = (
    all_years_region_data.groupby(['Region'])['Total Streams'].transform('max')
    == all_years_region_data['Total Streams']
)
all_years_region_data = all_years_region_data[is_region_max].sort_values('Region', ascending=True)
# Overwrite the raw totals with their percentile rank (0-100) among the kept rows.
all_years_region_data['Total Streams'] = (all_years_region_data['Total Streams'].rank(pct=True) * 100).round(0)
all_years_region_data.head()

Unnamed: 0,Region,Song Id,Total Streams,Song,Artist
208,Argentina,7k4t7uLgtOxPwTpFmtJNTY,84.0,Tusa,KAROL G
240,Australia,1rgnBhdG2JDFTbYkYRZAku,88.0,Dance Monkey,Tones And I
239,Austria,1rgnBhdG2JDFTbYkYRZAku,44.0,Dance Monkey,Tones And I
24,Belarus,48fzAdCxamFYCHNXXLk6zn,8.0,showdown,shadowraze
238,Belgium,1rgnBhdG2JDFTbYkYRZAku,67.0,Dance Monkey,Tones And I


In [24]:
# Number of rows each artist has in the table, most frequent artists first.
artist_row_counts = all_years_region_data.groupby('Artist')['Artist'].transform('count')
all_years_region_data['Frequency'] = artist_row_counts
all_years_region_data = all_years_region_data.sort_values('Frequency', ascending=False)

In [25]:
# Write the per-region table (with artist frequencies) to disk without the row index.
all_years_region_data.to_csv(path_or_buf='spotify_charts_data_all_years_by_region_eda.csv', index=False)

In [26]:
global_region_data = pd.read_csv('spotify_charts_data_global.csv')
# Strip the thousands separators before converting to numbers. The cleaned
# column is assigned back: calling replace(inplace=True) on a column selection
# is chained assignment, which does not update the frame under pandas
# Copy-on-Write and would leave the commas in place for to_numeric.
global_region_data['Streams'] = pd.to_numeric(global_region_data['Streams'].replace(',', '', regex=True))
# Keep only the text before the first comma of the artist field.
global_region_data['Artist'] = global_region_data['Artist'].str.split(',').str[0]
# Count the rows per artist and order artists by that count, highest first.
global_region_data = (
    global_region_data
    .groupby(['Artist'], as_index=False)['Streams']
    .count()
    .sort_values(by='Streams', ascending=False)
    .rename(columns={"Streams": "Number of Appearances in Top Weekly Songs"})
    .reset_index(drop=True)
)
# Turn the 0-based positional index into a 1-based 'Rank'.
global_region_data.index += 1
global_region_data.index.names = ['Rank']
global_region_data.head()

Unnamed: 0_level_0,Artist,Number of Appearances in Top Weekly Songs
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Post Malone,26
2,Drake,22
3,Ed Sheeran,20
4,Ariana Grande,20
5,Tones And I,17


In [27]:
# Write the artist ranking to disk; the index is written too, so the 'Rank'
# index becomes the first CSV column.
global_region_data.to_csv(path_or_buf='spotify_charts_data_global_artist.csv')