In [1]:
from fycharts.SpotifyCharts import SpotifyCharts
import numpy as np
import pandas as pd
import sqlalchemy
import matplotlib
from matplotlib import pyplot as plt
from datetime import date, timedelta
import plotly.graph_objects as go
from enum import Enum, auto
from spoticharts import SpotiCharts
from functools import lru_cache
from collections import defaultdict
import statistics

In [2]:
# Shared SpotiCharts client; the cached chart loaders (viral_50_dataframe and
# top_200_dataframe) issue all of their queries through this instance.
sc = SpotiCharts()

2021-03-31 11:41:18,277 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-03-31 11:41:18,278 INFO sqlalchemy.engine.base.Engine ()
2021-03-31 11:41:18,279 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-03-31 11:41:18,279 INFO sqlalchemy.engine.base.Engine ()


In [3]:
class Regions(Enum):
    """Regions charted in this notebook.

    Each member's value is a human-readable name; add_multiple_sources reads
    ``name.value`` to label the corresponding bar trace.
    """
    PL = 'Poland'
    DE = 'Germany'
    CZ = 'Czechia'
    GLOBAL = 'Global'
    US = 'United States'
    MX = 'Mexico'
    CA = 'Canada'

In [4]:
# Per-region divisor used by total_streams to normalize stream counts.
# NOTE(review): values are presumably populations; the notebook does not cite a source.
DEMOGRAPHICS = {
    Regions.PL : 38, 
    Regions.DE : 83, 
    Regions.CZ : 10,
    Regions.GLOBAL: 7800,
    Regions.US: 328,
    Regions.MX: 126,
    Regions.CA: 38
} # in millions

In [5]:
# Code string for each region; apply_producer passes it to the chart loaders,
# which forward it to SpotiCharts as the single entry of `country_codes`.
COUNTRY_CODES = {
    Regions.PL : 'pl',
    Regions.DE : 'de', 
    Regions.CZ : 'cz',
    Regions.GLOBAL: 'global',
    Regions.US: 'us',
    Regions.MX: 'mx',
    Regions.CA: 'ca'
}

@lru_cache(maxsize=8)
def viral_50_dataframe(country_code, include_features=False):
    """Load every Viral 50 row for one country code via the shared client.

    Results are memoized by ``lru_cache`` (up to 8 distinct argument
    combinations), so repeated calls return the very same object; callers
    should treat it as read-only.

    Args:
        country_code: Region code string, sent as the only ``country_codes`` entry.
        include_features: Forwarded as ``include_audio_features``.

    Returns:
        Whatever ``sc.get_all_viral`` returns (callers index it like a DataFrame).
    """
    viral_chart = sc.get_all_viral(
        country_codes=[country_code],
        include_audio_features=include_features,
    )
    return viral_chart

@lru_cache(maxsize=8)
def top_200_dataframe(country_code, include_features=False):
    """Load every Top 200 row for one country code via the shared client.

    Results are memoized by ``lru_cache`` (up to 8 distinct argument
    combinations), so repeated calls return the very same object; callers
    should treat it as read-only.

    Args:
        country_code: Region code string, sent as the only ``country_codes`` entry.
        include_features: Forwarded as ``include_audio_features``.

    Returns:
        Whatever ``sc.get_all_top_200`` returns (callers index it like a DataFrame).
    """
    top_chart = sc.get_all_top_200(
        country_codes=[country_code],
        include_audio_features=include_features,
    )
    return top_chart

def apply_producer(chart, region):
    """Load ``chart`` for ``region`` without audio features.

    Args:
        chart: Chart descriptor dict; its 'producer' entry is called.
        region: Key into COUNTRY_CODES selecting the code to load.

    Returns:
        The producer's result for that region's country code.
    """
    producer = chart['producer']
    country_code = COUNTRY_CODES[region]
    return producer(country_code)

def apply_producer_with_audio_features(chart, region):
    """Load ``chart`` for ``region`` with audio features requested.

    Args:
        chart: Chart descriptor dict; its 'producer' entry is called.
        region: Key into COUNTRY_CODES selecting the code to load.

    Returns:
        The producer's result for that region's country code, called with
        ``True`` as its second positional argument.
    """
    producer = chart['producer']
    country_code = COUNTRY_CODES[region]
    return producer(country_code, True)

In [6]:
# Chart registry: each entry carries a display 'name' (used in plot titles)
# and a 'producer' callable that loads that chart for a country code.
CHARTS = {
    'top200' : {
        'name': 'TOP 200', 
        'producer': top_200_dataframe,
    },
    'viral50': {
        'name': 'VIRAL 50', 
        'producer': viral_50_dataframe,
    }
}

# Plot utils

In [7]:
# Global matplotlib font defaults (bold, size 16), applied through matplotlib.rc.
font = {'weight' : 'bold',
        'size'   : 16}

matplotlib.rc('font', **font)

In [8]:
def plot_multiple_bar(data, title, with_labels=True):
    """Build and display a bar figure with one trace per entry in ``data``.

    Args:
        data: Iterable of ``(name, values)`` pairs, as consumed by
            add_multiple_sources.
        title: Figure title.
        with_labels: Forwarded to add_multiple_sources.
    """
    figure = go.Figure()
    figure = add_multiple_sources(figure, data, with_labels)
    figure = add_timeline(figure, title)
    figure.show()

In [9]:
def add_multiple_sources(fig, data, with_labels=True):
    """Add one bar trace to ``fig`` for every ``(name, values)`` pair in ``data``.

    Args:
        fig: Figure that receives the traces through ``add_trace``.
        data: Iterable of ``(name, values)`` pairs. ``name.value`` becomes the
            trace name; ``values`` is ``(x, y, labels)`` when ``with_labels``
            is true and ``(x, y)`` otherwise.
        with_labels: When true, ``labels`` is used as the hover text and is
            the only hover information shown.

    Returns:
        The same ``fig`` object.
    """
    for region, series in data:
        if not with_labels:
            dates, heights = series
            trace = go.Bar(x=dates,
                           y=heights,
                           name=region.value)
        else:
            dates, heights, hover_text = series
            trace = go.Bar(x=dates,
                           y=heights,
                           name=region.value,
                           text=hover_text,
                           hoverinfo='text')
        fig.add_trace(trace)

    return fig

In [10]:
def add_timeline(fig, title):
    """Set the figure title and turn the x axis into a date timeline.

    The x axis gets range-selector buttons (1m, 6m, YTD, 1y, all) and a
    visible range slider, and is typed as a date axis.

    Args:
        fig: Object exposing ``update_layout(**kwargs)``.
        title: Text assigned to ``title_text``.

    Returns:
        The same ``fig`` object.
    """
    fig.update_layout(title_text=title)

    # Range-selector shortcuts, in display order.
    range_buttons = [
        {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
        {'count': 6, 'label': "6m", 'step': "month", 'stepmode': "backward"},
        {'count': 1, 'label': "YTD", 'step': "year", 'stepmode': "todate"},
        {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
        {'step': "all"},
    ]
    date_axis = {
        'rangeselector': {'buttons': range_buttons},
        'rangeslider': {'visible': True},
        'type': "date",
    }
    fig.update_layout(xaxis=date_axis)
    return fig

## Metrics

In [11]:
def total_streams(region, artist_all, requested_dates):
    """Sum streams per date and scale them by the region's DEMOGRAPHICS entry.

    Args:
        region: Key into DEMOGRAPHICS; each daily total is divided by that value.
        artist_all: DataFrame with 'date', 'streams' and 'track_name' columns.
        requested_dates: Sequence of dates to report on, in output order.

    Returns:
        A tuple ``(values, labels)`` of two lists aligned with
        ``requested_dates``: the scaled stream total (0 divided by the
        DEMOGRAPHICS entry when the date has no rows) and the track names for
        that date, each prefixed with '<br>' ('' when the date has no rows).
    """
    streams_by_day = {}
    tracks_by_day = {}
    rows = artist_all[['date', 'streams', 'track_name']].values
    for day, stream_count, track in rows:
        streams_by_day[day] = streams_by_day.get(day, 0) + stream_count
        tracks_by_day[day] = tracks_by_day.get(day, '') + '<br>' + track

    scaled = [streams_by_day.get(day, 0) / DEMOGRAPHICS[region]
              for day in requested_dates]
    hover = [tracks_by_day.get(day, '') for day in requested_dates]
    return (scaled, hover)

In [12]:
def number_of_songs(_region, artist_all, requested_dates):
    """Count chart rows per date and collect the matching track names.

    Args:
        _region: Unused; present so all metrics share one call signature.
        artist_all: DataFrame with 'date', 'position' and 'track_name' columns.
        requested_dates: Sequence of dates to report on, in output order.

    Returns:
        A tuple ``(counts, labels)`` of two lists aligned with
        ``requested_dates``: the number of rows on each date (0 when none) and
        the track names for that date, each prefixed with '<br>' ('' when none).
    """
    count_by_day = {}
    tracks_by_day = {}
    rows = artist_all[['date', 'position', 'track_name']].values
    for day, _position, track in rows:
        count_by_day[day] = count_by_day.get(day, 0) + 1
        tracks_by_day[day] = tracks_by_day.get(day, '') + '<br>' + track

    counts = [count_by_day.get(day, 0) for day in requested_dates]
    hover = [tracks_by_day.get(day, '') for day in requested_dates]
    return (counts, hover)

In [13]:
def max_rank(_region, artist_all, requested_dates):
    """Return the best (numerically lowest) chart position per requested date.

    Args:
        _region: Unused; present so all metrics share one call signature.
        artist_all: DataFrame with 'date', 'position' and 'track_name' columns.
        requested_dates: Sequence of dates to report on, in output order.

    Returns:
        A tuple ``(ranks, labels)`` of two lists aligned with
        ``requested_dates``: the lowest position seen on each date (0 when the
        date has no rows) and the name of the track holding that position
        ('' when the date has no rows). On ties the first row encountered wins.
    """
    best_rank = {}
    best_track = {}
    for day, position, track in artist_all[['date', 'position', 'track_name']].values:
        # Compare against +inf instead of a fixed 201 sentinel so the result
        # does not depend on the chart having at most 200 positions.
        if position < best_rank.get(day, float('inf')):
            best_rank[day] = position
            best_track[day] = track
    return ([best_rank.get(day, 0) for day in requested_dates],
            [best_track.get(day, '') for day in requested_dates])

In [36]:
def average_metric(_region, df, requested_dates, audio_feature):
    """Average one column's values per date.

    Args:
        _region: Unused; present so all metrics share one call signature.
        df: DataFrame with a 'date' column and a column named ``audio_feature``.
        requested_dates: Sequence of dates to report on, in output order.
        audio_feature: Name of the column to average.

    Returns:
        A tuple ``(averages, labels)`` of two lists aligned with
        ``requested_dates``: ``statistics.mean`` of the column on each date
        (0 when the date has no rows) and an empty string per date.
    """
    samples_by_day = defaultdict(list)
    for day, feature_value in df[['date', audio_feature]].values:
        samples_by_day[day].append(feature_value)

    averages = []
    for day in requested_dates:
        # Dates without rows fall back to a single 0 sample.
        samples = samples_by_day.get(day, [0])
        averages.append(statistics.mean(samples))

    blank_labels = ['' for _ in requested_dates]
    return (averages, blank_labels)

## Plots per artist

In [15]:
def plot(start_date, end_date, artist_name, regions, metric, title, chart):
    """Plot one bar series per region for a single artist's chart rows.

    Args:
        start_date: First day of the plotted range (inclusive).
        end_date: Last day of the plotted range (inclusive).
        artist_name: Exact value matched against the 'artist' column.
        regions: Iterable of regions; each becomes one trace.
        metric: Callable ``metric(region, rows, requested_dates)`` returning
            ``(values, labels)`` aligned with ``requested_dates``.
        title: Figure title.
        chart: Chart descriptor dict passed to apply_producer.
    """
    # 'D' is the documented pandas offset alias for calendar-day frequency.
    dates_range = pd.date_range(start_date, end_date, freq='D')
    requested_dates = list(dates_range.date)
    data = []  # one (region, (dates, values, labels)) entry per region
    for region in regions:
        chart_rows = apply_producer(chart, region)
        artist_all = chart_rows[chart_rows['artist'] == artist_name]
        values, labels = metric(region, artist_all, requested_dates)
        data.append((region, (requested_dates, values, labels)))
    plot_multiple_bar(data, title)

In [16]:
def plot_total_streams(chart, start_date, end_date, artist_name, regions):
    """Plot an artist's daily stream totals per region.

    Values come from total_streams, which divides each daily total by the
    region's DEMOGRAPHICS entry.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        artist_name: Artist to select.
        regions: Iterable of regions to plot.
    """
    title_template = '[{}] Total number of streams for: "{}" from: {} to: {}'
    title = title_template.format(chart['name'], artist_name, start_date, end_date)
    plot(
        start_date=start_date,
        end_date=end_date,
        artist_name=artist_name,
        regions=regions,
        metric=total_streams,
        title=title,
        chart=chart,
    )

In [17]:
def plot_number_of_songs(chart, start_date, end_date, artist_name, regions):
    """Plot how many of an artist's songs are on the chart each day, per region.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        artist_name: Artist to select.
        regions: Iterable of regions to plot.
    """
    title_template = '[{}] Number of songs: "{}" from: {} to: {}'
    title = title_template.format(chart['name'], artist_name, start_date, end_date)
    plot(
        start_date=start_date,
        end_date=end_date,
        artist_name=artist_name,
        regions=regions,
        metric=number_of_songs,
        title=title,
        chart=chart,
    )

In [18]:
def plot_max_rank(chart, start_date, end_date, artist_name, regions):
    """Plot an artist's best daily chart position per region.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        artist_name: Artist to select.
        regions: Iterable of regions to plot.
    """
    title_template = '[{}] Max rank of: "{}" from: {} to: {}'
    title = title_template.format(chart['name'], artist_name, start_date, end_date)
    plot(
        start_date=start_date,
        end_date=end_date,
        artist_name=artist_name,
        regions=regions,
        metric=max_rank,
        title=title,
        chart=chart,
    )

## Plots per song

In [19]:
def plot_songs_with_keyword(start_date, end_date, keyword, regions, metric, title, chart):
    """Plot one bar series per region for tracks whose name matches ``keyword``.

    Args:
        start_date: First day of the plotted range (inclusive).
        end_date: Last day of the plotted range (inclusive).
        keyword: Pattern handed to ``Series.str.contains``; with that method's
            defaults it is a case-sensitive regular expression, so
            'Christmas|Claus' matches either word.
        regions: Iterable of regions; each becomes one trace.
        metric: Callable ``metric(region, rows, requested_dates)`` returning
            ``(values, labels)`` aligned with ``requested_dates``.
        title: Figure title.
        chart: Chart descriptor dict passed to apply_producer.
    """
    # 'D' is the documented pandas offset alias for calendar-day frequency.
    dates_range = pd.date_range(start_date, end_date, freq='D')
    requested_dates = list(dates_range.date)
    data = []  # one (region, (dates, values, labels)) entry per region
    for region in regions:
        chart_rows = apply_producer(chart, region)
        # na=False: rows with a missing track name count as non-matching.
        songs_all = chart_rows[chart_rows['track_name'].str.contains(keyword, na=False)]
        values, labels = metric(region, songs_all, requested_dates)
        data.append((region, (requested_dates, values, labels)))
    plot_multiple_bar(data, title)

In [20]:
def plot_total_streams_of_songs_with_keyword(chart, start_date, end_date, keyword, regions):
    """Plot daily stream totals of tracks whose name matches ``keyword``.

    Values come from total_streams, which divides each daily total by the
    region's DEMOGRAPHICS entry.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        keyword: Track-name pattern to select.
        regions: Iterable of regions to plot.
    """
    title_template = '[{}] Total number of streams for: {} from: {} to: {}'
    title = title_template.format(chart['name'], keyword, start_date, end_date)
    plot_songs_with_keyword(
        start_date=start_date,
        end_date=end_date,
        keyword=keyword,
        regions=regions,
        metric=total_streams,
        title=title,
        chart=chart,
    )

In [21]:
def plot_number_of_songs_with_keyword(chart, start_date, end_date, keyword, regions):
    """Plot how many tracks matching ``keyword`` are on the chart each day.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        keyword: Track-name pattern to select.
        regions: Iterable of regions to plot.
    """
    # The title names the metric actually plotted (a song count, not streams).
    title = '[{}] Number of songs for: {} from: {} to: {}'.format(chart['name'], keyword, start_date, end_date)
    plot_songs_with_keyword(start_date, end_date, keyword, regions, number_of_songs, title, chart)

In [22]:
def plot_max_rank_of_songs_with_keyword(chart, start_date, end_date, keyword, regions):
    """Plot the best daily chart position among tracks matching ``keyword``.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        keyword: Track-name pattern to select.
        regions: Iterable of regions to plot.
    """
    # The title names the metric actually plotted (best rank, not streams).
    title = '[{}] Max rank for: {} from: {} to: {}'.format(chart['name'], keyword, start_date, end_date)
    plot_songs_with_keyword(start_date, end_date, keyword, regions, max_rank, title, chart)

## Plots with aggregated feature characteristics

In [37]:
def plot_songs_with_audio_feature(start_date, end_date, audio_feature, regions, metric, title, chart):
    """Plot one bar series per region from an audio-feature metric.

    Args:
        start_date: First day of the plotted range (inclusive).
        end_date: Last day of the plotted range (inclusive).
        audio_feature: Column name handed to ``metric``.
        regions: Iterable of regions; each becomes one trace.
        metric: Callable ``metric(region, df, requested_dates, audio_feature)``
            returning ``(values, labels)`` aligned with ``requested_dates``.
        title: Figure title.
        chart: Chart descriptor dict passed to apply_producer_with_audio_features.
    """
    # 'D' is the documented pandas offset alias for calendar-day frequency.
    dates_range = pd.date_range(start_date, end_date, freq='D')
    requested_dates = list(dates_range.date)
    data = []  # one (region, (dates, values, labels)) entry per region
    for region in regions:
        chart_rows = apply_producer_with_audio_features(chart, region)
        values, labels = metric(region, chart_rows, requested_dates, audio_feature)
        data.append((region, (requested_dates, values, labels)))
    plot_multiple_bar(data, title)

In [38]:
def plot_average_value_of_audio_feature(chart, start_date, end_date, audio_feature, regions):
    """Plot the daily average of one audio feature across a chart, per region.

    Args:
        chart: Chart descriptor dict; its 'name' is shown in the title.
        start_date: First day of the plotted range.
        end_date: Last day of the plotted range.
        audio_feature: Name of the feature column to average.
        regions: Iterable of regions to plot.
    """
    title_template = '[{}] Average value of metric: {} from: {} to: {}'
    title = title_template.format(chart['name'], audio_feature, start_date, end_date)
    plot_songs_with_audio_feature(
        start_date=start_date,
        end_date=end_date,
        audio_feature=audio_feature,
        regions=regions,
        metric=average_metric,
        title=title,
        chart=chart,
    )

# Histograms

## TOP 200

## Total number of streams

### The Weeknd

In [135]:
# Top 200 streams (scaled by DEMOGRAPHICS) for The Weeknd in Poland, Czechia and Germany.
plot_total_streams(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'The Weeknd', 
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Taco Hemingway

In [136]:
# Top 200 streams (scaled by DEMOGRAPHICS) for Taco Hemingway in Poland, Czechia and Germany.
plot_total_streams(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Taco Hemingway',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Billie Eilish

In [137]:
# Top 200 streams (scaled by DEMOGRAPHICS) for Billie Eilish in Poland, Czechia and Germany.
plot_total_streams(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

In [138]:
# Same Billie Eilish metric for the United States, Canada and Mexico.
plot_total_streams(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.US, 
                    Regions.CA, 
                    Regions.MX])

2021-03-31 11:08:21,405 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-03-31 11:08:21,406 INFO sqlalchemy.engine.base.Engine SELECT "Top200".id AS "Top200_id", "Top200".spotify_id AS "Top200_spotify_id", "Top200".date AS "Top200_date", "Top200".region AS "Top200_region", "Top200".position AS "Top200_position", "Top200".track_name AS "Top200_track_name", "Top200".artist AS "Top200_artist", "Top200".streams AS "Top200_streams" 
FROM "Top200" 
WHERE "Top200".region IN (?)
2021-03-31 11:08:21,406 INFO sqlalchemy.engine.base.Engine ('us',)
2021-03-31 11:08:26,783 INFO sqlalchemy.engine.base.Engine ROLLBACK
2021-03-31 11:08:27,720 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-03-31 11:08:27,721 INFO sqlalchemy.engine.base.Engine SELECT "Top200".id AS "Top200_id", "Top200".spotify_id AS "Top200_spotify_id", "Top200".date AS "Top200_date", "Top200".region AS "Top200_region", "Top200".position AS "Top200_position", "Top200".track_name AS "Top200_track_name", "Top200".artist

In [139]:
# Same Billie Eilish metric comparing the United States, the global chart and Poland.
plot_total_streams(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.US, 
                    Regions.GLOBAL, 
                    Regions.PL])

2021-03-31 11:08:36,178 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-03-31 11:08:36,179 INFO sqlalchemy.engine.base.Engine SELECT "Top200".id AS "Top200_id", "Top200".spotify_id AS "Top200_spotify_id", "Top200".date AS "Top200_date", "Top200".region AS "Top200_region", "Top200".position AS "Top200_position", "Top200".track_name AS "Top200_track_name", "Top200".artist AS "Top200_artist", "Top200".streams AS "Top200_streams" 
FROM "Top200" 
WHERE "Top200".region IN (?)
2021-03-31 11:08:36,179 INFO sqlalchemy.engine.base.Engine ('global',)
2021-03-31 11:08:40,843 INFO sqlalchemy.engine.base.Engine ROLLBACK


## Number of songs in Top 200

### The Weeknd

In [140]:
# Daily count of The Weeknd's songs on the Top 200 in Poland, Czechia and Germany.
plot_number_of_songs(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'The Weeknd',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Taco Hemingway

In [141]:
# Daily count of Taco Hemingway's songs on the Top 200 in Poland, Czechia and Germany.
plot_number_of_songs(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Taco Hemingway',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Billie Eilish

In [142]:
# Daily count of Billie Eilish's songs on the Top 200 in Poland, Czechia and Germany.
plot_number_of_songs(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

## Rank in Top 200

### The Weeknd

In [143]:
# Best daily Top 200 position for The Weeknd in Poland, Czechia and Germany.
plot_max_rank(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'The Weeknd',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Taco Hemingway

In [144]:
# Best daily Top 200 position for Taco Hemingway in Poland, Czechia and Germany.
plot_max_rank(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Taco Hemingway',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Billie Eilish

In [145]:
# Best daily Top 200 position for Billie Eilish in Poland, Czechia and Germany.
plot_max_rank(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

# VIRAL 50

### The Weeknd

In [146]:
# Best daily Viral 50 position for The Weeknd in Poland, Czechia and Germany.
plot_max_rank(CHARTS['viral50'], date(2017,1,1), date(2021,3,20), 'The Weeknd',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

2021-03-31 11:09:35,903 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-03-31 11:09:35,905 INFO sqlalchemy.engine.base.Engine SELECT "Viral50".id AS "Viral50_id", "Viral50".spotify_id AS "Viral50_spotify_id", "Viral50".date AS "Viral50_date", "Viral50".region AS "Viral50_region", "Viral50".position AS "Viral50_position", "Viral50".track_name AS "Viral50_track_name", "Viral50".artist AS "Viral50_artist" 
FROM "Viral50" 
WHERE "Viral50".region IN (?)
2021-03-31 11:09:35,905 INFO sqlalchemy.engine.base.Engine ('pl',)
2021-03-31 11:09:40,243 INFO sqlalchemy.engine.base.Engine ROLLBACK
2021-03-31 11:09:40,451 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-03-31 11:09:40,452 INFO sqlalchemy.engine.base.Engine SELECT "Viral50".id AS "Viral50_id", "Viral50".spotify_id AS "Viral50_spotify_id", "Viral50".date AS "Viral50_date", "Viral50".region AS "Viral50_region", "Viral50".position AS "Viral50_position", "Viral50".track_name AS "Viral50_track_name", "Viral50".artist AS "Vira

### Taco Hemingway

In [147]:
# Best daily Viral 50 position for Taco Hemingway in Poland, Czechia and Germany.
plot_max_rank(CHARTS['viral50'], date(2017,1,1), date(2021,3,20), 'Taco Hemingway',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

### Billie Eilish

In [148]:
# Best daily Viral 50 position for Billie Eilish in Poland, Czechia and Germany.
plot_max_rank(CHARTS['viral50'], date(2017,1,1), date(2021,3,20), 'Billie Eilish',
                   [Regions.PL, 
                    Regions.CZ, 
                    Regions.DE])

# Keywords

In [156]:
# The keyword is used as a regex by str.contains, so '|' matches either "Christmas" or "Claus".
plot_total_streams_of_songs_with_keyword(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Christmas|Claus',
                   [Regions.GLOBAL, 
                    Regions.PL])

In [157]:
# Top 200 streams of tracks whose name contains "Holidays", global chart vs Poland.
plot_total_streams_of_songs_with_keyword(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Holidays',
                   [Regions.GLOBAL, 
                    Regions.PL])

In [161]:
# Top 200 streams of tracks whose name contains "Starboy", global chart vs Poland.
plot_total_streams_of_songs_with_keyword(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'Starboy',
                   [Regions.GLOBAL, 
                    Regions.PL])

## Audio characteristics

In [40]:
# Daily average 'danceability' across the Top 200 in Poland (loads the chart with audio features).
plot_average_value_of_audio_feature(CHARTS['top200'], date(2017,1,1), date(2021,3,20), 'danceability',
                   [Regions.PL])