In [1]:
import pathlib, warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.express as px
print(pd.__version__)


DATA_DIR = pathlib.Path('data')

2.2.1


In [52]:
IN_COLUMNS = [
    'in_spotify_playlists', 'in_spotify_charts',
    'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists',
    'in_deezer_charts', 'in_shazam_charts'
]
songs = (pd
    .read_csv(DATA_DIR / 'spotify-2023.csv', encoding='latin-1')
    .rename(columns={'released_day': 'day', 'released_month': 'month', 'released_year': 'year'})
    .astype({'key': 'category', 'mode': 'category'})
)
songs.insert(3, 'released_date', pd.to_datetime(songs[['year', 'month', 'day']]))
songs_charts = songs[IN_COLUMNS]
songs = songs.drop(columns=['day', 'month', 'year'] + IN_COLUMNS)
songs.columns = [col.replace('(', '').replace(')', '').replace('_%', '') for col in songs.columns]
songs['streams'] = pd.to_numeric(songs['streams'], errors='coerce').fillna(0).astype(int)

In [53]:
songs.columns

Index(['track_name', 'artists_name', 'artist_count', 'released_date',
       'streams', 'bpm', 'key', 'mode', 'danceability', 'valence', 'energy',
       'acousticness', 'instrumentalness', 'liveness', 'speechiness'],
      dtype='object')

In [54]:
songs.head()

Unnamed: 0,track_name,artists_name,artist_count,released_date,streams,bpm,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023-07-14,141381703,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023-03-23,133716286,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023-06-30,140003974,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019-08-23,800840817,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023-05-18,303236322,144,A,Minor,65,23,80,14,63,11,6


In [5]:
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   track_name        953 non-null    object        
 1   artists_name      953 non-null    object        
 2   artist_count      953 non-null    int64         
 3   released_date     953 non-null    datetime64[ns]
 4   streams           953 non-null    int32         
 5   bpm               953 non-null    int64         
 6   key               858 non-null    category      
 7   mode              953 non-null    category      
 8   danceability      953 non-null    int64         
 9   valence           953 non-null    int64         
 10  energy            953 non-null    int64         
 11  acousticness      953 non-null    int64         
 12  instrumentalness  953 non-null    int64         
 13  liveness          953 non-null    int64         
 14  speechiness       953 non-

In [6]:
songs.describe()

Unnamed: 0,artist_count,released_date,streams,bpm,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness
count,953.0,953,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0
mean,1.556139,2018-09-10 05:09:45.519412480,389977600.0,122.540399,66.96957,51.43127,64.279119,27.057712,1.581322,18.213012,10.131165
min,1.0,1930-01-01 00:00:00,-2147484000.0,65.0,23.0,4.0,9.0,0.0,0.0,3.0,2.0
25%,1.0,2020-06-28 00:00:00,130655800.0,100.0,57.0,32.0,53.0,6.0,0.0,10.0,4.0
50%,1.0,2022-04-08 00:00:00,267758500.0,121.0,69.0,51.0,66.0,18.0,0.0,12.0,6.0
75%,2.0,2022-11-04 00:00:00,607123800.0,140.0,78.0,70.0,77.0,43.0,0.0,24.0,11.0
max,8.0,2023-07-14 00:00:00,2135158000.0,206.0,96.0,97.0,97.0,97.0,91.0,97.0,64.0
std,0.893044,,614566400.0,28.057802,14.63061,23.480632,16.550526,25.996077,8.4098,13.711223,9.912888


# Analysis

In [34]:
CHARACTERISTICS = [
    'danceability', 'valence',
    'energy', 'acousticness',
    'instrumentalness', 'liveness',
    'speechiness',
]

In [35]:
characteristics_by_key_bars = px.bar(
    (songs
        .groupby('key')
        [CHARACTERISTICS]
        .mean()
    ), 
    barmode='group',
    title='Characteristics by Key',
    labels={'value': 'Mean Value', 'key': 'Key', 'variable': 'Characteristic'},
    template='plotly_dark'
)
characteristics_by_key_bars.update_layout(legend=dict(orientation='h',yanchor='top',y=1.15,xanchor='right',x=1))
characteristics_by_key_bars.show()

In [40]:
characteristics_by_mode_bars = px.bar(
    (songs
        .groupby(['mode'])
        [CHARACTERISTICS]
        .mean()
        .transpose()
    ), 
    barmode='group',
    title='Characteristics by Mode',
    labels={'value': 'Mean Value', 'index': 'Characteristic'},
    template='plotly_dark'
)
characteristics_by_mode_bars.update_layout(legend=dict(orientation='h',yanchor='top',y=1.15,xanchor='right',x=1))
characteristics_by_mode_bars.show()

In [50]:
streams_bpm_by_year = (
    songs
    .groupby(songs.released_date.dt.year)
    [['streams', 'bpm']]
    .mean()
)

In [56]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=streams_bpm_by_year.index, y=streams_bpm_by_year['streams'], name="Streams"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=streams_bpm_by_year.index, y=streams_bpm_by_year['bpm'], name="BPM"),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Double Y Axis Example",
    template='plotly_dark'
)

# Set x-axis title
fig.update_xaxes(title_text="xaxis title")

# Set y-axes titles
fig.update_yaxes(title_text="<b>primary</b> yaxis title", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> yaxis title", secondary_y=True)

fig.show()

In [60]:
(
    songs
    .join(songs_charts)
    .groupby(songs.released_date.dt.year)
    [IN_COLUMNS]
    .mean()
)

TypeError: agg function failed [how->mean,dtype->object]

In [66]:
songs_charts.in_deezer_playlists.str.replace(',', '').astype(int)

0       45
1       58
2       91
3      125
4       87
      ... 
948     37
949      8
950      7
951     17
952     32
Name: in_deezer_playlists, Length: 953, dtype: int32

In [75]:
songs_charts.in_shazam_charts.str.replace(',', '').astype(int, errors='coerce')

ValueError: Expected value of kwarg 'errors' to be one of ['raise', 'ignore']. Supplied value is 'coerce'