In [1]:
# Importing the dataset as a dataframe

import pandas as pd
import numpy as np

df = pd.read_csv(filepath_or_buffer='../dataset/spotify_most_streamed_songs.csv')

# Print the first 50 rows in full (to_string avoids column truncation) to see what the data looks like
print(df.iloc[:50].to_string())

                                                                                       track_name                                                                artist(s)_name  artist_count  released_year  released_month  released_day  in_spotify_playlists  in_spotify_charts     streams  in_apple_playlists  in_apple_charts in_deezer_playlists  in_deezer_charts in_shazam_charts  bpm  key   mode  danceability_%  valence_%  energy_%  acousticness_%  instrumentalness_%  liveness_%  speechiness_%                                                         cover_url
0                                                             Seven (feat. Latto) (Explicit Ver.)                                                              Latto, Jung Kook             2           2023               7            14                   553                147   141381703                  43              263                  45                10              826  125    B  Major              80         89        83    

In [2]:
# Reviewing high level stats of the dataset

# Summary statistics (count, mean, std, ...) for the numeric columns,
# printed in full so that no column is elided.
print(df.describe().to_string())
print('\n')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

       artist_count  released_year  released_month  released_day  in_spotify_playlists  in_spotify_charts  in_apple_playlists  in_apple_charts  in_deezer_charts         bpm  danceability_%   valence_%    energy_%  acousticness_%  instrumentalness_%  liveness_%  speechiness_%
count    953.000000     953.000000      953.000000    953.000000            953.000000         953.000000          953.000000       953.000000        953.000000  953.000000       953.00000  953.000000  953.000000      953.000000          953.000000  953.000000     953.000000
mean       1.556139    2018.238195        6.033578     13.930745           5200.124869          12.009444           67.812172        51.908709          2.666317  122.540399        66.96957   51.431270   64.279119       27.057712            1.581322   18.213012      10.131165
std        0.893044      11.116218        3.566435      9.201949           7897.608990          19.575992           86.441493        50.630241          6.035599   28.057802

Initial Notes
- 953 unique songs
- Song metrics rated on scale 0% to 100%
- Spotify seems to be a more widely used platform for top songs
- Most artists show up only once in this list
- The dataset is fairly clean with no null values in any column

In [3]:
# Which 10 values of `artist(s)_name` occur most often in the most-streamed dataset,
# and how do the average song characteristics of those rows compare with the rest?

# Song-characteristic columns averaged for both groups
feature_cols = [
    'bpm',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

# The 10 most frequent `artist(s)_name` values and a row mask selecting their songs
top_10 = df.value_counts('artist(s)_name').head(10)
top_10_mask = df['artist(s)_name'].isin(top_10.index)
# Every row that is not selected by the top-10 mask
remaining_mask = ~top_10_mask

print('Top 10')
print(df.loc[top_10_mask, feature_cols].mean())
print('\n')
print('Remaining')
print(df.loc[remaining_mask, feature_cols].mean())

Top 10
bpm                   123.056604
danceability_%         63.220126
valence_%              46.698113
energy_%               60.893082
acousticness_%         28.949686
instrumentalness_%      1.150943
liveness_%             17.911950
speechiness_%           9.352201
dtype: float64


Remaining
bpm                   122.437028
danceability_%         67.720403
valence_%              52.379093
energy_%               64.957179
acousticness_%         26.678841
instrumentalness_%      1.667506
liveness_%             18.273300
speechiness_%          10.287154
dtype: float64


In [4]:
# How do top streamed songs trend over time?
# Count the tracks for each release year and list the years with the most tracks first.

(
    df
    .groupby('released_year')['track_name']
    .count()
    .sort_values(ascending=False)
)

released_year
2022    402
2023    175
2021    119
2020     37
2019     36
2017     23
2016     18
2013     13
2014     13
2015     11
2018     10
2011     10
2012     10
2010      7
2002      6
1999      5
2004      4
2000      4
1984      4
1963      3
1958      3
1995      2
2003      2
2008      2
1970      2
1957      2
1986      2
1985      2
1982      2
1959      2
1975      2
1991      2
2007      1
1930      1
2005      1
1998      1
1997      1
1942      1
1994      1
1992      1
1987      1
1983      1
1979      1
1973      1
1971      1
1968      1
1952      1
1950      1
1946      1
1996      1
Name: track_name, dtype: int64

- There is a higher concentration of top-streamed songs in the 2000s, which might make sense given that music streaming apps have become more relevant over the last decade. This isn't to say songs released prior to the 2000s cannot be streamed, but 

In [5]:
# Which key has the most songs written in it? What are the average characteristics
# of the songs in that key, and how do they compare with the remainder of the dataset?

# Song-characteristic columns averaged for both groups
feature_cols = [
    'bpm',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

# Single-row Series: the key with the highest track count
top_key = df.groupby('key')['track_name'].count().sort_values(ascending=False).head(1)
top_key_mask = df['key'].isin(top_key.index)
# Every row whose key is not the top key
remainder_mask = ~top_key_mask

print(top_key)
print(df.loc[top_key_mask, feature_cols].mean())
print(df.loc[remainder_mask, feature_cols].mean())

key
C#    120
Name: track_name, dtype: int64
bpm                   122.341667
danceability_%         68.641667
valence_%              49.791667
energy_%               66.550000
acousticness_%         21.066667
instrumentalness_%      1.241667
liveness_%             18.241667
speechiness_%          12.066667
dtype: float64
bpm                   122.569028
danceability_%         66.728691
valence_%              51.667467
energy_%               63.951981
acousticness_%         27.920768
instrumentalness_%      1.630252
liveness_%             18.208884
speechiness_%           9.852341
dtype: float64


In [8]:
# Average streams (in millions) for each distinct `in_spotify_charts` value,
# highest average first.

# Convert `streams` to a numeric dtype; with errors='coerce' any value that
# cannot be parsed becomes NaN instead of raising.
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')

# Raise the display limit so tables of up to 150 rows are shown without truncation.
pd.set_option('display.max_rows', 150)

# NOTE: this cell previously ran the same computation grouped by
# `in_spotify_playlists` first, but that result was overwritten and never
# displayed, so only the pass whose output is shown is kept.
streams_by_charts = (
    df.groupby(['in_spotify_charts'])[['streams']]
    .mean()
    .sort_values(by='streams', ascending=False)
)
(streams_by_charts / 1000000).round(2)



Unnamed: 0_level_0,streams
in_spotify_charts,Unnamed: 1_level_1
79,2565.53
63,2557.98
130,2513.19
69,1890.33
61,1762.17
62,1592.91
52,1514.13
53,1469.56
78,1415.34
43,1352.81
