In [9]:
# Importing the dataset as a dataframe

import pandas as pd
import numpy as np

df = pd.read_csv('../dataset/spotify_most_streamed_songs.csv')

# Viewing first 50 lines of dataset to see what it looks like
print(df.head(25).to_string())

                                                                                       track_name                            artist(s)_name  artist_count  released_year  released_month  released_day  in_spotify_playlists  in_spotify_charts     streams  in_apple_playlists  in_apple_charts in_deezer_playlists  in_deezer_charts in_shazam_charts  bpm  key   mode  danceability_%  valence_%  energy_%  acousticness_%  instrumentalness_%  liveness_%  speechiness_%                                                         cover_url
0                                                             Seven (feat. Latto) (Explicit Ver.)                          Latto, Jung Kook             2           2023               7            14                   553                147   141381703                  43              263                  45                10              826  125    B  Major              80         89        83              31                   0           8              4             

In [2]:
# Reviewing high level stats of the dataset

print(df.describe().to_string())
print('\n')
print(df.info())

       artist_count  released_year  released_month  released_day  in_spotify_playlists  in_spotify_charts  in_apple_playlists  in_apple_charts  in_deezer_charts         bpm  danceability_%   valence_%    energy_%  acousticness_%  instrumentalness_%  liveness_%  speechiness_%
count    953.000000     953.000000      953.000000    953.000000            953.000000         953.000000          953.000000       953.000000        953.000000  953.000000       953.00000  953.000000  953.000000      953.000000          953.000000  953.000000     953.000000
mean       1.556139    2018.238195        6.033578     13.930745           5200.124869          12.009444           67.812172        51.908709          2.666317  122.540399        66.96957   51.431270   64.279119       27.057712            1.581322   18.213012      10.131165
std        0.893044      11.116218        3.566435      9.201949           7897.608990          19.575992           86.441493        50.630241          6.035599   28.057802

**Dataset Notes:** 
- Data is at the song level
- Track details are included (release year, month, day) along with playlist/chart prevalance
- Song details include key, major or minor, and song characteristics ranked on a scale of 0% to 100%
- 953 unique songs
- The dataset is fairly clean with no non-null values in any column though there may be individual cell level nuances

In [None]:
# Who are the top 10 artists that appear in the most streamed dataset? How do their song characteristics look compared to the rest of the dataset?

top_10 = df.value_counts('artist(s)_name').head(10)
top_10_mask = df['artist(s)_name'].isin(top_10.index)
remaining_mask = ~top_10_mask

print(top_10)
print('\n')
print('Top 10')
print(df[top_10_mask][['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']].mean())
print('\n')
print('Remaining')
print(df[remaining_mask][['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']].mean())

artist(s)_name
Taylor Swift        34
The Weeknd          22
SZA                 19
Bad Bunny           19
Harry Styles        17
Kendrick Lamar      12
Morgan Wallen       11
Ed Sheeran           9
Drake, 21 Savage     8
BTS                  8
Name: count, dtype: int64


Top 10
bpm                   123.056604
danceability_%         63.220126
valence_%              46.698113
energy_%               60.893082
acousticness_%         28.949686
instrumentalness_%      1.150943
liveness_%             17.911950
speechiness_%           9.352201
dtype: float64


AttributeError: 'NoneType' object has no attribute 'round'

**Top 10 Artists:** 
- Taylor Swift is the clear leader in most streamed songs showing up 34 times in the dataset
- All of the top 10 artists show up in the dataset multiple times, suggesting streaming success across multiple releases
- Looking at the average song characteristics between the Top 10 artists and the remaining dataset, there isn't a clear feature that separates the group at the moment
    - This may be down to different genres being averaged together or the Remaining group having a larger selection (and potentially variety) of songs

In [4]:
# How do top streamed songs trend over time?

df.groupby('released_year')['track_name'].count().sort_values(ascending=False)

released_year
2022    402
2023    175
2021    119
2020     37
2019     36
2017     23
2016     18
2013     13
2014     13
2015     11
2018     10
2011     10
2012     10
2010      7
2002      6
1999      5
2004      4
2000      4
1984      4
1963      3
1958      3
1995      2
2003      2
2008      2
1970      2
1957      2
1986      2
1985      2
1982      2
1959      2
1975      2
1991      2
2007      1
1930      1
2005      1
1998      1
1997      1
1942      1
1994      1
1992      1
1987      1
1983      1
1979      1
1973      1
1971      1
1968      1
1952      1
1950      1
1946      1
1996      1
Name: track_name, dtype: int64

- There is a higher concentration of top streaming sings in the 2000s, which might make sense given that music streaming apps have become more relevant over the last decade. This isn't to say songs released prior to the 2000s cannot be streamed, but 

In [None]:
# What key has the most songs written in? What are the characteristics of the songs in this group and how does it compare to the remainder of the dataset?

top_key = df.groupby('key')['track_name'].count().sort_values(ascending=False).head(1)
top_key_mask = df['key'].isin(top_key.index)
remainder_mask = ~df['key'].isin(top_key.index)

print(top_key)

print('\n')

top = df[top_key_mask][['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']].mean()
print(top.round(2))

print('\n')

remaining = df[remainder_mask][['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']].mean()
print(remaining.round(2))

key
C#    120
Name: track_name, dtype: int64


bpm                   122.34
danceability_%         68.64
valence_%              49.79
energy_%               66.55
acousticness_%         21.07
instrumentalness_%      1.24
liveness_%             18.24
speechiness_%          12.07
dtype: float64


bpm                   122.57
danceability_%         66.73
valence_%              51.67
energy_%               63.95
acousticness_%         27.92
instrumentalness_%      1.63
liveness_%             18.21
speechiness_%           9.85
dtype: float64


**Song Key Details:**
- The most popular key for songs that were streamed was C#
- Though not significantly different, songs in C# were slightly more energetic and more speechy. Acousticness and instrumentalness were also lower, suggesting listener preference for a more natural sound

In [None]:
df['streams'] = pd.to_numeric(df['streams'], 'coerce')
streams_by_charts = df.groupby(['in_spotify_charts'])[['streams']].mean().sort_values(by='streams', ascending=False)
(streams_by_charts/1000000).round(2).head(25)


Unnamed: 0_level_0,streams
in_spotify_charts,Unnamed: 1_level_1
79,2565.53
63,2557.98
130,2513.19
69,1890.33
61,1762.17
62,1592.91
52,1514.13
53,1469.56
78,1415.34
43,1352.81


**Streams vs. Charts:**
- At the moment, there isn't a clear correlation with being in more charts resulting in more streams
- There are certainly instances where the two trend together but there are also songs that are on many charts but not ranked higher in average streams