In [271]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
from dash import Dash, html, dcc
from dash.dependencies import Input, Output
from plotly.offline import iplot , plot
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [272]:

# Read the Spotify-2000 track table into `df`
spotify_2000_csv = 'dataset/Spotify-2000.csv'
df = pd.read_csv(spotify_2000_csv)


In [273]:
# Quarterly user figures (one row per quarter)
quarterly_csv = 'dataset/spotify_archive/Spotify Quarterly.csv'
user_data = pd.read_csv(quarterly_csv)

In [274]:
# Preview the first five rows
user_data.head(n=5)

Unnamed: 0,Date,Monthly Active User (MAU),Premium Users
0,Q1 2015,68,18
1,Q2 2015,77,22
2,Q3 2015,82,24
3,Q4 2015,91,28
4,Q1 2016,96,30


In [275]:
# Line chart: both user series plotted against the quarter label
user_series = ['Monthly Active User (MAU)', 'Premium Users']
fig = px.line(
    user_data,
    x='Date',
    y=user_series,
    title='Spotify MAUs and Premium Users Over Quarters',
)
axis_titles = dict(xaxis_title='Quarter', yaxis_title='Millions')
fig.update_layout(**axis_titles)
fig.show()

In [276]:
# Grouped bar chart: the two user series side by side for each quarter
bar_series = ['Monthly Active User (MAU)', 'Premium Users']
fig = px.bar(
    user_data,
    x='Date',
    y=bar_series,
    barmode='group',
    title='Spotify MAUs and Premium Users Over Quarters',
)
bar_axis_titles = dict(xaxis_title='Quarter', yaxis_title='Millions')
fig.update_layout(**bar_axis_titles)
fig.show()

In [277]:
# Yearly user counts broken down by region
world_users_csv = 'dataset/spotify_archive/World Users.csv'
world_user = pd.read_csv(world_users_csv)

In [278]:
# Display the regional table in full
world_user

Unnamed: 0,Year,Europe,North America,Latin America,Rest of World
0,2016,40,36,20,8
1,2017,51,46,28,13
2,2018,66,56,38,20
3,2019,83,65,49,35
4,2020,102,77,66,54
5,2021,124,88,80,73
6,2022,138,97,95,103
7,2023,159,111,116,165


In [279]:
# Stacked area chart of users per region, one layer per region column
region_columns = ['Europe', 'North America', 'Latin America', 'Rest of World']
fig = px.area(
    world_user,
    x='Year',
    y=region_columns,
    title='Spotify Users by Region Over Years',
    labels={'value': 'Users (millions)', 'variable': 'Region'},
    facet_col_wrap=1,
)

# Axis titles are set explicitly after the figure is built
fig.update_layout(xaxis_title='Year', yaxis_title='Number of Users (millions)')
fig.show()


In [280]:

# Spotify-2000 track table; adjust the path below to your dataset location
spotify_2k_csv = 'dataset/spotify_archive/Spotify-2000.csv'
spotify_2k = pd.read_csv(spotify_2k_csv)


In [281]:
# Preview the first five tracks
spotify_2k.head(n=5)

Unnamed: 0,Index,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,1,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,2,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39
2,3,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69
3,4,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76
4,5,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59


In [282]:
# Column dtypes and non-null counts; note 'Length (Duration)' is read as object, not int
spotify_2k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Index                   1994 non-null   int64 
 1   Title                   1994 non-null   object
 2   Artist                  1994 non-null   object
 3   Top Genre               1994 non-null   object
 4   Year                    1994 non-null   int64 
 5   Beats Per Minute (BPM)  1994 non-null   int64 
 6   Energy                  1994 non-null   int64 
 7   Danceability            1994 non-null   int64 
 8   Loudness (dB)           1994 non-null   int64 
 9   Liveness                1994 non-null   int64 
 10  Valence                 1994 non-null   int64 
 11  Length (Duration)       1994 non-null   object
 12  Acousticness            1994 non-null   int64 
 13  Speechiness             1994 non-null   int64 
 14  Popularity              1994 non-null   int64 
dtypes: i

In [283]:
# Summary statistics of the numeric columns, shown with one row per column
numerical_stats = spotify_2k.describe()

numerical_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Index,1994.0,997.5,575.762538,1.0,499.25,997.5,1495.75,1994.0
Year,1994.0,1992.992979,16.116048,1956.0,1979.0,1993.0,2007.0,2019.0
Beats Per Minute (BPM),1994.0,120.215647,28.028096,37.0,99.0,119.0,136.0,206.0
Energy,1994.0,59.679539,22.154322,3.0,42.0,61.0,78.0,100.0
Danceability,1994.0,53.238215,15.351507,10.0,43.0,53.0,64.0,96.0
Loudness (dB),1994.0,-9.008526,3.647876,-27.0,-11.0,-8.0,-6.0,-2.0
Liveness,1994.0,19.012036,16.727378,2.0,9.0,12.0,23.0,99.0
Valence,1994.0,49.408726,24.858212,3.0,29.0,47.0,69.75,99.0
Acousticness,1994.0,28.858074,29.011986,0.0,3.0,18.0,50.0,99.0
Speechiness,1994.0,4.994985,4.401566,2.0,3.0,4.0,5.0,55.0


In [284]:
# Ten artists with the most tracks in the dataset
artist_track_counts = spotify_2k['Artist'].value_counts()
top_artists = artist_track_counts.iloc[:10]

top_artists

Artist
Queen                 37
The Beatles           36
Coldplay              27
U2                    26
The Rolling Stones    24
Michael Jackson       23
Bruce Springsteen     23
ABBA                  22
David Bowie           21
Fleetwood Mac         18
Name: count, dtype: int64

In [285]:
# Tabulate the top-artist counts with explicit 'Artist' / 'Count' columns
# (built in one step instead of renaming columns in place afterwards)
top_artists_df = pd.DataFrame({
    'Artist': top_artists.index,
    'Count': top_artists.to_numpy(),
})

# Horizontal bar chart; both the bar colour and the text label encode the count
fig = px.bar(top_artists_df, x='Count', y='Artist', text='Count', orientation='h',
             title="2000's Top 10 Artists by Count",
             labels={'Count': 'Count', 'Artist': 'Artist'},
             color='Count',  # Color the bars by the count
             color_continuous_scale='Agsunset')

# Improve layout
fig.update_layout(
    xaxis_title="Count",
    yaxis_title="",
    coloraxis_showscale=True,  # Keep the colour bar visible (set to False to hide it)
    yaxis={'categoryorder': 'total ascending'},  # Ensure the highest values are on top
    template='plotly_dark'
)

# Print each count just outside the end of its bar
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.show()

In [286]:
# Numeric track attributes used for the correlation analysis
characteristics = [
    'Danceability',
    'Energy',
    'Loudness (dB)',
    'Speechiness',
    'Acousticness',
    'Popularity',
    'Liveness',
    'Valence',
    'Beats Per Minute (BPM)',
]


In [287]:
# Pairwise correlations (pandas default: Pearson) between the selected track attributes.
# Computed on `spotify_2k` -- the frame the other Spotify-2000 cells explore and cluster --
# rather than on the separately loaded `df`, so all results describe the same table.
correlation_matrix = spotify_2k[characteristics].corr()

correlation_matrix

Unnamed: 0,Danceability,Energy,Loudness (dB),Speechiness,Acousticness,Popularity,Liveness,Valence,Beats Per Minute (BPM)
Danceability,1.0,0.139616,0.044235,0.125229,-0.135769,0.144344,-0.103063,0.514564,-0.140602
Energy,0.139616,1.0,0.735711,0.205865,-0.665156,0.103393,0.174118,0.405175,0.156644
Loudness (dB),0.044235,0.735711,1.0,0.12509,-0.451635,0.165527,0.098257,0.147041,0.092927
Speechiness,0.125229,0.205865,0.12509,1.0,-0.098256,0.111689,0.092594,0.107102,0.085598
Acousticness,-0.135769,-0.665156,-0.451635,-0.098256,1.0,-0.087604,-0.046206,-0.239729,-0.122472
Popularity,0.144344,0.103393,0.165527,0.111689,-0.087604,1.0,-0.111978,0.095911,-0.003181
Liveness,-0.103063,0.174118,0.098257,0.092594,-0.046206,-0.111978,1.0,0.050667,0.016256
Valence,0.514564,0.405175,0.147041,0.107102,-0.239729,0.095911,0.050667,1.0,0.059653
Beats Per Minute (BPM),-0.140602,0.156644,0.092927,0.085598,-0.122472,-0.003181,0.016256,0.059653,1.0


In [288]:
# Cell annotations: every coefficient rendered with two decimals
text_labels = [[f'{coef:.2f}' for coef in coef_row] for coef_row in correlation_matrix.values]

feature_names = correlation_matrix.columns
# Explicit tick positions 0..n-1, labelled with the feature names (shared by both axes)
feature_axis = dict(
    tickmode='array',
    tickvals=np.arange(len(feature_names)),
    ticktext=feature_names,
)

# Annotated heatmap on a red/blue diverging scale centred at zero
heatmap_trace = go.Heatmap(
    z=correlation_matrix,
    x=feature_names,
    y=feature_names,
    colorscale='RdBu_r',
    zmid=0,
    text=text_labels,
    texttemplate="%{text}",
    hoverinfo="text",
)
fig = go.Figure(data=heatmap_trace)

fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    xaxis=feature_axis,
    yaxis=feature_axis,
    autosize=True,
    margin=dict(l=10, r=10, b=10, t=50),  # tight margins keep the figure compact
)

fig.show()

In [289]:
# Spotify 2023 track table; the file is read with latin1 encoding
spotify_2023_csv = 'dataset/spotify_archive/spotify-2023.csv'
spotify_2k23 = pd.read_csv(spotify_2023_csv, encoding='latin1')

In [290]:
# Preview the first five tracks
spotify_2k23.head(n=5)


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [291]:
# Column dtypes and non-null counts; 'streams' is read as object, and some columns have missing values
spotify_2k23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [292]:
# Number of distinct values in the artist(s) field
n_artists = spotify_2k23['artist(s)_name'].nunique()
print(f"number of artists '{n_artists}'")

number of artists '645'


In [293]:
# Tracks per artist(s) entry, most frequent first
artist_column = spotify_2k23['artist(s)_name']
artist_counts = artist_column.value_counts()
artist_counts

artist(s)_name
Taylor Swift                          34
The Weeknd                            22
Bad Bunny                             19
SZA                                   19
Harry Styles                          17
                                      ..
Gorillaz, Bad Bunny                    1
j-hope, J. Cole                        1
Robin Schulz, Oliver Tree              1
Don Toliver, Future, Justin Bieber     1
The Walters                            1
Name: count, Length: 645, dtype: int64

In [294]:
# Discrete palette of 15 hex colours (one per bar in the top-15 artist chart).
# Every entry is a clean "#RRGGBB" string: stray trailing spaces inside the
# literals were removed so the values are valid colour codes as written.
colors = ["#8c0404", "#f25ed0", "#000000", "#16A085", "#34495E",
          "#21618C", "#512E5F", "#45B39D", "#AAB7B8", "#20B2AA",
          "#FF69B4", "#00CED1", "#FF7F50", "#7FFF00", "#DA70D6"]

In [295]:
# Top 15 artist(s) entries by number of tracks, as an explicit two-column frame.
# Naming the columns here means the axis labels no longer depend on how pandas
# names the value_counts() index (the old labels={'index': ...} mapping did not
# match the named index, so the intended x label was never applied).
top15_artists = artist_counts.head(15)
top15_artists_df = pd.DataFrame({
    'Artist': top15_artists.index,
    'Count': top15_artists.to_numpy(),
})

# One discrete colour per artist; text_auto prints each count on its bar.
# The figure is the cell's last expression, so the notebook renders it directly.
px.bar(top15_artists_df,
       x='Artist',
       y='Count',
       text_auto=True,
       color='Artist',
       color_discrete_sequence=colors,
       title='2023 Top 15 Artists with Most Songs')

In [296]:
# List every column name of the 2023 table
spotify_2k23.columns

Index(['track_name', 'artist(s)_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'in_spotify_playlists',
       'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts',
       'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm',
       'key', 'mode', 'danceability_%', 'valence_%', 'energy_%',
       'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%'],
      dtype='object')

In [297]:
# Numeric columns of the 2023 table to correlate; this rebinds `characteristics`
# and `correlation_matrix` to the 2023 data
characteristics = [
    'in_spotify_playlists',
    'danceability_%',
    'energy_%',
    'instrumentalness_%',
    'speechiness_%',
    'acousticness_%',
    'liveness_%',
    'valence_%',
    'bpm',
]
correlation_matrix = spotify_2k23.loc[:, characteristics].corr()


In [298]:

# Cell annotations: every coefficient rendered with two decimals
text_labels = [[f'{coef:.2f}' for coef in coef_row] for coef_row in correlation_matrix.values]

feature_names = correlation_matrix.columns
# Explicit tick positions 0..n-1, labelled with the feature names (shared by both axes)
feature_axis = dict(
    tickmode='array',
    tickvals=np.arange(len(feature_names)),
    ticktext=feature_names,
)

# Annotated heatmap on a red/blue diverging scale centred at zero
heatmap_trace = go.Heatmap(
    z=correlation_matrix,
    x=feature_names,
    y=feature_names,
    colorscale='RdBu_r',
    zmid=0,
    text=text_labels,
    texttemplate="%{text}",
    hoverinfo="text",
)
fig = go.Figure(data=heatmap_trace)

fig.update_layout(
    title='Spotify 2023-Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    xaxis=feature_axis,
    yaxis=feature_axis,
    autosize=True,
    margin=dict(l=10, r=10, b=10, t=50),  # tight margins keep the figure compact
)

fig.show()


In [299]:
# Convert 'Length (Duration)' from text to integers.
# The column is read as object dtype; commas (presumably thousands separators) are
# stripped before parsing. Values look like seconds (e.g. 201, 341 in the preview).
#
# pd.to_timedelta must NOT be used for this: it reads a unit-less string such as
# "201" as 201 nanoseconds, so .dt.total_seconds().astype(int) yields 0 for every row.
# pd.to_numeric parses the digits as the number they are.
#
# astype(str) first keeps the cell re-runnable: on a second run the column is
# already integer and has no .str accessor.
duration_text = (
    spotify_2k['Length (Duration)']
    .astype(str)
    .str.replace(',', '', regex=False)
)
spotify_2k['Length (Duration)'] = pd.to_numeric(duration_text).astype(int)

In [300]:

# Audio attributes fed to the clustering step
feature_columns = [
    'Beats Per Minute (BPM)',
    'Energy',
    'Danceability',
    'Loudness (dB)',
    'Valence',
    'Acousticness',
    'Speechiness',
]
features = spotify_2k[feature_columns]

# Standardise each attribute so no single scale dominates the distance metric
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [301]:

# K-means clustering on the standardised features
k = 5  # Number of clusters

# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it unset makes the cluster assignment depend on the installed version
# (and triggers a FutureWarning on the releases in between).
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)

# Store each track's cluster id alongside its attributes
spotify_2k['Cluster'] = kmeans.fit_predict(features_scaled)






In [302]:
# Project the standardised features onto their first two principal components
pca = PCA(n_components=2)
features_reduced = pca.fit_transform(features_scaled)

# Create a DataFrame for the plot
plot_df = pd.DataFrame(features_reduced, columns=['PC1', 'PC2'])

# Cluster ids are labels, not magnitudes. Storing them as strings makes Plotly
# Express treat them as categories (one discrete colour + legend entry each)
# instead of drawing a continuous colour bar across a qualitative palette.
# Rows of features_reduced follow spotify_2k row order, so assign positionally.
plot_df['Cluster'] = spotify_2k['Cluster'].astype(str).to_numpy()
cluster_order = [str(c) for c in sorted(spotify_2k['Cluster'].unique())]

# Plotting
fig = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster',
                 title='Music Data Clustering (2D PCA)',
                 category_orders={'Cluster': cluster_order},  # legend sorted by cluster id
                 color_discrete_sequence=px.colors.qualitative.D3)
fig.show()


In [303]:
# Project the standardised features onto their first three principal components
pca_3d = PCA(n_components=3)
features_3d = pca_3d.fit_transform(features_scaled)

# Create a DataFrame for the plot
plot_df_3d = pd.DataFrame(features_3d, columns=['PC1', 'PC2', 'PC3'])

# Cluster ids are labels, not magnitudes. Storing them as strings makes Plotly
# Express treat them as categories (one discrete colour + legend entry each)
# instead of drawing a continuous colour bar across a qualitative palette.
# Rows of features_3d follow spotify_2k row order, so assign positionally.
plot_df_3d['Cluster'] = spotify_2k['Cluster'].astype(str).to_numpy()
cluster_order_3d = [str(c) for c in sorted(spotify_2k['Cluster'].unique())]

# Plotting 3D
fig_3d = px.scatter_3d(plot_df_3d, x='PC1', y='PC2', z='PC3', color='Cluster',
                       title='Music Data Clustering (3D PCA)',
                       category_orders={'Cluster': cluster_order_3d},  # legend sorted by cluster id
                       color_discrete_sequence=px.colors.qualitative.D3)
fig_3d.show()
