In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Read the Spotify tracks CSV, discard every row containing a missing value,
# then drop rows that repeat the same (track, album, playlist) id triple.
data = (
    pd.read_csv('data/spotify_songs.csv')
    .dropna()
    .drop_duplicates(subset=['track_id', 'track_album_id', 'playlist_id'])
)


In [45]:
# Pearson correlation coefficients between the numeric columns.
# The numeric columns are selected explicitly: `data` still holds string
# columns (ids, names, genres), and since pandas 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently -- it raises on them instead.
correlation = data.select_dtypes(include=['number', 'bool']).corr()
display(correlation)





Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
track_popularity,1.0,0.063205,-0.100927,-0.001215,0.060652,0.011472,0.006616,0.080163,-0.144952,-0.053824,0.032627,-0.001184,-0.139874
danceability,0.063205,1.0,-0.087066,0.011085,0.024067,-0.057399,0.182579,-0.023502,-0.008674,-0.125154,0.331114,-0.185787,-0.096966
energy,-0.100927,-0.087066,1.0,0.008993,0.676399,-0.004457,-0.032831,-0.537727,0.028846,0.162127,0.15188,0.148974,0.011363
key,-0.001215,0.011085,0.008993,1.0,-9.5e-05,-0.173824,0.02371,0.005825,0.006896,0.002264,0.019577,-0.013402,0.016318
loudness,0.060652,0.024067,0.676399,-9.5e-05,1.0,-0.019497,0.010653,-0.360097,-0.152988,0.07842,0.053437,0.093983,-0.115616
mode,0.011472,-0.057399,-0.004457,-0.173824,-0.019497,1.0,-0.062482,0.007904,-0.006304,-0.00493,0.001738,0.015205,0.015893
speechiness,0.006616,0.182579,-0.032831,0.02371,0.010653,-0.062482,1.0,0.027149,-0.103488,0.055838,0.063381,0.04451,-0.091198
acousticness,0.080163,-0.023502,-0.537727,0.005825,-0.360097,0.007904,0.027149,1.0,-0.004112,-0.076542,-0.01593,-0.111733,-0.082011
instrumentalness,-0.144952,-0.008674,0.028846,0.006896,-0.152988,-0.006304,-0.103488,-0.004112,1.0,-0.005447,-0.173283,0.022098,0.058145
liveness,-0.053824,-0.125154,0.162127,0.002264,0.07842,-0.00493,0.055838,-0.076542,-0.005447,1.0,-0.021674,0.0203,0.00706


In [46]:
from plotly import express as px

# Heatmap of the correlation matrix, with the colour scale pinned to [-1, 1].
fig = px.imshow(correlation, zmin=-1, zmax=1)

# Square 800x800 figure; the colour bar gets explicit ticks every third of the
# range, set through the `coloraxis_colorbar` shorthand for the nested dicts.
fig.update_layout(
    width=800,
    height=800,
    coloraxis_colorbar=dict(
        tickvals=[-1, -0.66, -0.33, 0, 0.33, 0.66, 1],
        ticktext=['-1', '-0.66', '-0.33', '0', '0.33', '0.66', '1'],
    ),
)
fig.show()

In [47]:
# List the qualitative variables, i.e. the columns stored with the object dtype.
qualitative = data.select_dtypes(include='object')
display(qualitative.columns)

Index(['track_id', 'track_name', 'track_artist', 'track_album_id',
       'track_album_name', 'track_album_release_date', 'playlist_name',
       'playlist_id', 'playlist_genre', 'playlist_subgenre'],
      dtype='object')

In [48]:
# Keep only the playlist genre, the playlist sub-genre and the track popularity.
df2 = data.loc[:, ['playlist_genre', 'playlist_subgenre', 'track_popularity']]
display(df2)

Unnamed: 0,playlist_genre,playlist_subgenre,track_popularity
0,pop,dance pop,66
1,pop,dance pop,67
2,pop,dance pop,70
3,pop,dance pop,60
4,pop,dance pop,69
...,...,...,...
32828,edm,progressive electro house,42
32829,edm,progressive electro house,20
32830,edm,progressive electro house,14
32831,edm,progressive electro house,15


In [49]:
# Mean track popularity for each playlist genre and each playlist sub-genre.
# Selecting the column with a list keeps the result a one-column DataFrame.
dfgenre = df2.groupby('playlist_genre')[['track_popularity']].mean()
dfsubgenre = df2.groupby('playlist_subgenre')[['track_popularity']].mean()

# Show both tables from the most to the least popular group, with a dashed
# separator string between them.
display(dfgenre.sort_values('track_popularity', ascending=False))
display('-' * 50)
display(dfsubgenre.sort_values('track_popularity', ascending=False))

Unnamed: 0_level_0,track_popularity
playlist_genre,Unnamed: 1_level_1
pop,47.74487
latin,46.332805
rap,43.238029
rock,41.971845
r&b,40.480583
edm,34.56166


'--------------------------------------------------'

Unnamed: 0_level_0,track_popularity
playlist_subgenre,Unnamed: 1_level_1
post-teen pop,56.825509
permanent wave,54.000905
hip hop,53.773071
reggaeton,52.876874
dance pop,52.079353
hip pop,52.029891
latin pop,51.099842
urban contemporary,50.523843
trap,50.308288
pop edm,44.045785


In [52]:
# Bar chart of the mean popularity per playlist genre, most popular first.
# The genre labels are taken from the SAME sorted frame as the values
# (reset_index() turns the index into a 'playlist_genre' column). Passing the
# unsorted `dfgenre.index` as `x` next to the sorted frame pairs labels and
# values by position, which attaches each bar to the wrong genre.
fig = px.bar(dfgenre.sort_values(by='track_popularity', ascending=False).reset_index(),
             x='playlist_genre',
             y='track_popularity',
             title='Popularité moyenne des genres de playlist')
fig.update_layout(xaxis_title='genre musicaux', yaxis_title='Popularité moyenne', yaxis=dict(range=[0, 100]), width=600, height=400)

# Dashed red horizontal reference line spanning all bars (x runs from half a
# slot before the first bar to half a slot after the last one).
# NOTE: the level is the mean of the per-genre averages, i.e. not weighted by
# the number of tracks per genre, so it can differ from the mean over all tracks.
fig.add_shape(type='line',
              x0=-0.5, x1=len(dfgenre)-0.5,
              y0=dfgenre['track_popularity'].mean(), y1=dfgenre['track_popularity'].mean(),
              line=dict(color='Red', dash='dash'))

fig.show()


In [53]:
# Bar chart of the mean popularity per playlist sub-genre, most popular first.
# The sub-genre labels are taken from the SAME sorted frame as the values
# (reset_index() turns the index into a 'playlist_subgenre' column). Passing
# the unsorted `dfsubgenre.index` as `x` next to the sorted frame pairs labels
# and values by position, which attaches each bar to the wrong sub-genre.
fig = px.bar(dfsubgenre.sort_values(by='track_popularity', ascending=False).reset_index(),
             x='playlist_subgenre',
             y='track_popularity',
             title='Popularité moyenne des sous-genres de playlist')
fig.update_layout(xaxis_title='sous genres musicaux', yaxis_title='Popularité moyenne', yaxis=dict(range=[0, 100]), width=1000, height=400)

# Dashed red horizontal reference line spanning all bars (x runs from half a
# slot before the first bar to half a slot after the last one).
# NOTE: the level is the mean of the per-sub-genre averages, i.e. not weighted
# by the number of tracks per sub-genre, so it can differ from the mean over
# all tracks.
fig.add_shape(type='line',
              x0=-0.5, x1=len(dfsubgenre)-0.5,
              y0=dfsubgenre['track_popularity'].mean(), y1=dfsubgenre['track_popularity'].mean(),
              line=dict(color='Red', dash='dash'))
fig.show()