## Introduction

Construire un pipeline ETL complet pour récupérer les données de l'API Spotify, les stocker dans PostgreSQL et produire des visualisations pour analyser la popularité des artistes et des titres. 

Spotify est une plateforme de streaming musical et de podcasts proposant de nombreux artistes, leurs albums et titres respectifs. Chaque artiste et titre possède une cote de popularité, et cet exercice est centré sur l'analyse des facteurs qui influencent cette mesure.

## Installation et importation des librairies

In [138]:
#import libraries
import requests
import numpy as np
import pandas as pd
import psycopg2 as ps
import datetime
import base64
import time
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine

## Authentification à l’API Spotify

In [139]:
def authentication():
    
    ############ CLIENT ID AND CLIENT SECRET ################
    
    client_id = 'f78774ce7ff5423d9e6d163e4d6519fc'
    client_secret = '7f5d790f3c3b4c1b81e3ee5eea035138'
    client_creds = f"{client_id}:{client_secret}"
    client_creds_b64 = base64.b64encode(client_creds.encode())
    
    ############## API CALL PARAMETERS ######################
    
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_data = {
        'grant_type': 'client_credentials'
    }
    auth_headers = {
        'Authorization': f'Basic {client_creds_b64.decode()}'
    }
    
    ############ API CALL ###################################
    
    r = requests.post(auth_url, data=auth_data, headers=auth_headers)
    request_status = r.status_code in range(200,299)
    
    ########################################################
    
    if request_status == True:
        auth_response = r.json()
        now = datetime.datetime.now()
        access_token = auth_response['access_token']
        expires_in = auth_response['expires_in']
        expires = now + datetime.timedelta(seconds=expires_in)
        
    return access_token, expires

In [140]:
token, expires = authentication()
print(token)
print(expires)

BQDHj3iDTSefNtzlRBx6rIHlvlejODJenq7V_yeWD9bTUTcD2yPzpBp9bZMHcVfh0Jf2vB8-PzF6u58SY03seK7SUyjD-0nSPGHbfQZeQoQWKp0AqXfRolr42QBEYL_Jf7hu5a99dLY
2025-04-19 04:11:06.711475


In [141]:
artist_list=['Youssou Ndour', 'Baba Maal', 'Orchestra Baobab', 'Ismaila Lo', 'Cheikh Lo', 'Viviane Chidid', 'Waly Seck', ]
len(artist_list)

7

In [142]:
artists = artist_list

In [143]:
class Artist_breakdown():
    
    def __init__(self, artist_name, token):
        self.artist_name = artist_name
        self.token = token

        self.genre = None
        self.artist_id = None
        self.artist_popularity = None
        self.album_data = []
        
        self.extract_artist_data()
        self.extract_albums_tracks()

    def headers(self):
        return {'Authorization': f'Bearer {self.token}'}

    def extract_artist_data(self):
        url = 'https://api.spotify.com/v1/search'
        params = {'q': self.artist_name, 'type':'artist'}
        response = requests.get(url, headers=self.headers(), params=params).json()
        items = response.get('artists', {}).get('items', [])
        if not items:
            raise ValueError("Artiste non trouvé.")

        first_item = items[0]
        self.genre = first_item['genres'][0] if first_item['genres'] else "Inconnu"
        self.artist_id = first_item['id']
        self.artist_name = first_item['name']
        self.artist_popularity = first_item['popularity']

    def extract_albums_tracks(self):
        url = f'https://api.spotify.com/v1/artists/{self.artist_id}/albums'
        response = requests.get(url, headers=self.headers()).json()

        seen_albums = set()
        for album in response['items']:
            if album['name'] not in seen_albums:
                seen_albums.add(album['name'])
                album_id = album['id']
                album_tracks = self.extract_tracks(album_id)

                self.album_data.extend([{
                    'artist_name': self.artist_name,
                    'artist_id': self.artist_id,
                    'artist_popularity': self.artist_popularity,
                    'genre': self.genre,
                    'album_name': album['name'],
                    'album_id': album_id,
                    'album_type': album['type'],
                    'total_tracks': album['total_tracks'],
                    'release_date': album['release_date'],
                    'available_markets': len(album['available_markets']),
                    'track_title': track['name'],
                    'track_id': track['id'],
                    'duration_ms': track['duration_ms'],
                    'track_popularity': self.extract_track_popularity(track['id'])
                } for track in album_tracks])
                time.sleep(0.2)

    def extract_tracks(self, album_id):
        url = f'https://api.spotify.com/v1/albums/{album_id}/tracks'
        response = requests.get(url, headers=self.headers()).json()
        return response['items']

    def extract_track_popularity(self, track_id):
        url = f'https://api.spotify.com/v1/tracks/{track_id}'
        response = requests.get(url, headers=self.headers()).json()
        return response.get('popularity', 0)

    def to_dataframe(self):
        return pd.DataFrame(self.album_data)


In [144]:
artist_list = ['Youssou Ndour', 'Baba Maal', 'Orchestra Baobab', 'Ismaila Lo', 'Cheikh Lo', 'Viviane Chidid', 'Waly Seck',
]

all_dataframes = []

for artist in artist_list:
    try:
        artist_analysis = Artist_breakdown(artist, token)
        df = artist_analysis.to_dataframe()
        all_dataframes.append(df)
        print(f"✅ Données OK pour : {artist}")
    except Exception as e:
        print(f"❌ Erreur pour {artist} : {e}")

# Fusion finale
df_spotify = pd.concat(all_dataframes, ignore_index=True)

✅ Données OK pour : Youssou Ndour
✅ Données OK pour : Baba Maal
✅ Données OK pour : Orchestra Baobab
✅ Données OK pour : Ismaila Lo
✅ Données OK pour : Cheikh Lo
✅ Données OK pour : Viviane Chidid
✅ Données OK pour : Waly Seck


In [145]:
df_spotify

Unnamed: 0,artist_name,artist_id,artist_popularity,genre,album_name,album_id,album_type,total_tracks,release_date,available_markets,track_title,track_id,duration_ms,track_popularity
0,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,55,afropop,Eclairer le monde - Light the World,7M2i3utcB8cUT9PIDI6Px1,album,12,2025-04-04,185,Tout pour briller,2T3LcFy7lWHRohhbG4cJnY,210957,33
1,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,55,afropop,Eclairer le monde - Light the World,7M2i3utcB8cUT9PIDI6Px1,album,12,2025-04-04,185,Sa ma habiibi,5PWUVKzUVx49MpQwv6ngJH,215640,31
2,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,55,afropop,Eclairer le monde - Light the World,7M2i3utcB8cUT9PIDI6Px1,album,12,2025-04-04,185,Tell Me What You Want,7CDQD8mNhMdvxHuwfE1gbW,269788,36
3,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,55,afropop,Eclairer le monde - Light the World,7M2i3utcB8cUT9PIDI6Px1,album,12,2025-04-04,185,Bul ma laaj,3IXaHlYXwpr89sa0cqGaum,191903,32
4,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,55,afropop,Eclairer le monde - Light the World,7M2i3utcB8cUT9PIDI6Px1,album,12,2025-04-04,185,Noflaay,6e04ppXc55WfiqmCgdCsO8,208998,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922,Waly Seck,7MZV4o3PNnjhTxmIYDReFm,2,Inconnu,Màndu (Remixes),3uZ2YQTRsA9ffpi54zwn2i,album,13,2016-01-17,185,Nawle - Remix,0kGRMWij5QgvknKrKiItll,395316,1
923,Waly Seck,7MZV4o3PNnjhTxmIYDReFm,2,Inconnu,Màndu (Remixes),3uZ2YQTRsA9ffpi54zwn2i,album,13,2016-01-17,185,Arwatam - Remix,1DvjoDePPnVTPpyLCaDPVy,408994,3
924,Waly Seck,7MZV4o3PNnjhTxmIYDReFm,2,Inconnu,Màndu (Remixes),3uZ2YQTRsA9ffpi54zwn2i,album,13,2016-01-17,185,Aqq - Remix,1o21ls4lTpEwIQXnbBnxwD,426318,1
925,Waly Seck,7MZV4o3PNnjhTxmIYDReFm,2,Inconnu,Màndu (Remixes),3uZ2YQTRsA9ffpi54zwn2i,album,13,2016-01-17,185,Confiance - Remix,52MncQDmq92MwBZYSQyO93,524892,3


In [146]:
dat = df_spotify.copy()

In [147]:
dat.dtypes

artist_name          object
artist_id            object
artist_popularity     int64
genre                object
album_name           object
album_id             object
album_type           object
total_tracks          int64
release_date         object
available_markets     int64
track_title          object
track_id             object
duration_ms           int64
track_popularity      int64
dtype: object

In [148]:
# Extraction  d'un sous-DataFrame nommé artist à partir du DataFrame dat
artist = dat[['artist_name', 'artist_id', 'genre', 'artist_popularity']]
artist = artist.reindex(columns=['artist_id', 'artist_name', 'genre', 'artist_popularity'])
artist.drop_duplicates(subset=['artist_id'], inplace=True)
artist.reset_index(drop=True, inplace=True)
artist.head()

Unnamed: 0,artist_id,artist_name,genre,artist_popularity
0,77zlytAFjPFjUKda8TNIDY,Youssou N'Dour,afropop,55
1,57jz8z4r0s4eIDrS1r3dD9,Baba Maal,Inconnu,3
2,7xT0arvCDupDU1YYy0BNv9,Orchestra Baobab,afropop,44
3,4EBRPnSYtrfmwptEMOSCV8,Ismaël Lô,afropop,42
4,6CFWXwqEBUi0UFoIIxmg9h,Cheikh Lô,afropop,41


In [149]:
albums = dat[['artist_id','album_name', 'release_date', 'album_id', 
             'total_tracks', 'available_markets', 'album_type']]
albums = albums.reindex(columns=['album_id', 'album_name', 'artist_id', 
            'release_date', 'total_tracks', 'available_markets', 'album_type'])
albums.drop_duplicates(subset=['album_id'], inplace=True)
albums.reset_index(drop=True, inplace=True)
albums.head()

Unnamed: 0,album_id,album_name,artist_id,release_date,total_tracks,available_markets,album_type
0,7M2i3utcB8cUT9PIDI6Px1,Eclairer le monde - Light the World,77zlytAFjPFjUKda8TNIDY,2025-04-04,12,185,album
1,4SkTPpvkyHKBdOcuPws0op,MBALAX,77zlytAFjPFjUKda8TNIDY,2021-11-12,12,184,album
2,3JcnB71P2ePmk4f2bMwzZv,History,77zlytAFjPFjUKda8TNIDY,2019-04-26,10,172,album
3,4Vu6er2mHxpHoJrgZNhOdc,"Le Grand Bal 2017, Vol. 2 (Live)",77zlytAFjPFjUKda8TNIDY,2018-01-03,7,185,album
4,4l8gdczMyeRU4BHIiwCPvw,"Le Grand Bal 2017, Vol. 1 (Live)",77zlytAFjPFjUKda8TNIDY,2018-01-03,9,185,album


In [150]:
tracks = dat[['track_id', 'track_title', 'duration_ms', 'album_id', 'track_popularity']]
tracks = tracks.reindex(columns=['track_id', 'track_title', 'album_id', 'duration_ms', 'track_popularity'])
tracks.drop_duplicates(subset=['track_id'], inplace=True)
tracks.reset_index(drop=True, inplace=True)
tracks.head()

Unnamed: 0,track_id,track_title,album_id,duration_ms,track_popularity
0,2T3LcFy7lWHRohhbG4cJnY,Tout pour briller,7M2i3utcB8cUT9PIDI6Px1,210957,33
1,5PWUVKzUVx49MpQwv6ngJH,Sa ma habiibi,7M2i3utcB8cUT9PIDI6Px1,215640,31
2,7CDQD8mNhMdvxHuwfE1gbW,Tell Me What You Want,7M2i3utcB8cUT9PIDI6Px1,269788,36
3,3IXaHlYXwpr89sa0cqGaum,Bul ma laaj,7M2i3utcB8cUT9PIDI6Px1,191903,32
4,6e04ppXc55WfiqmCgdCsO8,Noflaay,7M2i3utcB8cUT9PIDI6Px1,208998,29


In [158]:
host_name = 'localhost'
dbname = 'spotify_data_db'
port = '5432'
username = 'papes' 
password = '@Ndao20161696'
conn = None

In [159]:
def connect_to_db(host_name, dbname, port, username, password):
    try:
        conn = ps.connect(host=host_name, database=dbname, user=username, password=password, port=port)

    except ps.OperationalError as e:
        raise e
    else:
        print('Connected!')
        return conn