Spotify Music Analysis Script

This script analyzes music tracks from a Spotify playlist to extract various features
and characteristics, creating a comprehensive dataset for music analysis.

Requirements:
    - spotipy library (pip install spotipy)
    - pandas library
    - Valid Spotify Developer credentials

The script performs the following steps:
1. Authentication with Spotify API
2. Retrieval of playlist tracks
3. Extraction of track and artist features
4. Creation of a consolidated dataset

In [9]:
!pip install spotipy --quiet

In [10]:
import os
from typing import List, Dict, Any, Optional
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [11]:
def authenticate_spotify(client_id: str, client_secret: str) -> spotipy.Spotify:
    """
    Authenticate with Spotify API using client credentials.

    Args:
        client_id: Spotify API client ID
        client_secret: Spotify API client secret

    Returns:
        Authenticated Spotify client object

    Raises:
        spotipy.SpotifyException: If authentication fails
    """
    auth_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret
    )
    return spotipy.Spotify(client_credentials_manager=auth_manager)


In [12]:
def extract_playlist_uri(playlist_link: str) -> str:
    """
    Extract playlist URI from Spotify playlist link.

    Args:
        playlist_link: Full Spotify playlist URL

    Returns:
        Playlist URI (last part of the URL before any query parameters)
    """
    return playlist_link.split("/")[-1].split("?")[0]

In [13]:
def get_all_playlist_tracks(sp: spotipy.Spotify, playlist_uri: str) -> List[Dict]:
    """
    Extract all tracks from a playlist, handling pagination.

    Args:
        sp: Authenticated Spotify client
        playlist_uri: Playlist identifier

    Returns:
        List of all tracks in the playlist
    """
    tracks = []
    results = sp.playlist_tracks(playlist_uri)
    tracks.extend(results['items'])

    # Handle pagination
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])

    print(f"Total tracks retrieved: {len(tracks)}")
    return tracks

In [14]:
def process_tracks_in_batches(sp: spotipy.Spotify, tracks: List[Dict], batch_size: int = 50) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process tracks in batches to avoid API rate limits.

    Args:
        sp: Authenticated Spotify client
        tracks: List of track information
        batch_size: Number of tracks to process in each batch

    Returns:
        Tuple of two DataFrames:
        - Audio features DataFrame
        - Track metadata DataFrame
    """
    track_data = {
        "track_uri": [],
        "track_name": [],
        "artist_name": [],
        "artist_popularity": [],
        "artist_genres": [],
        "album_name": [],
        "track_popularity": []
    }

    audio_features_list = []
    total_tracks = len(tracks)

    for i in range(0, total_tracks, batch_size):
        batch = tracks[i:i + batch_size]
        batch_uris = []

        print(f"\nProcessing batch {i//batch_size + 1} of {(total_tracks + batch_size - 1)//batch_size}")

        # Extract basic track and artist information
        for track in batch:
            if track['track'] is None:  # Skip any None/null tracks
                continue

            track_info = track['track']
            batch_uris.append(track_info['uri'])

            # Get artist information (with rate limit consideration)
            try:
                artist_info = sp.artist(track_info['artists'][0]['uri'])

                track_data["track_uri"].append(track_info["uri"])
                track_data["track_name"].append(track_info["name"])
                track_data["artist_name"].append(track_info["artists"][0]["name"])
                track_data["artist_popularity"].append(artist_info["popularity"])
                track_data["artist_genres"].append(artist_info["genres"])
                track_data["album_name"].append(track_info["album"]["name"])
                track_data["track_popularity"].append(track_info["popularity"])
            except Exception as e:
                print(f"Error processing track {track_info.get('name', 'unknown')}: {str(e)}")
                continue

        # Get audio features for the batch
        try:
            batch_features = sp.audio_features(batch_uris)
            audio_features_list.extend([f for f in batch_features if f is not None])
        except Exception as e:
            print(f"Error getting audio features for batch: {str(e)}")

    # Create DataFrames
    metadata_df = pd.DataFrame(track_data)
    audio_features_df = pd.DataFrame(audio_features_list)

    print(f"\nProcessed {len(metadata_df)} tracks successfully")
    print(f"Audio features retrieved for {len(audio_features_df)} tracks")

    return audio_features_df, metadata_df

In [15]:
def create_final_dataset(
    audio_features_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    output_path: str
) -> pd.DataFrame:
    """
    Combine audio features and metadata into a final dataset and save to CSV.

    Args:
        audio_features_df: DataFrame containing audio features
        metadata_df: DataFrame containing track metadata
        output_path: Path where the CSV file will be saved

    Returns:
        Combined DataFrame with all track information
    """
    # Merge DataFrames on track_uri
    final_df = pd.merge(
        audio_features_df,
        metadata_df,
        left_on='uri',
        right_on='track_uri',
        how='inner'
    )

    # Remove duplicate columns
    final_df = final_df.loc[:,~final_df.columns.duplicated()]

    # Save to CSV
    final_df.to_csv(output_path, index=False)

    print(f"\nFinal dataset shape: {final_df.shape}")
    print(f"Dataset saved to {output_path}")

    return final_df

In [16]:
# Configuration
CLIENT_ID = "your_client_id"  # Replace with your Spotify client ID
CLIENT_SECRET = "your_client_secret"  # Replace with your Spotify client secret
PLAYLIST_LINK = "your_playlist_link"  # Replace with your playlist link
OUTPUT_PATH = "music_analysis.csv"

# Initialize Spotify client
spotify_client = authenticate_spotify(CLIENT_ID, CLIENT_SECRET)

# Get playlist URI
playlist_uri = extract_playlist_uri(PLAYLIST_LINK)

# Get all tracks from playlist
all_tracks = get_all_playlist_tracks(spotify_client, playlist_uri)

# Process tracks in batches
audio_features_df, metadata_df = process_tracks_in_batches(spotify_client, all_tracks)

# Create and save final dataset
final_dataset = create_final_dataset(audio_features_df, metadata_df, OUTPUT_PATH)

Total tracks retrieved: 1296

Processing batch 1 of 26

Processing batch 2 of 26

Processing batch 3 of 26

Processing batch 4 of 26

Processing batch 5 of 26

Processing batch 6 of 26

Processing batch 7 of 26

Processing batch 8 of 26

Processing batch 9 of 26

Processing batch 10 of 26

Processing batch 11 of 26

Processing batch 12 of 26

Processing batch 13 of 26

Processing batch 14 of 26

Processing batch 15 of 26

Processing batch 16 of 26

Processing batch 17 of 26

Processing batch 18 of 26

Processing batch 19 of 26

Processing batch 20 of 26

Processing batch 21 of 26

Processing batch 22 of 26

Processing batch 23 of 26

Processing batch 24 of 26

Processing batch 25 of 26

Processing batch 26 of 26

Processed 1296 tracks successfully
Audio features retrieved for 1296 tracks

Final dataset shape: (1318, 25)
Dataset saved to music_analysis.csv
