In [0]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
from datetime import datetime
import time
import pandas as pd
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

In [0]:
# Session-level settings applied before any write: the first two keys switch
# off success-marker files, the third selects the commit protocol class.
commit_settings = {
    "spark.databricks.io.directoryCommit.createSuccessFile": "false",
    "mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
    "spark.sql.sources.commitProtocolClass": "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol",
}
for conf_key, conf_value in commit_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# Mount the storage container at mount_point unless it is already mounted
mount_point = "/mnt/spotifydata"
mounts = dbutils.fs.mounts()
mount_exists = mount_point in [mounted.mountPoint for mounted in mounts]

if not mount_exists:
    # Service-principal credentials are read from the secret scope (Azure Key Vault)
    adls_client_id = dbutils.secrets.get(scope="secret-store", key="adls-spotify-client-id")
    adls_client_secret = dbutils.secrets.get(scope="secret-store", key="adls-spotify-client-secret")
    adls_tenant_id = dbutils.secrets.get(scope="secret-store", key="adls-spotify-tenant-id")

    # OAuth client-credentials settings handed to the mount as extra configs
    configs = {}
    configs["fs.azure.account.auth.type"] = "OAuth"
    configs["fs.azure.account.oauth.provider.type"] = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
    configs["fs.azure.account.oauth2.client.id"] = adls_client_id
    configs["fs.azure.account.oauth2.client.secret"] = adls_client_secret
    configs["fs.azure.account.oauth2.client.endpoint"] = f"https://login.microsoftonline.com/{adls_tenant_id}/oauth2/token"

    dbutils.fs.mount(
        source="abfss://spotifydata@spotifyproject888.dfs.core.windows.net",  # container@storage-account
        mount_point=mount_point,
        extra_configs=configs,
    )

In [0]:
# Read the Spotify API credentials from the secret scope (Azure Key Vault)
client_id = dbutils.secrets.get(scope="secret-store", key="spotify-client-id")
client_secret = dbutils.secrets.get(scope="secret-store", key="spotify-client-secret")

# Build the Spotify client from a client-credentials manager; requests_timeout is 30
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
    requests_timeout=30,
)

In [0]:
# Calendar year (local time) used to filter the Spotify track search
current_year = datetime.today().year

In [0]:
# Get tracks in the current year.
# The search is paged: SEARCH_PAGE_SIZE results per request, with offsets
# 0, 50, ... up to (but not including) SEARCH_MAX_RESULTS.
SEARCH_PAGE_SIZE = 50
SEARCH_MAX_RESULTS = 1000
TRACK_COLUMNS = ['artist_id', 'track_id', 'track_name', 'track_popularity', 'track_release_date']

track_list = []
for i in range(0, SEARCH_MAX_RESULTS, SEARCH_PAGE_SIZE):
    track_result = sp.search(q=f'year:{current_year}', type="track", limit=SEARCH_PAGE_SIZE, offset=i, market='US')
    page_items = track_result['tracks']['items']
    if not page_items:
        # An empty page means the results are exhausted; stop instead of
        # issuing further requests that would also come back empty.
        break
    for row in page_items:
        # Skip missing entries and tracks without any artist rather than crashing
        if row is None or not row['artists']:
            continue
        track_list.append({
            'artist_id': row['artists'][0]['id'],  # first listed artist only
            'track_id': row['id'],
            'track_name': row['name'],
            'track_popularity': row['popularity'],
            'track_release_date': row['album']['release_date'],
        })

# Explicit columns keep the schema stable even when no track was returned,
# so later column lookups such as track_df['artist_id'] do not raise KeyError.
track_df = pd.DataFrame(track_list, columns=TRACK_COLUMNS)

In [0]:
# Get artist details for every distinct artist referenced by the tracks.
# Artists are fetched in batches through sp.artists instead of one request
# per artist, which cuts the number of API calls by up to a factor of 50.
ARTIST_BATCH_SIZE = 50  # assumes the multi-artist lookup accepts up to 50 ids per call - confirm against the API docs
ARTIST_COLUMNS = ['artist_id', 'artist_name', 'artist_popularity', 'artist_follower', 'artist_image']

artist_list = []
artist_id_list = track_df['artist_id'].to_list()
# Keep only real string ids (drops None/NaN) and de-duplicate
artist_id_set = {artist_id for artist_id in artist_id_list if isinstance(artist_id, str) and artist_id}
# Sorted so the request order and the resulting row order are reproducible
artist_id_list_unique = sorted(artist_id_set)

for batch_start in range(0, len(artist_id_list_unique), ARTIST_BATCH_SIZE):
    batch_ids = artist_id_list_unique[batch_start:batch_start + ARTIST_BATCH_SIZE]
    for artist_result in sp.artists(batch_ids)['artists']:
        if artist_result is None:
            # Skip ids the API could not resolve to an artist
            continue
        images = artist_result['images']
        artist_list.append({
            'artist_id': artist_result['id'],
            'artist_name': artist_result['name'],
            'artist_popularity': artist_result['popularity'],
            'artist_follower': artist_result['followers']['total'],
            # First image URL, or None when the artist has no images
            'artist_image': images[0]['url'] if images else None,
        })

# Explicit columns keep the schema stable even when no artist was returned
artist_df = pd.DataFrame(artist_list, columns=ARTIST_COLUMNS)



In [0]:
# Build Spark DataFrames from the pandas frames (tracks first, then artists)
track_spark_df, artist_spark_df = [
    spark.createDataFrame(pandas_df) for pandas_df in (track_df, artist_df)
]

In [0]:
# Date stamp (yyyymmdd) embedded in the output file names
today_date = f"{datetime.now():%Y%m%d}"

# Output folders for the two datasets
track_folder, artist_folder = (
    "/mnt/spotifydata/to_be_processed/track",
    "/mnt/spotifydata/to_be_processed/artist",
)

# Dated Parquet file names, one per dataset
track_file_name, artist_file_name = (
    f"track_{today_date}.parquet",
    f"artist_{today_date}.parquet",
)

In [0]:
# Step 1: Write each DataFrame to its output folder, coalesced to a single
# partition so Spark produces exactly one part file per folder.
# The "header" option was dropped: it is a CSV option and has no meaning for
# Parquet output.
# NOTE(review): mode("overwrite") replaces whatever is already in the folder,
# including files from earlier runs - confirm that is intended.
for spark_df, output_folder in (
    (track_spark_df, track_folder),
    (artist_spark_df, artist_folder),
):
    (
        spark_df.coalesce(1)
        .write
        .format("parquet")
        .mode("overwrite")
        .save(output_folder)
    )

In [0]:
def rename_parquet_file(folder_path, target_file_name):
    """Rename the Spark-generated Parquet part file in a folder and clean up the rest.

    Lists ``folder_path``, moves the first entry whose name starts with
    ``part`` and ends with ``.parquet`` to ``{folder_path}/{target_file_name}``,
    then removes every other listed entry (non-recursively).

    Args:
        folder_path: Folder the DataFrame was written to.
        target_file_name: File name to give the Parquet part file.

    Returns:
        True if a part file was found and renamed. False if the folder holds
        no matching part file, in which case nothing is moved or removed.
    """
    # Step 2: List all files in the output folder
    files = dbutils.fs.ls(folder_path)

    # Step 3: Identify the Parquet file generated by Spark (starts with 'part')
    part_paths = [
        file.path
        for file in files
        if file.name.startswith("part") and file.name.endswith(".parquet")
    ]
    if not part_paths:
        return False

    # Step 4: Rename the Parquet file to the desired name
    source_path = part_paths[0]
    dbutils.fs.mv(source_path, f"{folder_path}/{target_file_name}")

    # Step 5: Remove any other system-generated files (e.g., _SUCCESS)
    for file in files:
        # The listing was taken before the move, so skip the part file that no
        # longer exists at its old path, and never delete an entry that already
        # carries the target name.
        if file.path == source_path or file.name == target_file_name:
            continue
        dbutils.fs.rm(file.path, recurse=False)

    return True

In [0]:
# Give each dataset's Parquet output its dated file name (tracks, then artists)
for output_folder, output_file_name in (
    (track_folder, track_file_name),
    (artist_folder, artist_file_name),
):
    rename_parquet_file(output_folder, output_file_name)