In [1]:
from tqdm import tqdm
import boto3
import torch
import json
import tempfile
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
import sys
import glob
import gc
import time
import shutil
import pandas as pd
import os

  import pynvml  # type: ignore[import]


In [None]:
# # Download metadata.xlsx files for ground truth (these are small)

# BUCKET_NAME = 'xaviera-training-file'
# PREFIX = '000001/'
# SAMPLE_SONGS_ROOT = 'songs_metadata'

# s3 = boto3.client('s3')


# def download_metadata_files():
#     print("Downloading metadata.xlsx files for ground truth...")
#     paginator = s3.get_paginator('list_objects_v2')
#     pages = paginator.paginate(Bucket=BUCKET_NAME, Prefix=PREFIX)
#     for page in pages:
#         if 'Contents' not in page:
#             continue
#         for obj in page['Contents']:
#             key = obj['Key']
#             if key.endswith('metadata.xlsx'):
#                 parts = key.split('/')
#                 if len(parts) >= 3:
#                     genre = parts[1]
#                     target_path = os.path.join(SAMPLE_SONGS_ROOT, genre, 'metadata.xlsx')
#                     os.makedirs(os.path.dirname(target_path), exist_ok=True)
#                     if not os.path.exists(target_path):
#                         print(f"  Downloading {genre}/metadata.xlsx")
#                         s3.download_file(BUCKET_NAME, key, target_path)


# download_metadata_files()
# print("Metadata download complete.")

In [5]:
# -----------------------------------------------------------------------------
# Metadata extraction: metadata.xlsx -> per-song text files
# -----------------------------------------------------------------------------
# For every genre folder, this cell loads that folder's metadata.xlsx and writes
# <CatalogID>_prompt.txt, <CatalogID>_lyrics.txt and <CatalogID>_duration.txt
# into generated_audio_metadata/[Genre]/[CatalogID]/.
# Prerequisite: the metadata.xlsx files must already have been downloaded from S3.
# Both paths below are relative, so they resolve against the kernel's current
# working directory.

# Input root: one sub-folder per genre, each holding a metadata.xlsx
SAMPLE_SONGS_DIR = "songs_metadata"
# Output root: files land in <OUTPUT_DIR>/<Genre>/<CatalogID>/
OUTPUT_DIR = "generated_audio_metadata"


def parse_duration(duration_str, default=30.0):
    """
    Convert a duration value into a number of seconds.

    Accepted forms (after ``str()`` conversion and whitespace stripping):
      - "MM:SS"       e.g. "3:12" or "03:12" -> 192.0
      - "HH:MM:SS"    e.g. "1:02:03"         -> 3723.0
      - plain number  e.g. "45" or 45        -> 45.0

    Args:
        duration_str: Raw cell value (string, number, or missing/NaN).
        default: Value returned when the input is missing or unusable.
            Defaults to 30.0.

    Returns:
        float: Duration in seconds, or ``default`` when the value is missing,
        empty, malformed, negative, or not a finite number.
    """
    import math  # local import so the function does not depend on another cell

    if pd.isna(duration_str) or duration_str == '':
        return default

    try:
        fields = str(duration_str).strip().split(':')
        if len(fields) == 2:
            minutes, seconds = (float(field) for field in fields)
            total = minutes * 60 + seconds
        elif len(fields) == 3:
            hours, minutes, seconds = (float(field) for field in fields)
            total = hours * 3600 + minutes * 60 + seconds
        else:
            # No colon (or too many): treat the raw value as plain seconds.
            total = float(duration_str)
    except (TypeError, ValueError):
        # TypeError covers non-numeric objects handed straight to float().
        return default

    # float() happily parses "nan" / "inf", and a leading "-" yields a negative
    # duration; none of those is a usable length, so fall back as well.
    if not math.isfinite(total) or total < 0:
        return default
    return total


def construct_prompt(row):
    """
    Build the comma-separated prompt string for one metadata row.

    Fields are emitted in a fixed order: Genre, VocalType, Instruments,
    Moods, BPM (or Tempo when BPM is missing or zero), ai_MusicalKey.
    Any field that is missing, NaN, or blank is skipped.

    Args:
        row: Mapping-like object exposing ``.get(column)``, such as a pandas
            Series row or a plain dict.

    Returns:
        str: The populated fields joined with ", " (empty string if none).
    """

    def get_str(val):
        """Return ``val`` as a stripped string, or "" when it is missing."""
        if pd.isna(val):
            return ""
        return str(val).strip()

    def format_bpm(val):
        """Return a display string for BPM, or "" when it is missing or zero."""
        text = get_str(val)
        if not text:
            return ""
        try:
            number = float(text)
        except ValueError:
            # Non-numeric entries (e.g. "120-130") are kept verbatim.
            return text
        # A numeric column containing blanks is read as float, so zero shows
        # up as "0.0"; compare numerically instead of against the string '0'.
        if number == 0:
            return ""
        # Render 120.0 as "120" so the prompt reads "120 bpm".
        if number.is_integer():
            return str(int(number))
        return text

    parts = [
        get_str(row.get(column))
        for column in ('Genre', 'VocalType', 'Instruments', 'Moods')
    ]

    # Prefer the numeric BPM; fall back to the textual Tempo description.
    bpm = format_bpm(row.get('BPM'))
    if bpm:
        parts.append(f"{bpm} bpm")
    else:
        parts.append(get_str(row.get('Tempo')))

    parts.append(get_str(row.get('ai_MusicalKey')))

    # Drop the fields that resolved to an empty string.
    return ", ".join(part for part in parts if part)

def _catalog_id_to_str(value):
    """Render a CatalogID cell as a path-safe string ("" when it is missing)."""
    if pd.isna(value):
        return ""
    # A numeric column containing blanks is read as float; avoid "233872.0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def _write_text(path, text):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(text)


def extract_metadata(songs_dir=None, output_dir=None):
    """
    Generate per-song prompt, lyrics and duration text files from metadata.xlsx.

    For each genre folder under ``songs_dir`` that contains a metadata.xlsx,
    every row with a CatalogID produces three files in
    ``<output_dir>/<Genre>/<CatalogID>/``:
      - ``<CatalogID>_prompt.txt``   (from ``construct_prompt``)
      - ``<CatalogID>_lyrics.txt``   ("[Instrumental]" when no lyrics)
      - ``<CatalogID>_duration.txt`` (seconds, from ``parse_duration``)

    Rows without a CatalogID are skipped and counted in the log. A failure
    while processing one genre is reported and the remaining genres still run.

    Args:
        songs_dir: Root folder holding ``<Genre>/metadata.xlsx``. Defaults to
            the module-level ``SAMPLE_SONGS_DIR``.
        output_dir: Root folder for generated files. Defaults to the
            module-level ``OUTPUT_DIR``.

    Returns:
        None. Progress and totals are printed.
    """
    songs_dir = SAMPLE_SONGS_DIR if songs_dir is None else songs_dir
    output_dir = OUTPUT_DIR if output_dir is None else output_dir

    print(f"Extracting metadata from {songs_dir}...")
    print(f"Saving to {output_dir}/[Genre]/[CatalogID]/")

    if not os.path.exists(songs_dir):
        print(f"Error: {songs_dir} does not exist.")
        return

    os.makedirs(output_dir, exist_ok=True)

    total_processed = 0

    # Sorted so that the log order is the same on every run.
    for genre in sorted(os.listdir(songs_dir)):
        genre_dir = os.path.join(songs_dir, genre)
        if not os.path.isdir(genre_dir):
            continue

        metadata_path = os.path.join(genre_dir, "metadata.xlsx")
        if not os.path.exists(metadata_path):
            print(f"  Warning: metadata.xlsx not found in {genre}")
            continue

        try:
            metadata = pd.read_excel(metadata_path)
            # Convert IDs to strings up front: iterrows() may upcast a row's
            # values to a common dtype, which would alter integer IDs.
            metadata['CatalogID'] = metadata['CatalogID'].map(_catalog_id_to_str)

            genre_processed = 0
            genre_skipped = 0

            for _, row in metadata.iterrows():
                catalog_id = row['CatalogID']
                if not catalog_id:
                    # A missing ID would otherwise create a literal "nan" folder.
                    genre_skipped += 1
                    continue

                song_dir = os.path.join(output_dir, genre, catalog_id)
                os.makedirs(song_dir, exist_ok=True)

                # Prompt
                _write_text(
                    os.path.join(song_dir, f"{catalog_id}_prompt.txt"),
                    construct_prompt(row),
                )

                # Lyrics (placeholder for tracks without a transcription)
                lyrics_val = row.get('ai_LyricTranscription')
                if pd.isna(lyrics_val) or str(lyrics_val).strip() == '':
                    lyrics_text = "[Instrumental]"
                else:
                    lyrics_text = str(lyrics_val)
                _write_text(
                    os.path.join(song_dir, f"{catalog_id}_lyrics.txt"),
                    lyrics_text,
                )

                # Duration in seconds
                duration_seconds = parse_duration(row.get('Duration'))
                _write_text(
                    os.path.join(song_dir, f"{catalog_id}_duration.txt"),
                    str(duration_seconds),
                )

                genre_processed += 1

            print(f"  {genre}: extracted {genre_processed} songs")
            if genre_skipped:
                print(f"  {genre}: skipped {genre_skipped} rows without CatalogID")
            total_processed += genre_processed

        except Exception as e:
            # Deliberate best-effort: one unreadable workbook must not stop
            # the remaining genres.
            print(f"  Error processing {genre}: {e}")

    print(f"\nMetadata extraction complete! Total: {total_processed} songs")

# Run the extraction using the module-level SAMPLE_SONGS_DIR / OUTPUT_DIR settings.
# Both are relative paths, so the kernel's working directory must be the folder
# that contains them. Progress is printed per genre; nothing is returned.
extract_metadata()


Extracting metadata from songs_metadata...
Saving to generated_audio_metadata/[Genre]/[CatalogID]/


  warn("Workbook contains no default style, apply openpyxl's default")


  Folk: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Reggaeton: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Action: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Singer: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Asian: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Themes: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Cartoon: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Comedy: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Indian: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  RnB: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  East European: extracted 342 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Hip Hop: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  World Fusion: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Circus: extracted 309 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  European: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Science Fiction: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Tributes: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Hawaiian: extracted 277 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Chinese: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  British & Irish: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Americana: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Sports: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Reggae: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Horror: extracted 500 songs
  National Anthem: extracted 230 songs


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


  Dance: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Danish: extracted 286 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Lounge: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  New Age: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Alternative: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Swedish: extracted 409 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Pop: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  UK Bass: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Country: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Middle East: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Classical: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Russian: extracted 309 songs
  Acoustic: extracted 160 songs


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


  Jazz: extracted 500 songs
  Animal: extracted 84 songs


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


  Gothic: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Latin: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Spiritual: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Rock: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Brazilian: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Blues: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Corporate: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Military: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Thai: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Dangerous: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Christmas: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Drums: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Electronic: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  African: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Childrens: extracted 500 songs
  Vietnamese: extracted 69 songs


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


  Dramatic: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Tropical: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  News: extracted 500 songs


  warn("Workbook contains no default style, apply openpyxl's default")


  Japanese: extracted 500 songs

Metadata extraction complete! Total: 27475 songs


In [3]:
pwd

'/home/ec2-user/SageMaker/xaviera-lora-finetuning/xaviera-lora/pop_metadata/233872'

In [4]:
cd ../../

/home/ec2-user/SageMaker/xaviera-lora-finetuning/xaviera-lora
