In [1]:
import os
import shutil
import tarfile
import pandas as pd
from pathlib import Path

In [None]:
# Define paths
zipped_data_dir = Path('zipped_data')
data_dir = Path('data')
target_dir = data_dir / 'McGill-Billboard'

# Create target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

# Dictionary to map file types to subdirectories
file_type_dirs = {
    'chordino': target_dir / 'chordino',
    'salami_chords': target_dir / 'annotations',
    'lab': target_dir / 'lab',
    'mirex': target_dir / 'mirex'
}

# Create subdirectories
for directory in file_type_dirs.values():
    os.makedirs(directory, exist_ok=True)

# Function to extract files
def extract_tar_file(tar_path, dest_dir, file_type):
    print(f"Extracting {tar_path}...")
    
    # Open the tar file
    with tarfile.open(tar_path, 'r:*') as tar:
        # Iterate through members
        for member in tar.getmembers():
            # Skip if not a file or if it's a directory
            if not member.isfile():
                continue
                
            # Extract directory ID from path
            path_parts = member.name.split('/')
            if len(path_parts) < 2 or not path_parts[1].isdigit():
                continue
                
            dir_id = path_parts[1]
            filename = path_parts[-1]
            
            # Create directory for this ID if needed
            id_dir = file_type_dirs[file_type] / dir_id
            os.makedirs(id_dir, exist_ok=True)
            
            # Extract file to the correct location
            member.name = filename
            tar.extract(member, path=id_dir)
            
            # Fix the extracted file path
            extracted_path = id_dir / filename
            if not extracted_path.exists():
                src_path = Path(id_dir) / member.name
                if src_path.exists():
                    shutil.move(str(src_path), str(extracted_path))
                    
    print(f"Extraction of {tar_path} complete")

# List of tar files to extract
tar_files = [
    ('billboard-2.0-chordino.tar.xz', 'chordino'),
    ('billboard-2.0-salami_chords.tar.xz', 'salami_chords'),
    ('billboard-2.0.1-lab.tar.xz', 'lab'),
    ('billboard-2.0.1-mirex.tar.xz', 'mirex')
]

# Extract each tar file
for filename, file_type in tar_files:
    tar_path = zipped_data_dir / filename
    if tar_path.exists():
        extract_tar_file(tar_path, target_dir, file_type)
    else:
        print(f"Warning: {tar_path} not found")

# Copy the index file to the target directory
index_file = zipped_data_dir / 'billboard-2.0-index.csv'
if index_file.exists():
    shutil.copy(index_file, target_dir / 'index.csv')
    print(f"Copied index file to {target_dir / 'index.csv'}")
else:
    print(f"Warning: Index file {index_file} not found")

print("Dataset organization complete")

Extracting zipped_data\billboard-2.0-chordino.tar.xz...


  tar.extract(member, path=id_dir)


Extraction of zipped_data\billboard-2.0-chordino.tar.xz complete
Extracting zipped_data\billboard-2.0-salami_chords.tar.xz...
Extraction of zipped_data\billboard-2.0-salami_chords.tar.xz complete
Extracting zipped_data\billboard-2.0.1-lab.tar.xz...
Extraction of zipped_data\billboard-2.0.1-lab.tar.xz complete
Dataset organization complete
