In [4]:
import os

# Locate the directory this code runs from. A .py script defines __file__;
# a Jupyter kernel does not, so fall back to the current working directory.
PROJECT_ROOT = (
    os.path.dirname(os.path.abspath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)
print("✅ Project root:", PROJECT_ROOT)

# The data folder is a sibling of the project folder: step up one level,
# descend into "data", and normalise the result to an absolute path.
PARENT_ROOT = os.path.dirname(PROJECT_ROOT)
DATA_ROOT = os.path.abspath(os.path.join(PARENT_ROOT, "data"))
print("✅ Data root:", DATA_ROOT)

✅ Project root: /home/sagarj2/AML_project/CS-441-Applied-ML-Project/notebooks
✅ Data root: /home/sagarj2/AML_project/CS-441-Applied-ML-Project/data


In [5]:
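# One-time download of the lmd_matched archive (about 1.3 GB according to the saved wget log below).
# NOTE(review): left commented out, presumably so re-running the notebook does not
# re-download the archive; uncomment the lines below to fetch it again.
# data_url = "http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz"
# target_path = os.path.join(DATA_ROOT, "lmd_matched.tar.gz")

# # Download using wget via shell
# !wget -O "{target_path}" "{data_url}"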

--2025-12-07 02:40:15--  http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz
Resolving hog.ee.columbia.edu (hog.ee.columbia.edu)... 128.59.66.5
Connecting to hog.ee.columbia.edu (hog.ee.columbia.edu)|128.59.66.5|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1407072670 (1.3G) [application/x-gzip]
Saving to: ‘/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/lmd_matched.tar.gz’


2025-12-07 02:42:16 (11.1 MB/s) - ‘/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/lmd_matched.tar.gz’ saved [1407072670/1407072670]



In [6]:
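# # Extract into the data folder
# # NOTE: `target_path` is only defined in the (commented-out) download cell above,
# # so re-enable that cell together with this one.
# !tar -xvzf "{target_path}" -C "{DATA_ROOT}"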

In [7]:
import os
import json
import shutil
import random
import re

# ==========================
# CONFIG
# ==========================
# Every input/output location is derived from DATA_ROOT (set by the
# path-setup cell), so no machine-specific absolute paths are needed here.
MIDI_ROOT, LASTFM_ROOT, MD5_JSON, OUTPUT_ROOT = (
    os.path.join(DATA_ROOT, leaf)
    for leaf in ("lmd_matched", "lastfm", "md5_to_paths.json", "mini_dataset")
)

# ==========================
# RESET OUTPUT FOLDER
# ==========================
# WARNING: destructive. Any previous mini dataset is deleted wholesale so
# that each run starts from an empty folder.
if os.path.exists(OUTPUT_ROOT):
    print(f"Removing old data in: {OUTPUT_ROOT}")
    shutil.rmtree(OUTPUT_ROOT)

os.makedirs(OUTPUT_ROOT)
print(f"Folder ready: {OUTPUT_ROOT}")

# ==========================
# SAMPLING PARAMETERS
# ==========================
# One id_list_<category>.txt file is looked up under LASTFM_ROOT per entry.
CATEGORIES = [
    "pop",
    "classic",
    "guitar",
    "electronic",
    "piano",
    "hip-hop",
    "rock",
    "rnb",
    "dance",
    "folk",
]
# Upper bound on the number of MIDI files copied for each category.
NUM_PER_CATEGORY = 100

# ==========================
# LOAD MD5 JSON
# ==========================
# Maps the MD5 stem of a hashed MIDI filename to a list of path strings.
with open(MD5_JSON, "r") as md5_file:
    md5_to_paths = json.load(md5_file)

print(f"Loaded MD5 mapping for {len(md5_to_paths)} keys.")

# ==========================
# PROCESS EACH CATEGORY
# ==========================
def _read_track_ids(id_list_path):
    """Return the stripped, non-blank lines of an id-list text file.

    Blank lines (for example a trailing newline at the end of the file) are
    dropped so that later character indexing into each ID cannot hit an
    empty string.
    """
    with open(id_list_path, "r") as id_file:
        return [line.strip() for line in id_file if line.strip()]


def _safe_midi_name(original_path, fallback):
    """Flatten a path from the MD5 mapping into a filesystem-safe filename.

    Only the basename is kept; characters that are illegal in filenames on
    common platforms, as well as hyphens, are replaced by underscores.
    Returns ``fallback`` if nothing is left (e.g. the path ends in a slash),
    so the caller never ends up with the destination folder itself as target.
    """
    name = re.sub(r'[<>:"/\\|?*]', '_', os.path.basename(original_path))
    name = name.replace("-", "_")
    return name or fallback


for category in CATEGORIES:
    print(f"\nProcessing category: {category}")

    category_file = os.path.join(LASTFM_ROOT, f"id_list_{category}.txt")
    if not os.path.exists(category_file):
        print(f"⚠ File not found: {category_file}")
        continue

    # Tracks are taken in file order. For a random sample, seed `random` and
    # call random.shuffle(track_ids) here.
    track_ids = _read_track_ids(category_file)

    copied_count = 0
    for track_id in track_ids:
        if copied_count >= NUM_PER_CATEGORY:
            break

        # The folder layout below indexes characters 2..4 of the ID, so a
        # shorter (malformed) ID would raise IndexError; skip it instead.
        if len(track_id) < 5:
            print(f"⚠ Malformed track ID, skipping: {track_id!r}")
            continue

        # Build path: e.g., /B/O/E/TRBOEFO128F92FC62E
        track_folder = os.path.join(MIDI_ROOT, track_id[2], track_id[3], track_id[4], track_id)
        if not os.path.isdir(track_folder):
            print(f"⚠ Track folder missing: {track_folder}")
            continue

        # os.listdir returns entries in arbitrary order; sort so the "first"
        # MIDI file of a track is the same on every run and every machine.
        midi_files = sorted(
            entry for entry in os.listdir(track_folder) if entry.lower().endswith(".mid")
        )
        if not midi_files:
            print(f"⚠ No MIDI file in {track_folder}")
            continue

        hashed_midi = midi_files[0]          # e.g., "3ec35220a42eac4c124885690f349eb1.mid"
        md5_key = os.path.splitext(hashed_midi)[0]  # "3ec35220a42eac4c124885690f349eb1"

        print(f'hashed_midi:{md5_key}')

        # Lookup in md5_to_paths.json; a missing key and an empty path list
        # are both treated as "no usable name".
        candidate_paths = md5_to_paths.get(md5_key)
        if not candidate_paths:
            print(f"⚠ MD5 not found in JSON: {hashed_midi}")
            continue

        new_file_name = candidate_paths[0]  # take first path

        print(f'new_file_name:{new_file_name}')

        # ==========================
        # COPY AND RENAME MIDI FILE
        # ==========================

        # Full source path of the hashed MIDI
        src_path = os.path.join(track_folder, hashed_midi)
        print(f'src_path:{src_path}')

        # Destination folder for the category
        dest_folder = os.path.join(OUTPUT_ROOT, category)
        os.makedirs(dest_folder, exist_ok=True)

        # Rename the MIDI using the sanitized basename from the JSON mapping.
        safe_name = _safe_midi_name(new_file_name, fallback=hashed_midi)
        dest_path = os.path.join(dest_folder, safe_name)

        # Two different tracks can map to the same readable name. Copying
        # over the earlier file would silently lose it while still counting
        # towards NUM_PER_CATEGORY, so disambiguate with the MD5 prefix.
        if os.path.exists(dest_path):
            stem, ext = os.path.splitext(safe_name)
            dest_path = os.path.join(dest_folder, f"{stem}_{md5_key[:8]}{ext}")
            if os.path.exists(dest_path):
                # Same name and same MD5 prefix: this MIDI was already copied.
                print(f"⚠ Duplicate MIDI already copied, skipping: {dest_path}")
                continue
        print(f'dest_path:{dest_path}')

        # Copy and rename; an unreadable/unwritable file is skipped rather
        # than aborting the whole category.
        try:
            shutil.copy(src_path, dest_path)

            copied_count += 1
            print(f"✅ Copied and renamed {copied_count}/{NUM_PER_CATEGORY}: {dest_path}\n")
        except PermissionError:
            print(f"❌⚠ Permission denied, skipping: {dest_path}")

    print(f"Finished category {category}. Total copied: {copied_count}")

Folder ready: /home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/mini_dataset
Loaded MD5 mapping for 178561 keys.

Processing category: pop
hashed_midi:3ec35220a42eac4c124885690f349eb1
new_file_name:Adams Bryan/Bryan Adams - Dream on.mid
src_path:/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/lmd_matched/B/O/E/TRBOEFO128F92FC62E/3ec35220a42eac4c124885690f349eb1.mid
dest_path:/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/mini_dataset/pop/Bryan Adams _ Dream on.mid
✅ Copied and renamed 1/100: /home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/mini_dataset/pop/Bryan Adams _ Dream on.mid

hashed_midi:9e74480ab3b66ee53eff617152eb68c2
new_file_name:M/michael_buble-havent_met_you_yet.mid
src_path:/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/lmd_matched/M/R/H/TRMRHAD12903CC0130/9e74480ab3b66ee53eff617152eb68c2.mid
dest_path:/home/sagarj2/AML_project/CS-441-Applied-ML-Project/data/mini_dataset/pop/michael_buble_havent_met_you_yet.mid
✅ Copied and ren