In [2]:
# src/preprocess_midi.py

import os
from pathlib import Path
from typing import List, Tuple

import numpy as np
from music21 import interval, pitch

In [3]:
# -----------------------------
# Config
# -----------------------------
# All paths are relative to the notebook's working directory.
# RAW_MIDI_DIR is searched recursively for *.mid / *.midi files by the
# processing functions below.
RAW_MIDI_DIR = Path("../data/mini_dataset")
# Output directories are created eagerly when this cell runs (no error if
# they already exist).
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
TRANSPOSED_DIR = PROCESSED_DIR / "transposed_midis"
TRANSPOSED_DIR.mkdir(parents=True, exist_ok=True)


# Number of notes per snippet (tunable). Snippets are cut with a stride of
# SNIPPET_LENGTH // 2, i.e. 50% overlap.
SNIPPET_LENGTH = 32

# Base rhythmic unit: 1 quarter note = 4 steps, so 1 step = one sixteenth note.
# Durations are rounded to this grid when converted to integer steps.
STEPS_PER_QUARTER = 4


In [4]:
# -----------------------------
# Helper functions
# -----------------------------

from music21 import converter, instrument, note, chord, stream, key, interval, pitch

def load_midi(filepath: Path) -> stream.Score:
    """Parse the MIDI file at ``filepath`` with music21 and return the result."""
    # converter.parse is handed a plain string path rather than a Path object.
    midi_location = str(filepath)
    parsed = converter.parse(midi_location)
    return parsed


def pick_melody_part(score: stream.Score) -> stream.Part | None:
    """
    Heuristically choose the part of ``score`` most likely to carry the melody.

    Selection order:

    1. Parts whose instruments are *all* percussion (``UnpitchedPercussion``
       or a best name containing "percussion") are skipped, as are parts
       without any notes or chords.
    2. If any remaining part's name / instrument names contain a melody
       keyword ("melody", "lead", "right hand", "treble", "solo", "violin",
       "flute", "trumpet", or the standalone token "rh"), the keyword match
       with the highest average pitch is returned.
    3. Otherwise the upper median of the candidates' average pitches is
       computed, parts at or above it are kept, and among those the part with
       the most notes wins; ties are broken by higher average pitch.

    Chords contribute their highest pitch to the statistics.

    Args:
        score: Parsed music21 score; only ``score.parts`` is inspected.

    Returns:
        The chosen part, or ``None`` (after printing a warning) when no part
        qualifies.
    """
    import re  # function-scope import keeps this block self-contained

    def _is_percussion(inst) -> bool:
        """True when the instrument is unpitched or named like percussion."""
        return (
            isinstance(inst, instrument.UnpitchedPercussion)
            or "percussion" in (inst.bestName() or "").lower()
        )

    candidates = []

    for p in score.parts:
        insts = list(p.getInstruments())

        # Skip only if the part is *purely* percussion: at least one
        # instrument, and every instrument is percussion-like. Mixed parts and
        # parts without instrument metadata stay in the running.
        percussion_flags = [_is_percussion(i) for i in insts]
        if percussion_flags and all(percussion_flags):
            continue

        # Collect notes/chords
        notes_chords = [n for n in p.recurse().notes if isinstance(n, (note.Note, chord.Chord))]
        if not notes_chords:
            continue

        # Basic stats: one pitch per event, chords reduced to their top note.
        pitches = []
        for n in notes_chords:
            if isinstance(n, note.Note):
                pitches.append(n.pitch.midi)
            elif isinstance(n, chord.Chord) and n.notes:
                # Guard against an empty chord, for which max() would raise.
                pitches.append(max(nn.pitch.midi for nn in n.notes))

        if not pitches:
            continue

        n_notes = len(pitches)
        avg_pitch = sum(pitches) / len(pitches)

        # part/instrument names (lowercased)
        part_name = (p.partName or "").lower()
        inst_names = [str(inst.instrumentName or "").lower()
                      for inst in insts]

        candidates.append({
            "part": p,
            "n_notes": n_notes,
            "avg_pitch": avg_pitch,
            "part_name": part_name,
            "inst_names": inst_names,
        })

    if not candidates:
        print("  [warn] no suitable melodic parts; skipping this file.")
        return None

    # 1) Name-based shortcut: if any part name/instrument suggests "melody".
    # Longer keywords are matched as substrings so plurals ("violins") match.
    substring_keywords = (
        "melody", "lead", "right hand", "treble", "solo", "violin", "flute", "trumpet"
    )
    # "rh" (right hand) must stand alone: as a bare substring it also matched
    # "rhythm guitar", "rhodes", "overhead", ... and wrongly picked those parts.
    rh_token = re.compile(r"(?<![a-z])rh(?![a-z])")

    def looks_like_melody(c):
        text = (c["part_name"] + " " + " ".join(c["inst_names"])).lower()
        if any(kw in text for kw in substring_keywords):
            return True
        return rh_token.search(text) is not None

    name_candidates = [c for c in candidates if looks_like_melody(c)]
    if name_candidates:
        # among these, pick the one with highest avg_pitch (just in case)
        best = max(name_candidates, key=lambda c: c["avg_pitch"])
        print(f"  [info] pick_melody_part: selected by name heuristic: "
              f"part_name='{best['part_name']}', avg_pitch={best['avg_pitch']:.1f}, n_notes={best['n_notes']}")
        return best["part"]

    # 2) Pitch-based filtering: keep only parts at or above the median
    # avg_pitch (the upper median when the candidate count is even).
    avg_pitches = [c["avg_pitch"] for c in candidates]
    median_pitch = sorted(avg_pitches)[len(avg_pitches) // 2]

    high_voice_candidates = [c for c in candidates if c["avg_pitch"] >= median_pitch]
    if not high_voice_candidates:
        high_voice_candidates = candidates  # fallback to all

    # 3) Among high-voice candidates, pick the one with the most notes & higher pitch
    best = max(
        high_voice_candidates,
        key=lambda c: (c["n_notes"], c["avg_pitch"])  # primary: many notes, secondary: higher pitch
    )

    print(
        f"  [info] pick_melody_part: selected by stats: "
        f"part_name='{best['part_name']}', avg_pitch={best['avg_pitch']:.1f}, "
        f"n_notes={best['n_notes']}"
    )

    return best["part"]




def detect_key_and_transpose(melody: stream.Part) -> stream.Part:
    """
    Detect the key with music21 and transpose so the tonic becomes C (major
    keys) or A (any other mode, treated as minor).

    The transposition is folded to the nearest equivalent shift, i.e. a value
    in [-6, +5] semitones. Building the interval directly from the octave-less
    tonic to the octave-less target could otherwise move the melody by up to
    11 semitones (e.g. B major -> C major would go *down* a major seventh
    instead of up a semitone).

    Args:
        melody: The part to analyse and transpose.

    Returns:
        A transposed copy of ``melody``; the original ``melody`` object if key
        analysis raises.
    """
    try:
        key_guess = melody.analyze('key')
    except Exception as e:
        # Best effort: an unanalysable melody is passed through unchanged.
        print("  [warn] key analysis failed, leaving melody untransposed:", e)
        return melody

    # Decide target tonic: C for major, A for everything else.
    target_pitch = pitch.Pitch('C') if key_guess.mode == 'major' else pitch.Pitch('A')

    # Semitone distance from current tonic to target tonic ...
    raw_shift = int(round(interval.Interval(key_guess.tonic, target_pitch).semitones))
    # ... folded by octaves into [-6, +5] so the register changes as little
    # as possible while the tonic still lands on the target pitch class.
    shift = (raw_shift + 6) % 12 - 6

    # Stream.transpose accepts an integer number of semitones and returns a
    # new stream, so the caller always receives a copy on this path.
    return melody.transpose(shift)


def extract_pitch_duration_sequence(melody: stream.Part) -> List[Tuple[int, float]]:
    """
    Walk ``melody`` recursively and return one ``(midi_pitch, quarter_length)``
    pair per sounding event, in iteration order.

    Single notes contribute their own pitch, chords contribute their highest
    pitch, and anything else (rests in particular) is dropped.
    """
    def _melodic_pitch(event):
        # MIDI number representing the event, or None when it is not a note/chord.
        if isinstance(event, note.Note):
            return event.pitch.midi
        if isinstance(event, chord.Chord):
            # the top chord tone stands in for the melody
            return max(member.pitch.midi for member in event.notes)
        return None

    pairs = []
    for event in melody.recurse().notesAndRests:
        top_pitch = _melodic_pitch(event)
        if top_pitch is None:
            # rests and other non-pitched elements are skipped for now
            continue
        pairs.append((top_pitch, float(event.quarterLength)))
    return pairs


def convert_to_intervals_and_durations(
    pitch_dur_seq: List[Tuple[int, float]]
) -> Tuple[List[int], List[int]]:
    """
    Convert absolute pitches to pitch intervals and durations to integer steps.
    intervals[i] = pitch[i] - pitch[i-1], with first interval = 0
    durations[i] = round( quarter_length * STEPS_PER_QUARTER )
    """
    if not pitch_dur_seq:
        return [], []

    pitches = [p for (p, _) in pitch_dur_seq]
    durs_q = [d for (_, d) in pitch_dur_seq]

    intervals = [0]  # first note has no previous reference
    for i in range(1, len(pitches)):
        intervals.append(int(pitches[i] - pitches[i - 1]))

    durations = [max(1, int(round(d * STEPS_PER_QUARTER))) for d in durs_q]

    return intervals, durations


def make_snippets(
    intervals: List[int],
    durations: List[int],
    snippet_length: int = SNIPPET_LENGTH
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Slice two parallel sequences into fixed-length snippets.

    A sliding window of ``snippet_length`` items is moved with a stride of
    ``snippet_length // 2`` (50% overlap), but never less than 1 so that
    ``snippet_length == 1`` does not produce a zero stride.

    Args:
        intervals: Pitch intervals, one per note.
        durations: Step durations, same length as ``intervals``.
        snippet_length: Window size in notes; must be >= 1.

    Returns:
        ``(interval_snips, duration_snips)``, two int32 arrays of shape
        ``(num_snippets, snippet_length)``. Sequences shorter than
        ``snippet_length`` yield arrays with zero rows.

    Raises:
        ValueError: If ``snippet_length`` is smaller than 1.
    """
    assert len(intervals) == len(durations), "intervals and durations must have equal length"
    if snippet_length < 1:
        raise ValueError(f"snippet_length must be >= 1, got {snippet_length}")

    n = len(intervals)
    if n < snippet_length:
        empty_shape = (0, snippet_length)
        return np.empty(empty_shape, dtype=np.int32), np.empty(empty_shape, dtype=np.int32)

    # snippet_length // 2 is 0 for snippet_length == 1, which range() rejects.
    stride = max(1, snippet_length // 2)
    starts = range(0, n - snippet_length + 1, stride)

    interval_snips = [intervals[s:s + snippet_length] for s in starts]
    duration_snips = [durations[s:s + snippet_length] for s in starts]

    return np.array(interval_snips, dtype=np.int32), np.array(duration_snips, dtype=np.int32)

def make_snippets_with_timestamps(
    intervals: List[int],
    durations: List[int],
    durations_q: List[float],
    snippet_length: int = SNIPPET_LENGTH
) -> Tuple[np.ndarray, np.ndarray, List[Tuple[int, int]]]:
    """
    Slice sequences into fixed-length snippets and report where each one lies.

    Windows use the same scheme as ``make_snippets``: length
    ``snippet_length``, stride ``snippet_length // 2`` (at least 1).

    Timestamps are cumulative sums of ``durations_q``: ``start_q`` is the sum
    of the durations before the snippet's first note and ``end_q`` the sum up
    to and including its last note, each rounded to the nearest whole quarter
    length. They therefore only account for time that is present in
    ``durations_q``.

    Args:
        intervals: Pitch intervals, one per note.
        durations: Step durations, same length as ``intervals``.
        durations_q: Durations in quarter lengths, same length as ``intervals``.
        snippet_length: Window size in notes; must be >= 1.

    Returns:
        ``(interval_snips, duration_snips, timestamps)``: two int32 arrays of
        shape ``(num_snippets, snippet_length)`` and a list of
        ``(start_q, end_q)`` integer pairs, one per snippet. Sequences shorter
        than ``snippet_length`` yield zero-row arrays and an empty list.

    Raises:
        ValueError: If ``snippet_length`` is smaller than 1.
    """
    assert len(intervals) == len(durations), "intervals and durations must have equal length"
    # Without this check a short durations_q surfaced as an IndexError deep in
    # the loop below.
    assert len(durations_q) == len(intervals), "durations_q must have one entry per note"
    if snippet_length < 1:
        raise ValueError(f"snippet_length must be >= 1, got {snippet_length}")

    n = len(intervals)
    if n < snippet_length:
        empty_shape = (0, snippet_length)
        return np.empty(empty_shape, dtype=np.int32), np.empty(empty_shape, dtype=np.int32), []

    # snippet_length // 2 is 0 for snippet_length == 1, which range() rejects.
    stride = max(1, snippet_length // 2)
    interval_snips, duration_snips, timestamps = [], [], []

    # cumulative_q[i] = elapsed quarter lengths before note i (length n + 1).
    # Built with concatenate so an ndarray input is prepended to rather than
    # broadcast-added, which `[0] + durations_q` would do.
    cumulative_q = np.concatenate(([0.0], np.cumsum(np.asarray(durations_q, dtype=float))))

    for start in range(0, n - snippet_length + 1, stride):
        end = start + snippet_length
        interval_snips.append(intervals[start:end])
        duration_snips.append(durations[start:end])

        start_q = cumulative_q[start]
        end_q = cumulative_q[end]
        timestamps.append((int(round(start_q)), int(round(end_q))))

    return np.array(interval_snips, dtype=np.int32), np.array(duration_snips, dtype=np.int32), timestamps


In [5]:
from music21 import instrument

def sanitize_melody_instrument(melody_part):
    """
    Strip all Instrument objects from ``melody_part`` and install a single
    Piano on a non-drum channel.

    The part is modified in place and also returned for convenience.

    Args:
        melody_part: music21 stream (typically a Part) to clean up.

    Returns:
        The same ``melody_part`` object, renamed to "Melody" and carrying one
        Piano instrument at offset 0.
    """
    # Instruments are located recursively, so they may live inside nested
    # streams (e.g. measures). The removal must recurse as well; a plain
    # remove() only looks at direct children and would leave nested
    # instruments in place.
    stale_instruments = list(melody_part.recurse().getElementsByClass(instrument.Instrument))
    for inst in stale_instruments:
        try:
            melody_part.remove(inst, recurse=True)
        except Exception as exc:
            # Best effort: keep going, but make the failure visible.
            print(f"  [warn] could not remove instrument {inst!r}: {exc}")

    # Set a friendly part name
    melody_part.partName = "Melody"

    # Insert one clean Piano instrument at the beginning
    piano = instrument.Piano()
    piano.midiProgram = 0  # Acoustic Grand
    piano.midiChannel = 0  # Channel 1 (NOT 10/drums)
    melody_part.insert(0, piano)

    return melody_part


In [6]:
import warnings
from music21.midi.translate import TranslateWarning
# Suppress music21's TranslateWarning category for every subsequent cell
# (the filter is process-wide and stays active until the kernel restarts).
warnings.filterwarnings("ignore", category=TranslateWarning)

In [7]:
# -----------------------------
# Main preprocessing
# -----------------------------
def process_all_midis(rebuild_all: bool = False):
    """
    Preprocess MIDI files into fixed-length snippets and save them to
    ``PROCESSED_DIR / "snippets.npz"``.

    For every selected file the melody part is picked, transposed, sanitized,
    written to ``TRANSPOSED_DIR`` and cut into snippets. The saved archive
    holds four arrays: ``intervals``, ``durations``, ``song_ids`` and
    ``midi_filenames``. ``song_ids[i]`` is the index into ``midi_filenames``
    of the file that snippet ``i`` came from.

    Args:
        rebuild_all (bool):
            - If False (default):
                * Load existing snippets.npz (if present)
                * Only process NEW MIDI files not already in midi_filenames
                * Append their snippets to the existing dataset
            - If True:
                * Ignore existing snippets.npz
                * Rebuild dataset from ALL MIDI files in RAW_MIDI_DIR

    Returns:
        None. Progress and results are reported via ``print``.

    Notes:
        * Files are identified by base name only, so two files with the same
          name in different sub-folders are treated as the same file.
        * Files that yield no snippets are not recorded in ``midi_filenames``
          and are therefore retried on every incremental run.
        * Only the four arrays listed above are written; any other arrays in
          an existing archive are not carried over.
        * Archives written before song ids were made dense may contain ids
          that do not index ``midi_filenames``; rebuild those with
          ``rebuild_all=True``.
    """
    out_path = PROCESSED_DIR / "snippets.npz"

    # --------------------------------------------------
    # Collect all MIDI filenames in the raw directory
    # --------------------------------------------------
    midi_files = sorted(list(RAW_MIDI_DIR.rglob("*.mid")) +
                        list(RAW_MIDI_DIR.rglob("*.midi")))

    if not midi_files:
        print(f"No MIDI files found in {RAW_MIDI_DIR}. Nothing to do.")
        return

    # We'll fill these as we go
    all_interval_snips = []
    all_duration_snips = []
    all_song_ids = []

    # These are only used if we are appending (rebuild_all=False)
    existing_intervals = None
    existing_durations = None
    existing_song_ids = None
    existing_midi_filenames = None
    existing_filenames_set = set()

    # --------------------------------------------------
    # Load existing NPZ (if present and not rebuilding)
    # --------------------------------------------------
    if not rebuild_all and out_path.exists():
        print(f"Loading existing dataset: {out_path}")
        # NOTE(security): allow_pickle=True lets np.load unpickle object
        # arrays, which can execute arbitrary code; only use it on archives
        # this pipeline produced.
        # The context manager closes the archive once the arrays are read, so
        # no handle is left open when the same path is rewritten below.
        with np.load(out_path, allow_pickle=True) as data:
            existing_intervals = data["intervals"]
            existing_durations = data["durations"]
            existing_song_ids = data["song_ids"]
            existing_midi_filenames = data["midi_filenames"]  # 1D array of filenames

        existing_filenames_set = set(existing_midi_filenames.tolist())

        print(f"  Existing snippets: {existing_intervals.shape[0]}")
        print(f"  Existing MIDI files: {len(existing_filenames_set)}")
    elif rebuild_all:
        print("Rebuilding dataset from scratch; ignoring existing snippets.npz (if any).")

    # --------------------------------------------------
    # Decide which files to process
    # --------------------------------------------------
    if rebuild_all or existing_midi_filenames is None:
        # process ALL files
        files_to_process = midi_files
        base_song_idx = 0
        existing_midi_filenames_list = []
    else:
        # Only process files not already in midi_filenames
        files_to_process = [p for p in midi_files if p.name not in existing_filenames_set]
        base_song_idx = len(existing_midi_filenames)
        existing_midi_filenames_list = existing_midi_filenames.tolist()

    print(f"Found {len(midi_files)} total MIDI files.")
    print(f"{len(files_to_process)} file(s) to process this run.")

    if not files_to_process:
        print("No new MIDI files found. Dataset unchanged.")
        return

    # Filenames of the files that actually contributed snippets, in order
    new_filenames_list = []

    # --------------------------------------------------
    # Process selected MIDI files
    # --------------------------------------------------
    for local_idx, midi_path in enumerate(files_to_process):
        # The id must equal this file's future index in midi_filenames, so it
        # is derived from the number of files accepted so far, not from the
        # loop index: skipped files never enter midi_filenames and must not
        # consume an id. (It is the id the file receives *if* it is accepted.)
        song_id = base_song_idx + len(new_filenames_list)

        print(f"Processing {midi_path.name} "
              f"({local_idx + 1}/{len(files_to_process)}), assigned song_id={song_id}")

        try:
            score = load_midi(midi_path)
        except Exception as e:
            print(f"  Failed to load {midi_path.name}: {e}")
            continue

        melody = pick_melody_part(score)
        if melody is None:
            print(f"  Skipping {midi_path.name}: no usable melodic part found.")
            continue
        melody = detect_key_and_transpose(melody)
        melody = sanitize_melody_instrument(melody)

        # Save transposed melody (best effort: a failed write does not stop
        # snippet extraction for this file)
        out_midi_path = TRANSPOSED_DIR / f"transposed_{midi_path.stem}.mid"
        try:
            melody.write("midi", fp=str(out_midi_path))
            print(f"  Saved transposed: {out_midi_path}")
        except Exception as e:
            print(f"  [warn] could not save transposed MIDI for {midi_path.name}: {e}")

        pitch_dur_seq = extract_pitch_duration_sequence(melody)

        if len(pitch_dur_seq) < SNIPPET_LENGTH:
            print(f"  Skipping {midi_path.name}: too few notes ({len(pitch_dur_seq)})")
            continue

        intervals, durations = convert_to_intervals_and_durations(pitch_dur_seq)

        i_snips, d_snips = make_snippets(intervals, durations, SNIPPET_LENGTH)

        if i_snips.shape[0] == 0:
            print(f"  No snippets extracted from {midi_path.name}")
            continue

        all_interval_snips.append(i_snips)
        all_duration_snips.append(d_snips)
        all_song_ids.append(np.full(i_snips.shape[0], song_id, dtype=np.int32))

        new_filenames_list.append(midi_path.name)

    # --------------------------------------------------
    # If nothing new was processed successfully
    # --------------------------------------------------
    if not all_interval_snips:
        print("No snippets extracted from selected MIDI files. Dataset unchanged.")
        return

    # Stack new snippets
    new_intervals = np.vstack(all_interval_snips)
    new_durations = np.vstack(all_duration_snips)
    new_song_ids = np.concatenate(all_song_ids)

    # --------------------------------------------------
    # Merge old + new or just use new (if rebuild_all or no existing)
    # --------------------------------------------------
    if not rebuild_all and existing_intervals is not None:
        intervals_arr = np.vstack([existing_intervals, new_intervals])
        durations_arr = np.vstack([existing_durations, new_durations])
        song_ids_arr = np.concatenate([existing_song_ids, new_song_ids])

        # Append new filenames after existing, in consistent order
        midi_filenames_arr = np.array(existing_midi_filenames_list + new_filenames_list)
    else:
        intervals_arr = new_intervals
        durations_arr = new_durations
        song_ids_arr = new_song_ids

        # Only files that produced snippets are listed; song ids index this array.
        midi_filenames_arr = np.array(new_filenames_list)

    # --------------------------------------------------
    # Save updated dataset
    # --------------------------------------------------
    np.savez_compressed(
        out_path,
        intervals=intervals_arr,
        durations=durations_arr,
        song_ids=song_ids_arr,
        midi_filenames=midi_filenames_arr,
    )

    print(f"Saved updated snippets to {out_path}")
    print(f"Total snippets: {intervals_arr.shape[0]}")
    print(f"Total MIDI files represented: {len(midi_filenames_arr)}")


In [158]:
# Rebuild snippets.npz from every MIDI file under RAW_MIDI_DIR; any existing
# dataset file is ignored and overwritten.
process_all_midis(rebuild_all=True)

In [155]:
# -----------------------------
# Main preprocessing
# -----------------------------
def process_all_midis_new(rebuild_all: bool = False):
    """
    Preprocess MIDI files into fixed-length snippets, tagging every snippet
    with a genre, and save them to ``PROCESSED_DIR / "snippets.npz"``.

    For every selected file the melody part is picked, transposed, sanitized,
    written to ``TRANSPOSED_DIR`` and cut into snippets. The genre is the
    lowercased name of the folder that directly contains the MIDI file. The
    saved archive holds five arrays: ``intervals``, ``durations``,
    ``song_ids``, ``midi_filenames`` and ``genres`` (one entry per snippet).
    ``song_ids[i]`` is the index into ``midi_filenames`` of the file that
    snippet ``i`` came from.

    Args:
        rebuild_all (bool):
            - If False (default):
                * Load existing snippets.npz (if present)
                * Only process NEW MIDI files not already in midi_filenames
                * Append their snippets to the existing dataset
            - If True:
                * Ignore existing snippets.npz
                * Rebuild dataset from ALL MIDI files in RAW_MIDI_DIR

    Returns:
        None. Progress and results are reported via ``print``.

    Notes:
        * Files are identified by base name only, so two files with the same
          name in different sub-folders are treated as the same file.
        * Files that yield no snippets are not recorded in ``midi_filenames``
          and are therefore retried on every incremental run.
        * Only the five arrays listed above are written; any other arrays in
          an existing archive are not carried over.
        * Archives written before song ids were made dense may contain ids
          that do not index ``midi_filenames``; rebuild those with
          ``rebuild_all=True``.
    """
    out_path = PROCESSED_DIR / "snippets.npz"

    # --------------------------------------------------
    # Collect all MIDI filenames in the raw directory
    # --------------------------------------------------
    midi_files = sorted(list(RAW_MIDI_DIR.rglob("*.mid")) +
                        list(RAW_MIDI_DIR.rglob("*.midi")))

    if not midi_files:
        print(f"No MIDI files found in {RAW_MIDI_DIR}. Nothing to do.")
        return

    # We'll fill these as we go
    all_interval_snips = []
    all_duration_snips = []
    all_song_ids = []
    all_genre_snips = []  # per-snippet genres

    # These are only used if we are appending (rebuild_all=False)
    existing_intervals = None
    existing_durations = None
    existing_song_ids = None
    existing_midi_filenames = None
    existing_genres = None  # existing genres (if any)
    existing_filenames_set = set()

    # --------------------------------------------------
    # Load existing NPZ (if present and not rebuilding)
    # --------------------------------------------------
    if not rebuild_all and out_path.exists():
        print(f"Loading existing dataset: {out_path}")
        # NOTE(security): allow_pickle=True lets np.load unpickle object
        # arrays (the genres array is stored with dtype=object), which can
        # execute arbitrary code; only use it on archives this pipeline
        # produced.
        # The context manager closes the archive once the arrays are read, so
        # no handle is left open when the same path is rewritten below.
        with np.load(out_path, allow_pickle=True) as data:
            existing_intervals = data["intervals"]
            existing_durations = data["durations"]
            existing_song_ids = data["song_ids"]
            existing_midi_filenames = data["midi_filenames"]  # 1D array of filenames

            # If an older npz didn't have genres, we'll synthesize "unknown"
            if "genres" in data.files:
                existing_genres = data["genres"]
            else:
                existing_genres = np.array(
                    ["unknown"] * existing_intervals.shape[0],
                    dtype=object
                )

        existing_filenames_set = set(existing_midi_filenames.tolist())

        print(f"  Existing snippets: {existing_intervals.shape[0]}")
        print(f"  Existing MIDI files: {len(existing_filenames_set)}")
    elif rebuild_all:
        print("Rebuilding dataset from scratch; ignoring existing snippets.npz (if any).")

    # --------------------------------------------------
    # Decide which files to process
    # --------------------------------------------------
    if rebuild_all or existing_midi_filenames is None:
        # process ALL files
        files_to_process = midi_files
        base_song_idx = 0
        existing_midi_filenames_list = []
    else:
        # Only process files not already in midi_filenames
        files_to_process = [p for p in midi_files if p.name not in existing_filenames_set]
        base_song_idx = len(existing_midi_filenames)
        existing_midi_filenames_list = existing_midi_filenames.tolist()

    print(f"Found {len(midi_files)} total MIDI files.")
    print(f"{len(files_to_process)} file(s) to process this run.")

    if not files_to_process:
        print("No new MIDI files found. Dataset unchanged.")
        return

    # Filenames of the files that actually contributed snippets, in order
    new_filenames_list = []

    # --------------------------------------------------
    # Process selected MIDI files
    # --------------------------------------------------
    for local_idx, midi_path in enumerate(files_to_process):
        # The id must equal this file's future index in midi_filenames, so it
        # is derived from the number of files accepted so far, not from the
        # loop index: skipped files never enter midi_filenames and must not
        # consume an id. (It is the id the file receives *if* it is accepted.)
        song_id = base_song_idx + len(new_filenames_list)

        print(f"Processing {midi_path.name} "
              f"({local_idx + 1}/{len(files_to_process)}), assigned song_id={song_id}")

        try:
            score = load_midi(midi_path)
        except Exception as e:
            print(f"  Failed to load {midi_path.name}: {e}")
            continue

        melody = pick_melody_part(score)
        if melody is None:
            print(f"  Skipping {midi_path.name}: no usable melodic part found.")
            continue
        melody = detect_key_and_transpose(melody)
        melody = sanitize_melody_instrument(melody)

        # Save transposed melody (best effort: a failed write does not stop
        # snippet extraction for this file)
        out_midi_path = TRANSPOSED_DIR / f"transposed_{midi_path.stem}.mid"
        try:
            melody.write("midi", fp=str(out_midi_path))
            print(f"  Saved transposed: {out_midi_path}")
        except Exception as e:
            print(f"  [warn] could not save transposed MIDI for {midi_path.name}: {e}")

        pitch_dur_seq = extract_pitch_duration_sequence(melody)

        if len(pitch_dur_seq) < SNIPPET_LENGTH:
            print(f"  Skipping {midi_path.name}: too few notes ({len(pitch_dur_seq)})")
            continue

        intervals, durations = convert_to_intervals_and_durations(pitch_dur_seq)

        i_snips, d_snips = make_snippets(intervals, durations, SNIPPET_LENGTH)

        if i_snips.shape[0] == 0:
            print(f"  No snippets extracted from {midi_path.name}")
            continue

        # --- Genre per snippet (from folder name) ---
        genre = midi_path.parent.name.lower()
        num_snips = i_snips.shape[0]
        all_genre_snips.append(np.array([genre] * num_snips, dtype=object))

        all_interval_snips.append(i_snips)
        all_duration_snips.append(d_snips)
        all_song_ids.append(np.full(num_snips, song_id, dtype=np.int32))

        new_filenames_list.append(midi_path.name)

    # --------------------------------------------------
    # If nothing new was processed successfully
    # --------------------------------------------------
    if not all_interval_snips:
        print("No snippets extracted from selected MIDI files. Dataset unchanged.")
        return

    # Stack new snippets
    new_intervals = np.vstack(all_interval_snips)
    new_durations = np.vstack(all_duration_snips)
    new_song_ids = np.concatenate(all_song_ids)
    new_genres = np.concatenate(all_genre_snips)  # all new snippet genres

    # --------------------------------------------------
    # Merge old + new or just use new (if rebuild_all or no existing)
    # --------------------------------------------------
    if not rebuild_all and existing_intervals is not None:
        intervals_arr = np.vstack([existing_intervals, new_intervals])
        durations_arr = np.vstack([existing_durations, new_durations])
        song_ids_arr = np.concatenate([existing_song_ids, new_song_ids])

        # Merge genres (old + new)
        genres_arr = np.concatenate([existing_genres, new_genres])

        # Append new filenames after existing, in consistent order
        midi_filenames_arr = np.array(existing_midi_filenames_list + new_filenames_list)
    else:
        intervals_arr = new_intervals
        durations_arr = new_durations
        song_ids_arr = new_song_ids
        genres_arr = new_genres  # only new
        # Only files that produced snippets are listed; song ids index this array.
        midi_filenames_arr = np.array(new_filenames_list)

    # --------------------------------------------------
    # Save updated dataset (with genres)
    # --------------------------------------------------
    np.savez_compressed(
        out_path,
        intervals=intervals_arr,
        durations=durations_arr,
        song_ids=song_ids_arr,
        midi_filenames=midi_filenames_arr,
        genres=genres_arr,
    )

    print(f"Saved updated snippets to {out_path}")
    print(f"Total snippets: {intervals_arr.shape[0]}")
    print(f"Total MIDI files represented: {len(midi_filenames_arr)}")


In [157]:
# Rebuild the genre-tagged dataset from scratch. This writes to the same
# snippets.npz path as process_all_midis, so whichever of the two cells ran
# last determines the file's contents.
process_all_midis_new(rebuild_all = True)

In [148]:
import numpy as np

# Open two archives for side-by-side inspection. snippets_old.npz is not
# written by any cell shown in this notebook (presumably a backup of an
# earlier run; verify its origin).
# NOTE: allow_pickle=True permits unpickling object arrays, which can execute
# arbitrary code; only load archives you trust.
data_old= np.load("../data/processed/snippets_old.npz", allow_pickle=True)
data= np.load("../data/processed/snippets.npz", allow_pickle=True)

In [149]:
# List the array names stored in the current snippets.npz.
print(data.files)

['intervals', 'durations', 'song_ids', 'midi_filenames', 'genres', 'snippet_labels', 'snippet_start_indices', 'snippet_end_indices', 'snippet_start_qs', 'snippet_end_qs', 'snippet_start_secs', 'snippet_end_secs']


In [150]:
# List the array names stored in snippets_old.npz for comparison.
print(data_old.files)

['intervals', 'durations', 'song_ids', 'midi_filenames']


In [151]:
# Display the 'snippet_labels' array. NOTE(review): neither processing
# function above writes this key, so the loaded file presumably came from a
# different code version; confirm before relying on it.
data['snippet_labels']

array(['classic_A. Baker Sweet Love_idx000000_to000032_t0000.00s_to0036.68s',
       'classic_A. Baker Sweet Love_idx000016_to000048_t0016.78s_to0058.57s',
       'classic_A. Baker Sweet Love_idx000032_to000064_t0036.68s_to0077.63s',
       'classic_A. Baker Sweet Love_idx000048_to000080_t0058.57s_to0082.08s',
       'classic_A. Baker Sweet Love_idx000064_to000096_t0077.63s_to0087.77s',
       'classic_A. Baker Sweet Love_idx000080_to000112_t0082.08s_to0092.32s',
       'classic_A. Baker Sweet Love_idx000096_to000128_t0087.77s_to0103.12s',
       'classic_A. Baker Sweet Love_idx000112_to000144_t0092.32s_to0128.14s',
       'classic_A. Baker Sweet Love_idx000128_to000160_t0103.12s_to0136.58s',
       'classic_A. Baker Sweet Love_idx000144_to000176_t0128.14s_to0141.13s',
       'classic_A. Baker Sweet Love_idx000160_to000192_t0136.58s_to0146.82s',
       'classic_A. Baker Sweet Love_idx000176_to000208_t0141.13s_to0151.37s',
       'classic_A. Baker Sweet Love_idx000192_to000224_t0146.82s

In [53]:
# Display the interval snippets stored in snippets_old.npz.
data_old['intervals']

array([[  0,  -3,  -5, ...,   4,   0, -11],
       [  0,   2,  -4, ...,  -5,   0,   3],
       [  4,   3,   4, ...,   2,  -7,   7],
       ...,
       [ -1,  -5,   1, ...,   0,   0,   0],
       [  2,  -5,   3, ...,   3,   2,  -5],
       [  0,   3, -10, ...,  -3,  -2,  -2]], dtype=int32)