<a href="https://colab.research.google.com/github/room-creator/sample/blob/main/resources/inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install piano_transcription_inference

In [1]:
!apt install ffmpeg wget
!pip uninstall librosa numpy -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Found existing installation: librosa 0.8.1
Uninstalling librosa-0.8.1:
  Successfully uninstalled librosa-0.8.1
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2


In [2]:
!pip install piano_transcription_inference

Collecting librosa (from piano_transcription_inference)
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting numpy>=1.22.3 (from librosa->piano_transcription_inference)
  Using cached numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
  Using cached numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached librosa-0.11.0-py3-none-any.whl (260 kB)
Using cached numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.2 MB)
Installing collected packages: numpy, librosa
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [librosa]
[1A[2KSuccessfully installed librosa-0.11.0 numpy-2.0.2


In [3]:
from IPython.display import Audio

# Transcribe using code

In [4]:
import numpy as np
import os
import librosa
import audioread

def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that ffmpeg backend is
    always used in this function."""

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                               * n_channels)

        n = 0

        for frame in input_file:
            # Modified to use librosa.util.buf_to_float
            frame = librosa.util.buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)

        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)

In [5]:
from piano_transcription_inference import PianoTranscription, sample_rate

def transcribe(audio_path, output_midi_path):
    # Load audio
    audio, _ = load_audio(audio_path, sr=sample_rate, mono=True)

    # Transcriptor
    transcriptor = PianoTranscription(device='cpu', checkpoint_path=None)

    # Transcribe and write out to MIDI file
    transcriptor.transcribe(audio, output_midi_path)

In [6]:
!wget https://github.com/qiuqiangkong/piano_transcription_inference/raw/master/resources/cut_liszt.mp3

--2025-11-15 02:52:15--  https://github.com/qiuqiangkong/piano_transcription_inference/raw/master/resources/cut_liszt.mp3
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/qiuqiangkong/piano_transcription_inference/master/resources/cut_liszt.mp3 [following]
--2025-11-15 02:52:16--  https://raw.githubusercontent.com/qiuqiangkong/piano_transcription_inference/master/resources/cut_liszt.mp3
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 240633 (235K) [audio/mpeg]
Saving to: ‘cut_liszt.mp3.2’


2025-11-15 02:52:16 (60.0 MB/s) - ‘cut_liszt.mp3.2’ saved [240633/240633]



In [8]:
# Audio('05_緑の昼下がり.aif', rate=sample_rate)

In [9]:
transcribe('05_緑の昼下がり.aif', '05_緑の昼下がり.mid')

Checkpoint path: /root/piano_transcription_inference_data/note_F1=0.9677_pedal_F1=0.9186.pth
Total size: ~165 MB
Using cuda for inference.
GPU number: 1
Segment 0 / 33
Segment 1 / 33
Segment 2 / 33
Segment 3 / 33
Segment 4 / 33
Segment 5 / 33
Segment 6 / 33
Segment 7 / 33
Segment 8 / 33
Segment 9 / 33
Segment 10 / 33
Segment 11 / 33
Segment 12 / 33
Segment 13 / 33
Segment 14 / 33
Segment 15 / 33
Segment 16 / 33
Segment 17 / 33
Segment 18 / 33
Segment 19 / 33
Segment 20 / 33
Segment 21 / 33
Segment 22 / 33
Segment 23 / 33
Segment 24 / 33
Segment 25 / 33
Segment 26 / 33
Segment 27 / 33
Segment 28 / 33
Segment 29 / 33
Segment 30 / 33
Segment 31 / 33
Segment 32 / 33
Segment 33 / 33
Write out to 05_緑の昼下がり.mid


# Render midi plot

In [None]:
!apt install git

In [None]:
!git clone https://github.com/bytedance/piano_transcription.git

In [None]:
pip install -r piano_transcription/requirements.txt

In [None]:
!mkdir results

In [None]:
import sys
sys.path.append('piano_transcription/utils')
sys.path.append('piano_transcription/pytorch')
from collections import namedtuple

plot_args = namedtuple('PlotArgs', ['audio_path', 'midi_path'])

from plot_for_paper import plot_midi
plot_midi(plot_args('cut_liszt.mp3', 'cut_liszt.mid'))