## EKSTRAKSI FITUR MIDI

In [None]:
!pip install miditoolkit mido
!pip install transformers torch pandas
# You may also need to install a library that implements the REMI Transformer
# (for example, if it lives in a specific GitHub repository, clone it or pip-install it).

import os
import glob
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import miditoolkit
import numpy as np

# Mount Google Drive at /content/drive (make sure this has been done before
# running the later cells, which read from and write to paths under this mount point).
from google.colab import drive
drive.mount('/content/drive')

Collecting miditoolkit
  Downloading miditoolkit-1.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading miditoolkit-1.0.1-py3-none-any.whl (24 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido, miditoolkit
Successfully installed miditoolkit-1.0.1 mido-1.3.3
Mounted at /content/drive


In [None]:
def midi_to_remi_tokens(midi_path, max_length=512):
    """
    Turn a MIDI file into a simplified, REMI-like token string.

    NOTE: this is only a stand-in for a real REMI encoding, which is far more
    detailed and needs a dedicated library. Here every note of every
    instrument track is collected, the notes are ordered by start time, and
    the pitches of the first ``max_length`` notes become the tokens.

    Args:
        midi_path: Path of the MIDI file to parse.
        max_length: Maximum number of note events (tokens) to keep.

    Returns:
        The pitches as a single space-separated string. An empty string is
        returned when the file has no notes or when any exception occurs
        while processing it (the error is printed, not raised).
    """
    try:
        parsed = miditoolkit.midi.parser.MidiFile(midi_path)

        # Flatten all tracks into (pitch, start) pairs.
        pitch_onsets = [
            (note.pitch, note.start)
            for instrument in parsed.instruments
            for note in instrument.notes
        ]

        # Order by start time; the sort is stable, so notes that start
        # together keep the order in which the tracks listed them.
        ordered = sorted(pitch_onsets, key=lambda pair: pair[1])

        # Keep only the pitch of the first `max_length` events.
        return " ".join(str(pitch) for pitch, _ in ordered[:max_length])

    except Exception as e:
        print(f"Gagal memproses MIDI {midi_path}: {e}")
        return ""

def load_midi_dataset(midi_dir, base_dir='/content/drive/MyDrive'):
    """
    Load every ``*.mid`` file in a directory into a DataFrame of token strings.

    Args:
        midi_dir: Directory holding the MIDI files, relative to ``base_dir``
            (an absolute path is used as-is, because ``os.path.join``
            discards ``base_dir`` in that case).
        base_dir: Root folder that ``midi_dir`` is resolved against. Defaults
            to the Google Drive mount used by this notebook.

    Returns:
        A DataFrame with the columns ``song_id`` (file name without its
        extension) and ``remi_sequence`` (output of ``midi_to_remi_tokens``).
        Files for which the conversion yields an empty string are skipped.
        The columns are present even when no file could be loaded.
    """
    dataset_dir = os.path.join(base_dir, midi_dir)

    # glob returns paths in arbitrary filesystem order; sort them so the row
    # order is the same on every run.
    file_paths = sorted(glob.glob(os.path.join(dataset_dir, '*.mid')))

    print(f"Ditemukan {len(file_paths)} file MIDI...")

    records = []
    for file_path in file_paths:
        # splitext removes only the trailing extension; str.replace would also
        # strip '.mid' from the middle of a name such as 'a.middle.mid'.
        song_id = os.path.splitext(os.path.basename(file_path))[0]

        # Convert to the (simulated) REMI token string.
        remi_tokens = midi_to_remi_tokens(file_path)

        # An empty string means the file failed to parse or had no notes.
        if remi_tokens:
            records.append({
                'song_id': song_id,
                'remi_sequence': remi_tokens
            })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=['song_id', 'remi_sequence'])

# Load the dataset from Drive, then preview the first rows under a header line.
midi_df = load_midi_dataset('DATASETMER/MIDIs')
print("\nDataFrame MIDI Siap:", midi_df.head(), sep="\n")

Ditemukan 196 file MIDI...
Gagal memproses MIDI /content/drive/MyDrive/DATASETMER/MIDIs/097.mid: data byte must be in range 0..127
Gagal memproses MIDI /content/drive/MyDrive/DATASETMER/MIDIs/009.mid: data byte must be in range 0..127

DataFrame MIDI Siap:
  song_id                                      remi_sequence
0     231  65 60 57 54 36 54 65 60 57 54 54 40 54 54 65 6...
1     209  42 42 42 42 42 42 35 47 54 40 52 52 54 40 71 6...
2     268  50 57 62 38 49 36 48 55 60 36 36 42 43 47 50 5...
3     286  42 42 42 42 48 56 42 36 42 42 45 42 36 56 42 4...
4     369  65 62 69 65 69 74 74 69 69 74 74 69 69 74 74 6...


In [None]:
# --- Assumption: BERT stands in for the Transformer architecture (demonstration only) ---
# If you have an actual REMI Transformer, change MIDI_TRANSFORMER_NAME and the model class.

MIDI_TRANSFORMER_NAME = 'bert-base-uncased' # Replace with your REMI model

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MIDI_TRANSFORMER_NAME)

# .to() and .eval() both return the module itself, so the calls can be chained:
# move the weights to the device and switch to inference mode.
model = AutoModel.from_pretrained(MIDI_TRANSFORMER_NAME).to(device).eval()

print(f"Model Transformer MIDI siap di {device}")


def extract_midi_features_from_dataframe(dataframe, sequence_col, batch_size=16, max_length=512):
    """
    Embed each token string of a DataFrame column with the Transformer.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        dataframe: DataFrame holding the token strings.
        sequence_col: Name of the column with one token string per row.
        batch_size: Number of sequences sent through the model at once.
        max_length: Sequences are truncated to this many tokenizer tokens.

    Returns:
        A NumPy array with one feature vector per row, in row order. The
        vector is the model's ``pooler_output`` when available; for models
        without a pooling head it falls back to the last hidden state of the
        first ([CLS]) token. An empty input yields an array with zero rows.
    """
    sequences = dataframe[sequence_col].tolist()

    # Nothing to embed: np.concatenate([]) would raise, so return an empty
    # array whose width matches the model's hidden size when it is known.
    if not sequences:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Tokenize the (simulated) MIDI token strings; padding=True pads every
    # sequence to the longest one so they stack into a single tensor.
    encoded_inputs = tokenizer(
        sequences,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt'
    )

    dataset = torch.utils.data.TensorDataset(
        encoded_inputs['input_ids'],
        encoded_inputs['attention_mask']
    )
    # shuffle=False keeps the features aligned with the DataFrame rows.
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_features = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device)
            )

            # Prefer the pooled [CLS] representation. Models without a pooler
            # either lack the attribute or set it to None; use the raw
            # first-token hidden state in that case instead of crashing.
            sequence_embedding = getattr(outputs, "pooler_output", None)
            if sequence_embedding is None:
                sequence_embedding = outputs.last_hidden_state[:, 0]

            all_features.append(sequence_embedding.cpu().numpy())

    return np.concatenate(all_features, axis=0)

# --- Run the extraction ---
if midi_df.empty:
    print("DataFrame MIDI kosong. Tidak ada fitur yang diekstrak.")
else:
    midi_features = extract_midi_features_from_dataframe(
        dataframe=midi_df, sequence_col='remi_sequence', batch_size=16
    )

    # Split the 2-D feature array into one vector per row of the DataFrame.
    midi_df['midi_features'] = [row_vector for row_vector in midi_features]

    print("\n--- Hasil Akhir Ekstraksi Fitur MIDI ---")
    print(f"Dimensi Array Fitur: {midi_features.shape}")
    print(midi_df.head())

    # Persist only the id and its feature vector.
    OUTPUT_PATH = '/content/drive/MyDrive/DATASETMER/midi_features_remi.pkl'
    features_only = midi_df[['song_id', 'midi_features']]
    features_only.to_pickle(OUTPUT_PATH)
    print(f"\nFitur MIDI berhasil disimpan ke: {OUTPUT_PATH}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model Transformer MIDI siap di cuda

--- Hasil Akhir Ekstraksi Fitur MIDI ---
Dimensi Array Fitur: (194, 768)
  song_id                                      remi_sequence  \
0     231  65 60 57 54 36 54 65 60 57 54 54 40 54 54 65 6...   
1     209  42 42 42 42 42 42 35 47 54 40 52 52 54 40 71 6...   
2     268  50 57 62 38 49 36 48 55 60 36 36 42 43 47 50 5...   
3     286  42 42 42 42 48 56 42 36 42 42 45 42 36 56 42 4...   
4     369  65 62 69 65 69 74 74 69 69 74 74 69 69 74 74 6...   

                                       midi_features  
0  [-0.41660434, -0.5257481, -0.58104336, 0.39111...  
1  [-0.3378141, -0.4824051, -0.7604141, 0.2396558...  
2  [-0.38670248, -0.5405592, -0.7906979, 0.314394...  
3  [-0.29987583, -0.513034, -0.76374316, 0.293099...  
4  [0.033382602, -0.486103, -0.9400674, 0.0877205...  

Fitur MIDI berhasil disimpan ke: /content/drive/MyDrive/DATASETMER/midi_features_remi.pkl


LINK FITUR MIDI: https://drive.google.com/file/d/1598Sa39PIbSBZ-N6HCjih5Z7hOwfbf9w/view?usp=sharing