# EKSTRAKSI FITUR LIRIK



In [None]:
# 1. Mount Google Drive
from google.colab import drive
import os
import pandas as pd
import glob
import torch
from transformers import AutoTokenizer, AutoModel

# Lakukan proses autentikasi. Akan muncul prompt untuk mengklik link
# dan memasukkan kode verifikasi.
print("--- Mounting Google Drive... ---")
drive.mount('/content/drive')

# 2. Instalasi Pustaka
!pip install transformers torch pandas
!pip install --upgrade pandas

--- Mounting Google Drive... ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the lyrics folder on Google Drive.
# NOTE: make sure the folder name is correct.
DRIVE_PATH = '/content/drive/MyDrive/DATASETMER/Lyrics'


def load_lyrics_dataframe(lyrics_dir):
    """
    Read every ``*.txt`` file in ``lyrics_dir`` into a DataFrame.

    Args:
        lyrics_dir: Folder that contains one UTF-8 text file per song.

    Returns:
        A DataFrame with the columns ``song_id`` (file name without its final
        extension) and ``lyrics`` (file content with surrounding whitespace
        stripped). Files whose content is empty after stripping are left out.
        The two columns are always present, even when no file could be loaded.
    """
    # glob returns paths in arbitrary order; sort so the row order is reproducible.
    file_paths = sorted(glob.glob(os.path.join(lyrics_dir, '*.txt')))

    if not file_paths:
        print("PERINGATAN: Tidak ada file .txt yang ditemukan di folder tersebut.")
        return pd.DataFrame(columns=['song_id', 'lyrics'])

    print(f"Ditemukan {len(file_paths)} file lirik.")

    records = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                lyrics_content = f.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Gagal memproses file {file_path}: {e}")
            continue

        # Skip songs whose lyrics are empty once whitespace is removed.
        if not lyrics_content:
            continue

        # splitext removes only the trailing extension; str.replace would also
        # remove '.txt' occurring in the middle of a file name.
        song_name = os.path.splitext(os.path.basename(file_path))[0]
        records.append({'song_id': song_name, 'lyrics': lyrics_content})

    # Explicit columns keep the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=['song_id', 'lyrics'])


# Always (re)bind df_real so a re-run never leaves a stale frame from an
# earlier execution behind; later cells treat an empty frame as "no lyrics".
if not os.path.isdir(DRIVE_PATH):
    print(f"ERROR: Folder '{DRIVE_PATH}' tidak ditemukan. Mohon periksa kembali nama foldernya.")
    df_real = pd.DataFrame(columns=['song_id', 'lyrics'])
else:
    print(f"Folder lirik ditemukan di: {DRIVE_PATH}")
    df_real = load_lyrics_dataframe(DRIVE_PATH)

    if not df_real.empty:
        print("\n--- DataFrame Lirik Siap Diproses ---")
        print(df_real.head())
    print(f"Total lirik yang akan diproses: {len(df_real)}")

Folder lirik ditemukan di: /content/drive/MyDrive/DATASETMER/Lyrics
Ditemukan 764 file lirik.

--- DataFrame Lirik Siap Diproses ---
  song_id                                             lyrics
0     113  We've been here too long tryin' to get along\n...
1     105  Hey mama, don't you treat me wrong\nCome and l...
2     216  Monday morning feels so bad,\nEv'rybody seems ...
3     338  When are you gonna come down\nWhen are you goi...
4     372  Fox in the snow, where do you go\nTo find some...
Total lirik yang akan diproses: 764


In [None]:
# 3. Initialise the BERT model
MODEL_NAME = 'bert-base-uncased'

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)

    # Use the GPU when the runtime exposes one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # The model is only used for feature extraction, so put it in evaluation mode.
    model.eval()
    print(f"Model BERT ({MODEL_NAME}) siap di {device}")
except Exception as e:
    print(f"Gagal memuat model BERT: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this run and keeps the
    # traceback visible for debugging.
    raise


def extract_bert_features_from_dataframe(dataframe, lyrics_col='lyrics', batch_size=16, max_length=512,
                                         tokenizer=None, model=None, device=None):
    """
    Extract one feature vector per lyric in ``dataframe``, processed in batches.

    The feature is the model's ``pooler_output``. For Hugging Face BERT models
    this is the hidden state of the first ([CLS]) token after the model's
    pooling layer, not the raw [CLS] hidden state.

    Args:
        dataframe: DataFrame holding the lyrics.
        lyrics_col: Name of the column that contains the lyric text.
        batch_size: Number of lyrics sent through the model per forward pass.
        max_length: Maximum number of tokens per lyric; longer texts are truncated.
        tokenizer: Callable tokenizer. Defaults to the module-level ``tokenizer``.
        model: Model to run. Defaults to the module-level ``model``.
        device: Torch device for the input tensors. Defaults to the module-level
            ``device``.

    Returns:
        A NumPy array with one row per row of ``dataframe`` (in the same order),
        or ``None`` when ``dataframe`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        NameError: If tokenizer/model/device is neither passed in nor defined at
            module level.
    """
    import numpy as np

    if dataframe.empty:
        print("DataFrame kosong. Tidak ada lirik untuk diproses.")
        return None

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    module_globals = globals()

    def _resolve(name, value):
        """Return the explicit argument, or fall back to the module-level object."""
        if value is not None:
            return value
        if name not in module_globals:
            raise NameError(
                f"name '{name}' is not defined; pass it explicitly or run the model initialisation cell first"
            )
        return module_globals[name]

    tokenizer = _resolve('tokenizer', tokenizer)
    model = _resolve('model', model)
    device = _resolve('device', device)

    # The tokenizer only accepts strings: map missing values to '' and coerce the rest.
    lyrics_list = dataframe[lyrics_col].fillna('').astype(str).tolist()

    encoded_inputs = tokenizer(
        lyrics_list,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt'
    )
    all_input_ids = encoded_inputs['input_ids']
    all_attention_masks = encoded_inputs['attention_mask']

    all_features = []
    total = len(lyrics_list)

    print(f"Memulai ekstraksi fitur untuk {len(dataframe)} lirik...")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Walk over the pre-tokenized tensors in order, batch_size rows at a time.
        for start in range(0, total, batch_size):
            stop = start + batch_size

            input_ids = all_input_ids[start:stop].to(device)
            attention_mask = all_attention_masks[start:stop].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            # Pooled representation of the first ([CLS]) token.
            cls_embedding = outputs.pooler_output

            # Move to the CPU before converting to NumPy.
            all_features.append(cls_embedding.cpu().numpy())

            # Simple progress counter; '\r' rewrites the same output line.
            progress = min(stop, total)
            print(f"Diproses: {progress}/{len(dataframe)}", end='\r')

    features_array = np.concatenate(all_features, axis=0)

    return features_array

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model BERT (bert-base-uncased) siap di cuda


In [None]:
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA tersedia: {cuda_ready}")

CUDA tersedia: True


In [None]:
# Run the NVIDIA CLI to show the GPU model, driver/CUDA version and memory usage.
!nvidia-smi

Sun Nov 30 11:48:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             25W /   70W |     574MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Guard first: nothing to do when the lyrics frame was never created or is empty.
if 'df_real' not in locals() or df_real.empty:
    print("Ekstraksi fitur tidak dapat dilanjutkan karena DataFrame lirik kosong atau tidak ada.")
else:
    # 4. Run the feature extraction (tune batch_size for speed/memory).
    lyric_features = extract_bert_features_from_dataframe(
        dataframe=df_real, lyrics_col='lyrics', batch_size=16
    )

    if lyric_features is not None:
        print("\n--- Hasil Akhir Ekstraksi Dataset ---")
        # The shape is (number_of_songs, feature_dimension).
        print(f"Dimensi Array Fitur: {lyric_features.shape}")

        # 5. Attach the features to the DataFrame, one vector per row.
        df_real['bert_features'] = [feature_row for feature_row in lyric_features]

        print("\nDataFrame dengan Fitur Lirik yang Ditambahkan:")
        print(df_real.head())

        # Optional: persist the extracted features to Drive. Pickle keeps the
        # NumPy arrays stored in the column intact.
        OUTPUT_PATH = '/content/drive/MyDrive/DATASETMER/bert_features_lyrics.pkl'
        df_real.loc[:, ['song_id', 'bert_features']].to_pickle(OUTPUT_PATH)
        print(f"\nFitur berhasil disimpan ke: {OUTPUT_PATH}")

Memulai ekstraksi fitur untuk 764 lirik...
Diproses: 764/764
--- Hasil Akhir Ekstraksi Dataset ---
Dimensi Array Fitur: (764, 768)

DataFrame dengan Fitur Lirik yang Ditambahkan:
  song_id                                             lyrics  \
0     113  We've been here too long tryin' to get along\n...   
1     105  Hey mama, don't you treat me wrong\nCome and l...   
2     216  Monday morning feels so bad,\nEv'rybody seems ...   
3     338  When are you gonna come down\nWhen are you goi...   
4     372  Fox in the snow, where do you go\nTo find some...   

                                       bert_features  
0  [-0.44423586, -0.4297126, -0.97955894, 0.40859...  
1  [-0.37149802, -0.48334154, -0.7943978, 0.25322...  
2  [-0.48773482, -0.45445037, -0.88204736, 0.1876...  
3  [-0.86978036, -0.77491546, -0.99366283, 0.8440...  
4  [-0.72125137, -0.6607976, -0.99624836, 0.73752...  

Fitur berhasil disimpan ke: /content/drive/MyDrive/DATASETMER/bert_features_lyrics.pkl


Link Fitur: https://drive.google.com/file/d/1_QiL57KZ_n0uExhIjlX35x7QylA-Sv_8/view?usp=sharing
