In [None]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel instead of whatever `pip` is first on PATH.
%pip install ipywidgets tqdm --upgrade
# NOTE(review): the two commands below look like manual widget-extension setup for
# older Jupyter Notebook / JupyterLab releases — confirm they are still required
# in the target environment before relying on them.
!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os

# Notebook-wide compute device: "cuda" when PyTorch reports a usable GPU, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

def extract_text_feature(
    text,
    model_name="FacebookAI/xlm-roberta-base",
    model=None,
    tokenizer_name="FacebookAI/xlm-roberta-base",
    tokenizer=None,
    device=None
):
    """
    Compute mean-pooled sentence embeddings for one or more texts with a
    Transformer encoder (XLM-RoBERTa by default).

    Parameters
    ----------
    text : str or list[str]
        A single text or a list of texts. A single string is wrapped in a list.
    model_name : str, optional
        Checkpoint name passed to ``AutoModel.from_pretrained`` when ``model``
        is not supplied (default: "FacebookAI/xlm-roberta-base").
    model : transformers.AutoModel, optional
        An already-loaded model; pass it to avoid reloading on every call.
    tokenizer_name : str, optional
        Checkpoint name passed to ``AutoTokenizer.from_pretrained`` when
        ``tokenizer`` is not supplied. Its default is a fixed string that
        matches the default ``model_name``; it does NOT follow ``model_name``
        automatically, so pass both when overriding the model.
    tokenizer : transformers.AutoTokenizer, optional
        An already-loaded tokenizer.
    device : str, optional
        'cpu' or 'cuda'. When None, 'cuda' is used if available, else 'cpu'.

    Returns
    -------
    np.ndarray
        Embedding matrix of shape [n_texts, hidden_size]. For an empty
        list/tuple the result has shape [0, hidden_size], where hidden_size is
        read from ``model.config.hidden_size`` (0 if that attribute is absent).
    """

    # Always work with a list of texts
    if isinstance(text, str):
        text = [text]

    # Auto-detect the device when the caller did not choose one
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the tokenizer only when none was provided
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load the model only when none was provided
    if model is None:
        model = AutoModel.from_pretrained(model_name)

    model = model.to(device)
    model.eval()  # disable dropout so repeated calls give the same embeddings

    # Nothing to embed: return an empty matrix instead of handing the
    # tokenizer an empty batch.
    if isinstance(text, (list, tuple)) and len(text) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Tokenize; padding aligns the batch, truncation caps overly long inputs
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state  # [batch, seq_len, hidden_size]

        # Masked mean pooling: padded positions contribute nothing to the sum
        attention_mask = inputs["attention_mask"]
        masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)
        # Clamp the token count to at least 1 so a row whose mask is all zeros
        # yields a zero vector instead of NaN from a 0/0 division.
        token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        sentence_embeddings = masked_embeddings.sum(dim=1) / token_counts

    # Move the result to host memory as a NumPy array
    embeddings = sentence_embeddings.cpu().numpy()

    return embeddings

# Cell output: show the function object and the selected device as a quick sanity check.
extract_text_feature, device

(<function __main__.extract_text_feature(text, model_name='FacebookAI/xlm-roberta-base', model=None, tokenizer_name='FacebookAI/xlm-roberta-base', tokenizer=None, device=None)>,
 'cuda')

In [2]:
# Checkpoint identifier shared by the model and tokenizer loading cells below.
model_name = "FacebookAI/xlm-roberta-base"
model_name

'FacebookAI/xlm-roberta-base'

In [3]:
# Load the pretrained model once and move it to the selected device so that
# later calls can pass it in instead of reloading it.
model = AutoModel.from_pretrained(model_name).to(device)
# Cell output: the module's repr.
model

2025-11-05 09:12:38.206456: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-05 09:12:38.242139: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-05 09:12:39.106747: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine

In [4]:
# Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Cell output: the tokenizer's repr.
tokenizer

XLMRobertaTokenizerFast(name_or_path='FacebookAI/xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)

In [5]:
# Smoke test: embed a single sentence with the preloaded model/tokenizer.
# `device` is not passed here, so the function auto-detects it.
embeddings = extract_text_feature('This is a story about rafy aa!', model=model, tokenizer=tokenizer)
# A single input string yields one row of embeddings.
print(embeddings.shape)

(1, 768)


In [6]:
# --- Configuration for the feature-extraction run ---
# CSV file to read.
input_file = "train.csv"
# JSON Lines file that receives one record per kept row.
output_file = "raid_xlm-roberta_human-gpt4.jsonl"
# Only rows whose `model_column` value is in this list are processed.
filtered_models = ['human', 'gpt4']
# Column holding the text to embed.
text_column = "generation"
# Column holding the generator label used for filtering.
model_column = "model"
# Number of CSV rows pandas yields per chunk.
# NOTE(review): with a value of 1 every chunk holds a single row; a larger value
# would presumably reduce per-chunk overhead — confirm against memory limits.
chunksize = 1

# Cell output: echo the configuration.
input_file, output_file, filtered_models, text_column, model_column, chunksize

('train.csv',
 'raid_xlm-roberta_human-gpt4.jsonl',
 ['human', 'gpt4'],
 'generation',
 'model',
 1)

In [7]:
# === Measure the input size so the progress bar can track bytes consumed ===
total_size = os.path.getsize(input_file)
print(f"Memproses file besar ({total_size / 1e9:.2f} GB): {input_file}")

# Upper bound on texts per forward pass, so a large `chunksize` cannot
# produce an oversized batch.
feature_batch_size = 32

count = 0    # records written
skipped = 0  # rows dropped because the text column was missing

# Opening with "w" truncates any previous output. The context managers close
# both files and the progress bar even if extraction fails midway.
with open(output_file, "w", encoding="utf-8") as out_file:
    with open(input_file, "rb") as f:
        with tqdm(total=total_size, unit='B', unit_scale=True, desc="Extracting", dynamic_ncols=True) as progress:
            reader = pd.read_csv(f, chunksize=chunksize)
            last_pos = 0

            for chunk in reader:
                # Advance the bar by the bytes read from the file since the previous chunk
                current_pos = f.tell()
                progress.update(current_pos - last_pos)
                last_pos = current_pos

                # Keep only rows produced by the target generators
                filtered = chunk[chunk[model_column].isin(filtered_models)]
                if filtered.empty:
                    continue

                # Missing texts would otherwise be stringified to the literal
                # "nan" and embedded as if that were the content; skip them.
                has_text = filtered[text_column].notna()
                skipped += int((~has_text).sum())
                rows = [row for _, row in filtered[has_text].iterrows()]

                # Embed the surviving rows in mini-batches (one model call per batch)
                for start in range(0, len(rows), feature_batch_size):
                    batch_rows = rows[start:start + feature_batch_size]
                    texts = [str(row[text_column]) for row in batch_rows]
                    batch_features = extract_text_feature(
                        texts,
                        model=model,
                        tokenizer=tokenizer,
                        device=device
                    )

                    for row, text, features in zip(batch_rows, texts, batch_features):
                        record = {
                            "id": row.get("id", None),
                            "model": row[model_column],
                            "domain": row.get("domain", None),
                            "attack": row.get("attack", None),
                            "generation": text,
                            "features": features.tolist()
                        }

                        # One JSON object per line (JSON Lines)
                        out_file.write(json.dumps(record, ensure_ascii=False) + "\n")

                        count += 1

if skipped:
    print(f"Dilewati {skipped:,} baris tanpa teks")
print(f"\n✅ Selesai! Total {count:,} baris disimpan ke {output_file}")

Memproses file besar (11.78 GB): train.csv


Extracting: 100%|█████████████████████████████████████████████████████████████████| 11.8G/11.8G [3:39:50<00:00, 893kB/s]


✅ Selesai! Total 481,356 baris disimpan ke raid_xlm-roberta_human-gpt4.jsonl



