In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.6/486.6 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.1
[0m

In [1]:
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import json
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
print("Memuat model sentence-transformers...")
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model.eval()  # Set ke mode evaluasi
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model

Memuat model sentence-transformers...


2025-10-22 11:16:35.954912: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-22 11:16:35.986162: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-22 11:16:36.717743: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [11]:
# === Konfigurasi ===
input_file = "train.csv"
output_file = "raid-filtered.jsonl"
targets = ['none', 'paraphrase', 'synonym', 'alternative_spelling']

In [5]:
# === Hitung total baris untuk tqdm ===
print("Menghitung total baris...")
total_lines = sum(1 for _ in open(input_file, 'r', encoding='utf-8')) - 1  # -1 untuk header
print(f"Total baris: {total_lines:,}")

Menghitung total baris...
Total baris: 75,840,050


In [12]:
def get_embedding(text: str):
    if pd.isna(text) or text.strip() == "":
        # Jika teks kosong/NaN, kembalikan vektor nol dengan dimensi 384
        return [0.0] * 384
    # Tokenisasi
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )
    # Pindahkan ke device
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        output = model(**encoded)
    # Mean pooling
    token_embeddings = output.last_hidden_state
    attention_mask = encoded['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()[0].tolist()  # Konversi ke list Python

# === Proses per baris ===
print("Memulai pemrosesan dan penyaringan...")
with open(output_file, "w", encoding="utf-8") as out_file:
    with tqdm(total=total_lines, desc="Processing rows", unit="row") as pbar:
        for chunk in pd.read_csv(input_file, chunksize=1):
            row = chunk.iloc[0]

            # Filter berdasarkan 'attack'
            if row['attack'] not in targets:
                pbar.update(1)
                continue

            gen_text = row['generation']

            # Pastikan generation adalah string dan panjang > 50
            if not isinstance(gen_text, str) or len(gen_text) <= 50:
                pbar.update(1)
                continue

            # Ekstrak embedding
            try:
                feat = get_embedding(gen_text)
            except Exception as e:
                print(f"\n⚠️ Error embedding baris id={row['id']}: {e}")
                feat = [0.0] * 384

            record = {
                "id": row["id"],
                "model": row["model"],
                "domain": row["domain"],
                "attack": row["attack"],
                "generation": gen_text,
                "features": feat
            }

            out_file.write(json.dumps(record, ensure_ascii=False) + "\n")
            pbar.update(1)

print(f"\n✅ Selesai! Hasil disimpan ke {output_file}")

Memulai pemrosesan dan penyaringan...


Processing rows:   7%|███▌                                            | 5615820/75840050 [3:30:35<43:53:18, 444.46row/s]


✅ Selesai! Hasil disimpan ke raid-filtered.jsonl



