In [None]:
!pip install transformers

In [None]:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

In [1]:
import os
import time
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StringIndexerModel
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.functions import array_to_vector

def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and time the call.

    Returns:
        A ``(result, seconds)`` tuple, where ``seconds`` is the difference
        between two ``time.perf_counter()`` readings taken around the call.

    Any exception raised by ``func`` propagates unchanged.
    """
    started_at = time.perf_counter()
    outcome = func(*args, **kwargs)
    elapsed = time.perf_counter() - started_at
    return outcome, elapsed

measure_time

<function __main__.measure_time(func, *args, **kwargs)>

In [5]:
# ------------------------------------------------------------
# 1️⃣ Set up the Spark session (standalone cluster master)
# ------------------------------------------------------------
# Builds (or fetches) a session named "RAID-INFERENCE" against the
# spark://spark-master:7077 master and records how long that took.
# NOTE(review): another cell in this notebook builds a session with the same
# app name against local[*]; getOrCreate() presumably hands back an
# already-running session, so whichever cell runs first decides the master —
# confirm which one is intended and drop the other.
spark, t_spark = measure_time(
    lambda: SparkSession.builder
        .appName("RAID-INFERENCE")
        .master("spark://spark-master:7077")
        .getOrCreate()
)
# Report the elapsed seconds returned by measure_time.
print(f"✅ Spark session aktif (waktu: {t_spark:.4f} detik)")

✅ Spark session aktif (waktu: 2.4786 detik)


In [2]:
# Builds (or fetches) a session named "RAID-INFERENCE" with the local[*]
# master and records how long that took.
# NOTE(review): another cell in this notebook builds a session with the same
# app name against spark://spark-master:7077; getOrCreate() presumably hands
# back an already-running session, so whichever cell runs first decides the
# master — confirm which one is intended and drop the other.
spark, time_spark = measure_time(
    lambda: SparkSession.builder
        .appName("RAID-INFERENCE")
        .master("local[*]")
        .getOrCreate()
)
# Report the elapsed seconds returned by measure_time.
print(f"✅ Spark session aktif (waktu: {time_spark:.4f} detik)")

✅ Spark session aktif (waktu: 1.7506 detik)


In [3]:
# Resolve the saved pipeline directory against the current working directory
# and echo the absolute path so a wrong working directory is easy to spot.
MODEL_PATH = os.path.abspath(
    os.path.join(os.curdir, "backup_model-sbert-lr_human-gpt4")
)
print(f"[DEBUG] Path absolut model: {MODEL_PATH}")

[DEBUG] Path absolut model: /home/jovyan/work/backup_model-sbert-lr_human-gpt4


In [4]:
# ------------------------------------------------------------
# 2️⃣ Load the pretrained pipeline from the local directory
# ------------------------------------------------------------
print(f"[2] Memuat model dari lokal: {MODEL_PATH} ...")
# Hand the loader and its argument straight to measure_time; it forwards
# positional arguments to the callable, so no wrapper is needed.
model, time_model = measure_time(PipelineModel.load, MODEL_PATH)
print(f"✅ Model berhasil dimuat dalam {time_model:.4f} detik.")

[2] Memuat model dari lokal: /home/jovyan/work/backup_model-sbert-lr_human-gpt4 ...
✅ Model berhasil dimuat dalam 4.2590 detik.


In [5]:
# ------------------------------------------------------------
# 3️⃣ Load the sentence-transformer encoder (MiniLM)
# ------------------------------------------------------------
# Progress message printed before the tokenizer and encoder are loaded.
print("[3] Memuat tokenizer & model MiniLM untuk ekstraksi fitur...")

def load_miniLM(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Load the tokenizer and encoder used for feature extraction.

    Args:
        model_name: Model id or path passed to both ``from_pretrained``
            calls, so tokenizer and encoder always come from the same
            checkpoint. Defaults to the MiniLM checkpoint this notebook
            has always used.

    Returns:
        A ``(tokenizer, encoder, device)`` tuple. The encoder is switched to
        eval mode and moved to ``device``, which is CUDA when
        ``torch.cuda.is_available()`` and CPU otherwise.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)
    # Inference only: put the module in evaluation mode.
    encoder.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder.to(device)
    return tokenizer, encoder, device

# Time the load and unpack the returned (tokenizer, encoder, device) tuple
# into notebook-level names; get_embedding reads these three as globals.
(tokenizer, encoder, device), time_minilm = measure_time(load_miniLM)
print(f"✅ MiniLM siap dalam {time_minilm:.4f} detik (device: {device}).")

[3] Memuat tokenizer & model MiniLM untuk ekstraksi fitur...


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

✅ MiniLM siap dalam 12.1216 detik (device: cpu).


In [6]:
# ------------------------------------------------------------
# 4️⃣ Fungsi get_embedding()
# ------------------------------------------------------------
def get_embedding(text: str):
    """Return a 384-dimensional MiniLM sentence embedding for ``text``.

    Missing (``pd.isna``) or whitespace-only input yields a list of 384 zeros.
    Otherwise the text is tokenized (truncated at ``max_length=512``), run
    through the encoder without gradient tracking, mean-pooled over the
    attention mask and L2-normalized.

    Returns:
        A plain Python list of floats (the first and only row of the batch).
    """
    # Guard clause: nothing to encode, so skip the tokenizer and encoder.
    if pd.isna(text) or text.strip() == "":
        return [0.0] * 384

    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )
    # Move every tokenizer output tensor to the encoder's device.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state

    # Mean pooling: weight each token vector by its attention-mask value,
    # sum over the token axis, then divide by the mask total (clamped so the
    # division can never hit zero).
    mask = batch['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = F.normalize(summed / counts, p=2, dim=1)

    return pooled.cpu().numpy()[0].tolist()
get_embedding

<function __main__.get_embedding(text: str)>

In [9]:
# ------------------------------------------------------------
# 5️⃣ Fungsi classify_text(text)
# ------------------------------------------------------------
def classify_text(text: str):
    """Classify ``text`` with the loaded Spark pipeline and report timings.

    Steps: embed the text with ``get_embedding``, wrap the vector in a
    one-row Spark DataFrame with a ``features`` column, run
    ``model.transform`` and collect the ``prediction`` / ``probability``
    columns of the first row, then map the predicted index to a label name.

    Returns:
        A dict with ``input_text`` (first 100 characters, followed by "..."
        when the text is longer), ``predicted_label``, ``probabilities``
        (label -> probability) and ``timing`` (seconds per stage plus their
        sum, each rounded to 4 decimals).
    """
    # [a] Embedding
    emb, time_emb = measure_time(get_embedding, text)

    # [b] Wrap the embedding in a one-row Spark DataFrame
    def _build_frame():
        return spark.createDataFrame([Row(features=Vectors.dense(emb))])

    df, time_df = measure_time(_build_frame)

    # [c] Prediction (transform + collect are timed together)
    def _predict_first_row():
        return model.transform(df).select("prediction", "probability").collect()[0]

    pred_row, time_pred = measure_time(_predict_first_row)

    # [d] Interpret the result. The first pipeline stage is assumed to be the
    # fitted StringIndexer whose ``labels`` give the class names by index
    # (e.g. ['gpt4', 'human']).
    labels = model.stages[0].labels
    predicted_label = labels[int(pred_row['prediction'])]
    prob_vector = pred_row['probability']
    prob_dict = {name: float(prob_vector[idx]) for idx, name in enumerate(labels)}

    # Preview of the input: at most 100 characters, with an ellipsis if cut.
    if len(text) > 100:
        preview = text[:100] + "..."
    else:
        preview = text[:100] + ""

    stage_seconds = (
        ("embedding", time_emb),
        ("dataframe", time_df),
        ("prediction", time_pred),
        ("total", time_emb + time_df + time_pred),
    )

    return {
        "input_text": preview,
        "predicted_label": predicted_label,
        "probabilities": prob_dict,
        "timing": {stage: round(seconds, 4) for stage, seconds in stage_seconds},
    }

classify_text

<function __main__.classify_text(text: str)>

In [11]:
# ------------------------------------------------------------
# 6️⃣ Example run
# ------------------------------------------------------------
sample_text_gpt4 = "The sun dipped below the horizon, painting the sky with hues of amber and rose."

print("[6] Menguji teks GPT-4...")
# The outer timing wraps the whole classify_text call, in addition to the
# per-stage timings that classify_text reports itself.
result_gpt4, time_gpt4 = measure_time(classify_text, sample_text_gpt4)
print(f"✅ Selesai dalam {time_gpt4:.4f} detik\n")

result_gpt4

[6] Menguji teks GPT-4...
✅ Selesai dalam 0.2648 detik



{'input_text': 'The sun dipped below the horizon, painting the sky with hues of amber and rose.',
 'predicted_label': 'gpt4',
 'probabilities': {'gpt4': 0.8959879081761304, 'human': 0.1040120918238696},
 'timing': {'embedding': 0.0067,
  'dataframe': 0.0155,
  'prediction': 0.2405,
  'total': 0.2626}}

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52352)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =