In [None]:
!pip install transformers

In [None]:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

In [1]:
import os
import time
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StringIndexerModel
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.functions import array_to_vector

def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and report how long the call took.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed)`` tuple where ``result`` is whatever ``func``
        returned and ``elapsed`` is the wall-clock duration in seconds as
        measured by ``time.perf_counter``.

    Any exception raised by ``func`` propagates unchanged; no timing is
    returned in that case.
    """
    t0 = time.perf_counter()
    outcome = func(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    return outcome, elapsed

measure_time

<function __main__.measure_time(func, *args, **kwargs)>

In [2]:
# ------------------------------------------------------------
# 1️⃣ Setup Spark Session
# ------------------------------------------------------------
# spark, t_spark = measure_time(
#     lambda: SparkSession.builder
#         .appName("RAID-INFERENCE")
#         .master("spark://spark-master:7077")
#         .getOrCreate()
# )
# print(f"✅ Spark session aktif (waktu: {t_spark:.4f} detik)")

In [3]:
# Start (or reuse) a Spark session in local mode on all available cores,
# timing how long the builder takes to hand back a session.
spark, time_spark = measure_time(
    lambda: (
        SparkSession.builder
        .master("local[*]")
        .appName("RAID-INFERENCE")
        .getOrCreate()
    )
)
print(f"✅ Spark session aktif (waktu: {time_spark:.4f} detik)")

✅ Spark session aktif (waktu: 2.2259 detik)


In [6]:
# Absolute path of the saved pipeline directory. The relative name is resolved
# against the kernel's current working directory, so the notebook must be run
# from the folder that contains "model-minilm-lr_human-gpt4".
MODEL_PATH = os.path.abspath("model-minilm-lr_human-gpt4")
# Print the resolved path so a wrong working directory is easy to spot.
print(f"[DEBUG] Path absolut model: {MODEL_PATH}")

[DEBUG] Path absolut model: /home/jovyan/work/model-minilm-lr_human-gpt4


In [7]:
# ------------------------------------------------------------
# 2️⃣ Load Pretrained Model dari Direktori Lokal
# ------------------------------------------------------------
# Load the fitted Spark ML pipeline from MODEL_PATH and report the load time.
print(f"[2] Memuat model dari lokal: {MODEL_PATH} ...")
model, time_model = measure_time(PipelineModel.load, MODEL_PATH)
print(f"✅ Model berhasil dimuat dalam {time_model:.4f} detik.")

[2] Memuat model dari lokal: /home/jovyan/work/model-minilm-lr_human-gpt4 ...
✅ Model berhasil dimuat dalam 4.1381 detik.


In [8]:
# ------------------------------------------------------------
# 3️⃣ Load Sentence Transformer (MiniLM)
# ------------------------------------------------------------
print("[3] Memuat tokenizer & model MiniLM untuk ekstraksi fitur...")

def load_miniLM(model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
    """Load the tokenizer and encoder for ``model_name`` and pick a device.

    Args:
        model_name: Model identifier passed to both
            ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
            Defaults to the MiniLM checkpoint this notebook was written for, so
            calling ``load_miniLM()`` with no arguments behaves as before.

    Returns:
        A ``(tokenizer, encoder, device)`` tuple. The encoder is switched to
        eval mode and moved to ``device``, which is CUDA when
        ``torch.cuda.is_available()`` and CPU otherwise.
    """
    # Use one name for both loads so tokenizer and encoder cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)
    encoder.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder.to(device)
    return tokenizer, encoder, device

# Load once at notebook level; get_embedding() reads these three globals.
(tokenizer, encoder, device), time_minilm = measure_time(load_miniLM)
print(f"✅ MiniLM siap dalam {time_minilm:.4f} detik (device: {device}).")

[3] Memuat tokenizer & model MiniLM untuk ekstraksi fitur...
✅ MiniLM siap dalam 1.2988 detik (device: cpu).


In [9]:
# ------------------------------------------------------------
# 4️⃣ Fungsi get_embedding()
# ------------------------------------------------------------
def get_embedding(text: str):
    """Return a sentence embedding for ``text`` as a plain list of floats.

    Missing input (``pd.isna``) or whitespace-only text short-circuits to a
    384-element zero vector without touching the encoder. Otherwise the text is
    tokenized (truncated to at most 512 tokens), run through the module-level
    ``encoder`` on ``device`` with gradients disabled, mean-pooled over tokens
    using the attention mask, and L2-normalized.

    Relies on the notebook globals ``tokenizer``, ``encoder`` and ``device``.
    The output length on the encoder path equals the encoder's hidden size
    (expected to be 384 for MiniLM, matching the zero-vector fallback).
    """
    if pd.isna(text) or text.strip() == "":
        return [0.0] * 384

    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state

    # Broadcast the attention mask over the hidden dimension so padded
    # positions contribute nothing to the pooled sum.
    mask = batch['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * mask, 1)
    # Clamp the token count to avoid division by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = F.normalize(summed / counts, p=2, dim=1)

    # Single input text, so take row 0 of the batch.
    return pooled.cpu().numpy()[0].tolist()
get_embedding

<function __main__.get_embedding(text: str)>

In [10]:
# ------------------------------------------------------------
# 5️⃣ Fungsi classify_text(text)
# ------------------------------------------------------------
def classify_text(text: str):
    """Classify a single text with the MiniLM embedding and the Spark pipeline.

    The text is embedded with ``get_embedding``, wrapped in a one-row Spark
    DataFrame with a ``features`` column, and passed through
    ``model.transform``. The predicted index is mapped back to a label using
    the ``labels`` of the pipeline's first stage.

    Args:
        text: Text to classify. ``None``/NaN is tolerated: ``get_embedding``
            turns it into a zero vector and the preview is reported as an
            empty string instead of raising ``TypeError``.

    Returns:
        A dict with:
          - "input_text": the first 100 characters of the text, followed by
            "..." when the text is longer than that;
          - "predicted_label": the label for the predicted index;
          - "probabilities": mapping of label -> probability;
          - "timing": seconds, rounded to 4 decimals, for "embedding",
            "dataframe", "prediction" and their sum "total" (label lookup
            is not included in the total).
    """
    # [a] Embedding
    emb, time_emb = measure_time(get_embedding, text)

    # [b] Convert to a one-row Spark DataFrame
    df, time_df = measure_time(
        lambda: spark.createDataFrame([Row(features=Vectors.dense(emb))])
    )

    # [c] Prediction
    pred_row, time_pred = measure_time(
        lambda: model.transform(df).select("prediction", "probability").collect()[0]
    )

    # [d] Interpret the result
    # NOTE(review): assumes stage 0 of the pipeline is the label StringIndexer
    # model - confirm against the training pipeline.
    label_indexer = model.stages[0]
    labels = label_indexer.labels    # e.g. ['gpt4', 'human']

    predicted_index = int(pred_row['prediction'])
    predicted_label = labels[predicted_index]
    prob_vector = pred_row['probability']
    prob_dict = {label: float(prob_vector[i]) for i, label in enumerate(labels)}

    time_total = time_emb + time_df + time_pred

    # get_embedding accepts None/NaN, so build the preview from a real string
    # rather than slicing `text` directly (which would raise on None/NaN).
    preview_source = text if isinstance(text, str) else ""
    preview = preview_source[:100] + ("..." if len(preview_source) > 100 else "")

    return {
        "input_text": preview,
        "predicted_label": predicted_label,
        "probabilities": prob_dict,
        "timing": {
            "embedding": round(time_emb, 4),
            "dataframe": round(time_df, 4),
            "prediction": round(time_pred, 4),
            "total": round(time_total, 4)
        }
    }

classify_text

<function __main__.classify_text(text: str)>

In [12]:
# ------------------------------------------------------------
# 6️⃣ Contoh Pengujian
# ------------------------------------------------------------
# Sample input used to smoke-test the full embedding + classification path.
sample_text = "In this paper, we present a semi-supervised learning algorithm for classification of text documents. A method of labeling unlabeled text documents is presented. The presented method is based on the principle of divide and conquer strategy. It uses recursive K-means algorithm for partitioning both labeled and unlabeled data collection. The K-means algorithm is applied recursively on each partition till a desired level partition is achieved such that each partition contains labeled documents of a single class. Once the desired clusters are obtained, the respective cluster centroids are considered as representatives of the clusters and the nearest neighbor rule is used for classifying an unknown text document. Series of experiments have been conducted to bring out the superiority of the proposed model over other recent state of the art models on 20Newsgroups dataset."

print("[6] Menguji teks...")
# The outer timing also covers label lookup and dict assembly, so it is
# slightly larger than the "total" reported inside the result.
result, result_time = measure_time(classify_text, sample_text)
print(f"✅ Selesai dalam {result_time:.4f} detik\n")

result

[6] Menguji teks...
✅ Selesai dalam 0.3049 detik



{'input_text': 'In this paper, we present a semi-supervised learning algorithm for classification of text documents....',
 'predicted_label': 'human',
 'probabilities': {'gpt4': 0.13300648190988193, 'human': 0.866993518090118},
 'timing': {'embedding': 0.0229,
  'dataframe': 0.0147,
  'prediction': 0.2652,
  'total': 0.3028}}