In [1]:
import time
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, when, count as spark_count, concat_ws, length, rand, row_number
from pyspark.sql.window import Window
import pandas as pd

from pyspark.ml.feature import StringIndexer, RegexTokenizer, StopWordsRemover, Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

def measure_time(func, *args, **kwargs):
    start = time.perf_counter()
    result = func(*args, **kwargs)
    end = time.perf_counter()
    return result, end - start

measure_time

<function __main__.measure_time(func, *args, **kwargs)>

In [2]:
spark = SparkSession.builder \
    .appName("RAID-TRAIN") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

spark

In [3]:
df = spark.read.json("hdfs://namenode:8020/user/raid/filtered_minilm")
df

DataFrame[attack: string, domain: string, features: array<double>, generation: string, id: string, model: string]

# Total Baris

In [4]:
print("Menghitung total jumlah baris...")
total_rows, time_0 = measure_time(lambda: df.count())
print(f"✅ Total baris: {total_rows:,} | Waktu: {time_0:.4f} detik")

Menghitung total jumlah baris...
✅ Total baris: 1,869,542 | Waktu: 287.7117 detik


In [4]:
from pyspark.sql.functions import col, length

# Filter: (1) panjang teks > 50, DAN (2) model hanya 'human' atau 'gpt4'
print("Memfilter data: model IN ('human', 'gpt4')...")
df_filtered, filter_time = measure_time(
    lambda: df.filter(
        (col("model").isin(["human", "gpt4"]))
    )
)
print(f"✅ Filter selesai dalam {filter_time:.4f} detik")

Memfilter data: model IN ('human', 'gpt4')...
✅ Filter selesai dalam 0.0368 detik


In [5]:
df_filtered = df_filtered.repartition(128)
df_filtered

DataFrame[attack: string, domain: string, features: array<double>, generation: string, id: string, model: string]

# Split

In [6]:
print("[1] Menghitung total jumlah baris...")
total_rows, time_0 = measure_time(lambda: df_filtered.count())
print(f"✅ Total baris: {total_rows:,} | Waktu: {time_0:.4f} detik")

[1] Menghitung total jumlah baris...
✅ Total baris: 160,390 | Waktu: 272.7278 detik


In [7]:
print("[2] Membuat kolom stratifikasi 'model_domain_attack' ...")
df_with_strata, strat_time = measure_time(
    lambda: df_filtered.withColumn(
        "model_domain_attack",
        concat_ws("_", col("model"), col("domain"), col("attack"))
    )
)
print(f"✅ Kolom stratifikasi dibuat dalam {strat_time:.4f} detik")

[2] Membuat kolom stratifikasi 'model_domain_attack' ...
✅ Kolom stratifikasi dibuat dalam 0.0233 detik


In [8]:
df_with_strata = df_with_strata.repartition(32)
df_with_strata

DataFrame[attack: string, domain: string, features: array<double>, generation: string, id: string, model: string, model_domain_attack: string]

In [9]:
print("[3] Menghitung jumlah baris per 'model_domain_attack' ...")
df_counts, count_time = measure_time(
    lambda: df_with_strata.groupBy("model_domain_attack").agg(spark_count("*").alias("total_per_group"))
)
print(f"✅ Selesai dalam {count_time:.4f} detik")

[3] Menghitung jumlah baris per 'model_domain_attack' ...
✅ Selesai dalam 0.0443 detik


In [10]:
print("[4] Gabungkan count ke setiap baris")
df_joined, join_time = measure_time(
    lambda: df_with_strata.join(df_counts, on="model_domain_attack", how="inner")
)
print(f"✅ Data berhasil digabung dengan count per grup dalam {join_time:.4f} detik")

[4] Gabungkan count ke setiap baris
✅ Data berhasil digabung dengan count per grup dalam 0.1141 detik


In [11]:
print("[5] Memberi nomor urut acak dalam setiap grup ...")
window_spec = Window.partitionBy("model_domain_attack").orderBy(rand())
df_numbered, number_time = measure_time(
    lambda: df_joined.withColumn("row_num", row_number().over(window_spec))
)
print(f"✅ Penomoran selesai dalam {number_time:.4f} detik")

[5] Memberi nomor urut acak dalam setiap grup ...
✅ Penomoran selesai dalam 0.0452 detik


In [12]:
# [6] Tentukan batas 70% → masuk train jika row_num <= 0.7 * total_per_group
print("[6] Menentukan split berdasarkan 70% per grup ...")
df_with_split, split_time = measure_time(
    lambda: df_numbered.withColumn(
        "is_train",
        col("row_num") <= (col("total_per_group") * 0.7)
    )
)
print(f"✅ Split logic selesai dalam {split_time:.4f} detik")

[6] Menentukan split berdasarkan 70% per grup ...
✅ Split logic selesai dalam 0.0845 detik


In [13]:
# [7] Pisahkan train dan test
# train_df = df_with_split.filter(col("is_train")).select("model", "features")
# test_df = df_with_split.filter(~col("is_train")).select("model", "features")

train_df = df_with_split.filter(col("is_train")).select(
    col("generation").alias("text"),
    col("model"),
    col("features")
)
test_df = df_with_split.filter(~col("is_train")).select(
    col("generation").alias("text"),
    col("model"),
    col("features")
)

train_count, train_count_time = measure_time(lambda: train_df.count())
test_count, test_count_time = measure_time(lambda: test_df.count())

print(f"✅ Train: {train_count:,} ({train_count_time:.4f} detik) | Test: {test_count:,} ({test_count_time:.4f} detik)")

✅ Train: 112,247 (295.9162 detik) | Test: 48,143 (276.8737 detik)


In [14]:
from pyspark.ml.functions import array_to_vector

train_df = train_df.withColumn("features", array_to_vector(col("features")))
test_df = test_df.withColumn("features", array_to_vector(col("features")))

# Opsional: cek tipe kolom
train_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- model: string (nullable = true)
 |-- features: vector (nullable = true)



# Train

In [15]:
from pyspark.ml.feature import StringIndexer, RegexTokenizer, StopWordsRemover, Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

train_df, test_df

(DataFrame[text: string, model: string, features: vector],
 DataFrame[text: string, model: string, features: vector])

In [16]:
# === Pipeline: StringIndexer + LogisticRegression ===
print("Membangun pipeline...")

# StringIndexer untuk label 'model'
string_indexer = StringIndexer(
    inputCol="model",
    outputCol="label",
    handleInvalid="error"
)

# Logistic Regression langsung pada fitur numerik (384-d)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    family="multinomial",
    regParam=0.01,      # regularisasi ringan
    maxIter=100,
    tol=1e-6
)

pipeline = Pipeline(stages=[string_indexer, lr])
pipeline

Membangun pipeline...


Pipeline_8b24ecad31a5

In [17]:
# --- 3. Latih model ---
print("[14] Melatih model pada train_df...")
model, train_model_time = measure_time(
    lambda: pipeline.fit(train_df)
)
print(f"✅ Model berhasil dilatih dalam {train_model_time:.4f} detik")

[14] Melatih model pada train_df...
✅ Model berhasil dilatih dalam 2929.5631 detik


In [18]:
# --- 4. Simpan model ke HDFS ---
model_path = "hdfs://namenode:8020/user/raid/model-sbert-lr_human-gpt4"
print(f"[15] Menyimpan model ke HDFS: {model_path}...")
_, save_time = measure_time(
    lambda: model.write().overwrite().save(model_path)
)
print(f"✅ Model disimpan dalam {save_time:.4f} detik")

[15] Menyimpan model ke HDFS: hdfs://namenode:8020/user/raid/model-sbert-lr_human-gpt4...
✅ Model disimpan dalam 9.8552 detik


In [19]:
# --- 5. Prediksi pada test set ---
print("[16] Melakukan prediksi pada test_df...")
predictions, predict_time = measure_time(
    lambda: model.transform(test_df)
)
print(f"✅ Prediksi selesai dalam {predict_time:.4f} detik")

# Tampilkan contoh
predictions.select("text", "model", "label", "prediction", "probability").show(5, truncate=50)

[16] Melakukan prediksi pada test_df...
✅ Prediksi selesai dalam 0.0449 detik
+--------------------------------------------------+-----+-----+----------+----------------------------------------+
|                                              text|model|label|prediction|                             probability|
+--------------------------------------------------+-----+-----+----------+----------------------------------------+
|Ingredients:\n\nFor the Crust:\n- 1 1/2 cups gr...| gpt4|  0.0|       0.0|[0.8912502556999004,0.10874974430009975]|
|Ingredients:\n\nFor Bread:\n\n- 1 loaf of crust...| gpt4|  0.0|       0.0|[0.9163321063701791,0.08366789362982094]|
|Ingredients:\n\n- 1 whole turkey (about 3-4 kil...| gpt4|  0.0|       0.0|  [0.858609713005238,0.1413902869947619]|
|ingredients:\n\n- 2 everything bagels\n- 4 slic...| gpt4|  0.0|       0.0|[0.9353020594723525,0.06469794052764764]|
|ingredients:\n\n- 2 pork tenderloins (about 1 o...| gpt4|  0.0|       0.0|[0.8457616500697258,0.154238

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- 6. Evaluasi berbagai metrik ---
print("[17] Evaluasi...")

# Accuracy
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)
accuracy, time_acc = measure_time(lambda: evaluator_acc.evaluate(predictions))

# Weighted Precision
evaluator_prec = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedPrecision"
)
precision, time_prec = measure_time(lambda: evaluator_prec.evaluate(predictions))

# Weighted Recall
evaluator_rec = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedRecall"
)
recall, time_rec = measure_time(lambda: evaluator_rec.evaluate(predictions))

# F1-score (weighted)
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1"
)
f1_score, time_f1 = measure_time(lambda: evaluator_f1.evaluate(predictions))

# Tampilkan hasil
print(f"✅ Accuracy    : {accuracy:.4f} | Waktu: {time_acc:.4f} detik")
print(f"✅ Precision   : {precision:.4f} | Waktu: {time_prec:.4f} detik")
print(f"✅ Recall      : {recall:.4f} | Waktu: {time_rec:.4f} detik")
print(f"✅ F1-score    : {f1_score:.4f} | Waktu: {time_f1:.4f} detik")

[17] Evaluasi...
✅ Accuracy    : 0.8791 | Waktu: 636.8524 detik
✅ Precision   : 0.8766 | Waktu: 645.7478 detik
✅ Recall      : 0.8789 | Waktu: 675.2073 detik
✅ F1-score    : 0.8765 | Waktu: 640.6882 detik
