In [1]:
import time
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, when, count as spark_count, concat_ws, length, rand, row_number
from pyspark.sql.window import Window
import pandas as pd

def measure_time(func, *args, **kwargs):
    start = time.perf_counter()
    result = func(*args, **kwargs)
    end = time.perf_counter()
    return result, end - start

measure_time

<function __main__.measure_time(func, *args, **kwargs)>

In [2]:
spark = SparkSession.builder \
    .appName("RAID-TRAIN") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

spark

In [3]:
df = spark.read.json("hdfs://namenode:8020/user/raid/raw")
df

DataFrame[adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string]

# Total Baris

In [4]:
print("Menghitung total jumlah baris...")
total_rows, time_0 = measure_time(lambda: df.count())
print(f"✅ Total baris: {total_rows:,} | Waktu: {time_0:.4f} detik")

Menghitung total jumlah baris...
✅ Total baris: 5,615,820 | Waktu: 411.4781 detik


# Filter

In [4]:
print("Memfilter data: length(generation) > 50...")
df_filtered, filter_time = measure_time(
    lambda: df.filter(length(col("generation")) > 50)
)

print(f"✅ Filter selesai dalam {filter_time:.2f} detik")

Memfilter data: length(generation) > 50...
✅ Filter selesai dalam 0.03 detik


In [6]:
print("Menghitung total jumlah baris setelah filter...")
total_rows, time_1 = measure_time(lambda: df_filtered.count())
print(f"✅ Total baris: {total_rows:,} | Waktu: {time_1:.4f} detik")

Menghitung total jumlah baris setelah filter...
✅ Total baris: 5,610,609 | Waktu: 376.3021 detik


In [5]:
df_filtered = df_filtered.repartition(32)
df_filtered

DataFrame[adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string]

# Split

In [None]:
print(f"[1] Total jumlah baris setelah filter...")
print(f"✅ Total baris: {total_rows:,} | Waktu: {time_1:.4f} detik")

In [7]:
print("[2] Membuat kolom stratifikasi 'model_domain_attack' ...")
df_with_strata, strat_time = measure_time(
    lambda: df_filtered.withColumn(
        "model_domain_attack",
        concat_ws("_", col("model"), col("domain"), col("attack"))
    )
)
print(f"✅ Kolom stratifikasi dibuat dalam {strat_time:.4f} detik")

[2] Membuat kolom stratifikasi 'model_domain_attack' ...
✅ Kolom stratifikasi dibuat dalam 0.1341 detik


In [8]:
df_with_strata = df_with_strata.repartition(32)
df_with_strata

DataFrame[adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string, model_domain_attack: string]

In [9]:
print("[3] Menghitung jumlah baris per 'model_domain_attack' ...")
df_counts, count_time = measure_time(
    lambda: df_with_strata.groupBy("model_domain_attack").agg(spark_count("*").alias("total_per_group"))
)
print(f"✅ Selesai dalam {count_time:.4f} detik")

[3] Menghitung jumlah baris per 'model_domain_attack' ...
✅ Selesai dalam 0.0888 detik


In [10]:
print("[4] Gabungkan count ke setiap baris")
df_joined, join_time = measure_time(
    lambda: df_with_strata.join(df_counts, on="model_domain_attack", how="inner")
)
print(f"✅ Data berhasil digabung dengan count per grup dalam {join_time:.4f} detik")

[4] Gabungkan count ke setiap baris
✅ Data berhasil digabung dengan count per grup dalam 0.1788 detik


In [11]:
print("[5] Memberi nomor urut acak dalam setiap grup ...")
window_spec = Window.partitionBy("model_domain_attack").orderBy(rand())
df_numbered, number_time = measure_time(
    lambda: df_joined.withColumn("row_num", row_number().over(window_spec))
)
print(f"✅ Penomoran selesai dalam {number_time:.4f} detik")

[5] Memberi nomor urut acak dalam setiap grup ...
✅ Penomoran selesai dalam 0.0957 detik


In [12]:
# [6] Tentukan batas 70% → masuk train jika row_num <= 0.7 * total_per_group
print("[6] Menentukan split berdasarkan 70% per grup ...")
df_with_split, split_time = measure_time(
    lambda: df_numbered.withColumn(
        "is_train",
        col("row_num") <= (col("total_per_group") * 0.7)
    )
)
print(f"✅ Split logic selesai dalam {split_time:.4f} detik")

[6] Menentukan split berdasarkan 70% per grup ...
✅ Split logic selesai dalam 0.0655 detik


In [13]:
# [7] Pisahkan train dan test
print("[7] Memisahkan train dan test ...")
train_df = df_with_split.filter(col("is_train")).drop("row_num", "total_per_group", "is_train")
test_df = df_with_split.filter(~col("is_train")).drop("row_num", "total_per_group", "is_train")

train_count, train_time = measure_time(lambda: train_df.count())
test_count, test_time = measure_time(lambda: test_df.count())

print(f"✅ Train: {train_count:,} baris (dalam {train_time:.4f} detik)")
print(f"✅ Test:  {test_count:,} baris (dalam {test_time:.4f} detik)")

[7] Memisahkan train dan test ...
✅ Train: 3,926,925 baris (dalam 301.0543 detik)
✅ Test:  1,683,684 baris (dalam 308.2308 detik)


In [14]:
# [8] Hitung distribusi di train
print("[8] Menghitung distribusi 'model_domain_attack' di train...")
train_dist = train_df.groupBy("model_domain_attack").count().withColumnRenamed("count", "train_count")

# [9] Hitung distribusi di test
print("[9] Menghitung distribusi 'model_domain_attack' di test...")
test_dist = test_df.groupBy("model_domain_attack").count().withColumnRenamed("count", "test_count")

# [10] Gabungkan train dan test berdasarkan label
print("[10] Menggabungkan distribusi train dan test...")
from pyspark.sql.functions import coalesce, lit, round as spark_round

# Hitung total keseluruhan untuk persentase
total_train = train_count
total_test = test_count

# Join full outer agar semua label muncul (meski hanya di train atau test)
combined = train_dist.join(test_dist, on="model_domain_attack", how="full_outer") \
    .fillna(0, subset=["train_count", "test_count"])

# Urutkan berdasarkan total frekuensi (opsional)
combined_sorted = combined.orderBy(col("train_count").desc())

# Ambil sebagai Pandas DataFrame untuk tampilan
print("[11] Mengambil hasil untuk ditampilkan...")
result_pd, fetch_time = measure_time(
    lambda: combined_sorted.toPandas()
)
print(f"✅ Berhasil mengambil {len(result_pd):,} kelas dalam {fetch_time:.4f} detik")

[8] Menghitung distribusi 'model_domain_attack' di train...
[9] Menghitung distribusi 'model_domain_attack' di test...
[10] Menggabungkan distribusi train dan test...
[11] Mengambil hasil untuk ditampilkan...
✅ Berhasil mengambil 1,152 kelas dalam 306.8641 detik


In [15]:
result_pd

Unnamed: 0,model_domain_attack,train_count,test_count
0,llama-chat_books_none,4986,2138
1,llama-chat_books_number,4986,2138
2,llama-chat_books_upper_lower,4986,2138
3,mistral-chat_books_alternative_spelling,4986,2138
4,mistral-chat_books_none,4986,2138
...,...,...,...
1147,human_reviews_none,660,283
1148,human_reviews_perplexity_misspelling,660,283
1149,human_reviews_number,660,283
1150,human_reviews_upper_lower,660,283


# Train

In [16]:
from pyspark.ml.feature import StringIndexer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import FloatType

train_df, test_df

(DataFrame[model_domain_attack: string, adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string],
 DataFrame[model_domain_attack: string, adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string])

In [17]:
# --- 1. Siapkan hanya kolom teks (label akan diindex) ---
print("[12] Menyiapkan kolom 'text' dan 'label_string'...")
train_input = train_df.select(
    col("generation").alias("text"),
    col("model").alias("label_string")
)
test_input = test_df.select(
    col("generation").alias("text"),
    col("model").alias("label_string")
)

train_input = train_input.repartition(128)
test_input = test_input.repartition(128)

train_input, test_input

[12] Menyiapkan kolom 'text' dan 'label_string'...


(DataFrame[text: string, label_string: string],
 DataFrame[text: string, label_string: string])

In [18]:
# --- 2. Bangun pipeline dengan StringIndexer ---
print("[13] Membangun pipeline: StringIndexer → TF-IDF → LogisticRegression...")

# StringIndexer: string label → numeric (0, 1, 2, ...)
string_indexer = StringIndexer(
    inputCol="label_string",
    outputCol="label",
    handleInvalid="error"  # atau "skip" jika ada nilai tidak valid
)

# Tokenizer
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W"
)

# Stopwords remover
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered"
)

# TF-IDF
hashing_tf = HashingTF(
    inputCol="filtered",
    outputCol="raw_features",
    numFeatures=1000
)

idf = IDF(
    inputCol="raw_features",
    outputCol="features"
)

# Logistic Regression (multinomial)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    family="multinomial",
    regParam=0.1,
    maxIter=50
)

# Pipeline lengkap
pipeline = Pipeline(stages=[
    string_indexer,
    tokenizer,
    stopwords_remover,
    hashing_tf,
    idf,
    lr
])

pipeline

[13] Membangun pipeline: StringIndexer → TF-IDF → LogisticRegression...


Pipeline_0955261d5a10

In [19]:
# --- 3. Latih model ---
print("[14] Melatih model pada train_input...")
model, train_model_time = measure_time(
    lambda: pipeline.fit(train_input)
)
print(f"✅ Model berhasil dilatih dalam {train_model_time:.4f} detik")

[14] Melatih model pada train_input...
✅ Model berhasil dilatih dalam 7816.5793 detik


In [20]:
# --- 4. Simpan model ke HDFS ---
model_path = "hdfs://namenode:8020/user/raid/model-tfidf-lr"
print(f"[15] Menyimpan model ke HDFS: {model_path}...")
_, save_time = measure_time(
    lambda: model.write().overwrite().save(model_path)
)
print(f"✅ Model disimpan dalam {save_time:.4f} detik")

[15] Menyimpan model ke HDFS: hdfs://namenode:8020/user/raid/model-tfidf-lr...
✅ Model disimpan dalam 13.8640 detik


In [None]:
# --- 5. Prediksi pada test set ---
print("[16] Melakukan prediksi pada test_input...")
predictions, predict_time = measure_time(
    lambda: model.transform(test_input)
)
print(f"✅ Prediksi selesai dalam {predict_time:.2f} detik")

# Tampilkan contoh
predictions.select("text", "label_string", "label", "prediction").show(5, truncate=50)

[16] Melakukan prediksi pada test_input...
✅ Prediksi selesai dalam 0.18 detik


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- 6. Evaluasi berbagai metrik ---
print("[17] Evaluasi...")

# Accuracy
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)
accuracy, time_acc = measure_time(lambda: evaluator_acc.evaluate(predictions))

# Weighted Precision
evaluator_prec = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedPrecision"
)
precision, time_prec = measure_time(lambda: evaluator_prec.evaluate(predictions))

# Weighted Recall
evaluator_rec = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedRecall"
)
recall, time_rec = measure_time(lambda: evaluator_rec.evaluate(predictions))

# F1-score (weighted)
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1"
)
f1_score, time_f1 = measure_time(lambda: evaluator_f1.evaluate(predictions))

# Tampilkan hasil
print(f"✅ Accuracy    : {accuracy:.4f} | Waktu: {time_acc:.4f} detik")
print(f"✅ Precision   : {precision:.4f} | Waktu: {time_prec:.4f} detik")
print(f"✅ Recall      : {recall:.4f} | Waktu: {time_rec:.4f} detik")
print(f"✅ F1-score    : {f1_score:.4f} | Waktu: {time_f1:.4f} detik")

In [None]:
# --- 7. Confusion Matrix (opsional) ---
print("[18] Menyiapkan confusion matrix...")

# Pastikan label dan prediction bertipe numerik
preds_labels = predictions.select(
    col("prediction").cast(FloatType()),
    col("label").cast(FloatType())
).rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(preds_labels)
cm = metrics.confusionMatrix().toArray()
print(f"✅ Confusion matrix shape: {cm.shape}")

# Simpan untuk visualisasi
import numpy as np
np.save("/home/jovyan/work/cm_tfidf.npy", cm)
print("✅ Confusion matrix disimpan sebagai cm_tfidf.npy")