In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, rand, row_number, count as spark_count
from pyspark.sql.window import Window

def measure_time(func, *args, **kwargs):
    """Invoke ``func(*args, **kwargs)`` and report how long the call took.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, seconds)`` tuple: the value returned by ``func`` and the
        elapsed time of the call as measured by ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    outcome = func(*args, **kwargs)
    elapsed = time.perf_counter() - started_at
    return outcome, elapsed

measure_time

<function __main__.measure_time(func, *args, **kwargs)>

In [2]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# pointing it at the master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("RAID-SPLIT")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

spark

In [3]:
# Step 1: load the filtered dataset (JSON) from HDFS.
input_path = "hdfs://namenode:8020/user/raid/filtered"
print(f"[1] Membaca data filtered dari: {input_path} ...")
# measure_time forwards extra positional arguments to the callable, so the
# reader method and its path can be passed directly.
df_filtered, read_time = measure_time(spark.read.json, input_path)
# NOTE: the count() embedded in the message below runs after the timer has
# stopped, so read_time does not include it.
print(f"✅ Berhasil membaca dalam {read_time:.4f} detik | Total baris: {df_filtered.count():,}")

[1] Membaca data filtered dari: hdfs://namenode:8020/user/raid/filtered ...
✅ Berhasil membaca dalam 10.6381 detik | Total baris: 5,610,609


In [4]:
# Step 2: add the stratification key, built by joining the model, domain and
# attack columns with "_".
print("[2] Membuat kolom stratifikasi 'model_domain_attack' ...")
strata_key = concat_ws("_", col("model"), col("domain"), col("attack"))
# withColumn only extends the query plan, so the time reported here does not
# include processing any rows.
df_with_strata, strat_time = measure_time(
    df_filtered.withColumn, "model_domain_attack", strata_key
)
print(f"✅ Kolom stratifikasi dibuat dalam {strat_time:.4f} detik")

[2] Membuat kolom stratifikasi 'model_domain_attack' ...
✅ Kolom stratifikasi dibuat dalam 0.0372 detik


In [5]:
# Redistribute the rows across 32 partitions before the per-group aggregation
# and join in the following cells.
df_with_strata = df_with_strata.repartition(numPartitions=32)
df_with_strata

DataFrame[adv_source_id: string, attack: string, decoding: string, domain: string, generation: string, id: string, model: string, prompt: string, repetition_penalty: string, source_id: string, title: string, model_domain_attack: string]

In [6]:
# Step 3: number of rows in every 'model_domain_attack' group.
print("[3] Menghitung jumlah baris per 'model_domain_attack' ...")
group_size = spark_count("*").alias("total_per_group")
df_counts, count_time = measure_time(
    df_with_strata.groupBy("model_domain_attack").agg, group_size
)
print(f"✅ Selesai dalam {count_time:.4f} detik")

[3] Menghitung jumlah baris per 'model_domain_attack' ...
✅ Selesai dalam 0.0228 detik


In [7]:
# Step 4: attach the group size (total_per_group) to every row of its group.
print("[4] Gabungkan count ke setiap baris")
df_joined, join_time = measure_time(
    df_with_strata.join, df_counts, on="model_domain_attack", how="inner"
)
print(f"✅ Data berhasil digabung dengan count per grup dalam {join_time:.4f} detik")

[4] Gabungkan count ke setiap baris
✅ Data berhasil digabung dengan count per grup dalam 0.0384 detik


In [8]:
# Step 5: put the rows of every group in random order and number them 1..n.
print("[5] Memberi nomor urut acak dalam setiap grup ...")
# Fixed seed so that re-running the notebook is intended to draw the same
# order. Spark documents rand() as non-deterministic in the general case, so
# the seed alone is not relied upon -- see the cache() note below.
SPLIT_SEED = 42
window_spec = Window.partitionBy("model_domain_attack").orderBy(rand(SPLIT_SEED))
# cache() matters for correctness, not just speed: the train and test frames
# are both derived from this DataFrame, and every action on them (the two
# counts, the two writes) would otherwise re-run the window and re-draw the
# random order. A row could then be written to both train and test, or to
# neither, while the per-group counts still look correct.
# cache() is lazy; the first downstream action materialises the numbering and
# later actions reuse it.
df_numbered, number_time = measure_time(
    lambda: df_joined.withColumn("row_num", row_number().over(window_spec)).cache()
)
print(f"✅ Penomoran selesai dalam {number_time:.4f} detik")

[5] Memberi nomor urut acak dalam setiap grup ...
✅ Penomoran selesai dalam 0.0200 detik


In [9]:
# [6] Train boundary: a row goes to train when row_num <= 70% of its group size.
print("[6] Menentukan split berdasarkan 70% per grup ...")
# The 70% share is expressed as the exact ratio 7/10 and compared in integer
# arithmetic (row_num * 10 <= total_per_group * 7). The floating-point form
# total_per_group * 0.7 can land just below the exact value (for example
# 90 * 0.7 evaluates to 62.99999999999999), which would push the boundary row
# of such a group into test instead of train.
TRAIN_PARTS, TOTAL_PARTS = 7, 10
df_with_split, split_time = measure_time(
    lambda: df_numbered.withColumn(
        "is_train",
        # Cast to long so the multiplication cannot overflow a 32-bit int.
        col("row_num").cast("long") * TOTAL_PARTS
        <= col("total_per_group") * TRAIN_PARTS
    )
)
print(f"✅ Split logic selesai dalam {split_time:.4f} detik")

[6] Menentukan split berdasarkan 70% per grup ...
✅ Split logic selesai dalam 0.0223 detik


In [10]:
# [7] Separate train from test and drop the helper columns that were only
# needed to compute the split.
print("[7] Memisahkan train dan test ...")
helper_cols = ("row_num", "total_per_group", "is_train")
train_mask = col("is_train")
train_df = df_with_split.where(train_mask).drop(*helper_cols)
test_df = df_with_split.where(~train_mask).drop(*helper_cols)

# count() is an action, so these timings include executing the query plan.
train_count, train_time = measure_time(train_df.count)
test_count, test_time = measure_time(test_df.count)

print(f"✅ Train: {train_count:,} baris (dalam {train_time:.4f} detik)")
print(f"✅ Test:  {test_count:,} baris (dalam {test_time:.4f} detik)")

[7] Memisahkan train dan test ...
✅ Train: 3,926,925 baris (dalam 13.8295 detik)
✅ Test:  1,683,684 baris (dalam 9.4110 detik)


In [11]:
# [8] Row count per 'model_domain_attack' in train
print("[8] Menghitung distribusi 'model_domain_attack' di train...")
train_dist = (
    train_df
    .groupBy("model_domain_attack")
    .count()
    .withColumnRenamed("count", "train_count")
)

# [9] Row count per 'model_domain_attack' in test
print("[9] Menghitung distribusi 'model_domain_attack' di test...")
test_dist = (
    test_df
    .groupBy("model_domain_attack")
    .count()
    .withColumnRenamed("count", "test_count")
)

# [10] Combine the train and test distributions by label
print("[10] Menggabungkan distribusi train dan test...")
from pyspark.sql.functions import coalesce, lit, round as spark_round

# Overall totals, kept for percentage calculations (not used in this cell)
total_train = train_count
total_test = test_count

# Full outer join so that every label appears, even one present in only one
# of the two splits; the missing side's count becomes 0.
count_columns = ["train_count", "test_count"]
combined = (
    train_dist
    .join(test_dist, on="model_domain_attack", how="full_outer")
    .na.fill(0, subset=count_columns)
)

# Sort by train_count, largest first
combined_sorted = combined.sort(col("train_count").desc())

# Collect the result as a pandas DataFrame for display
print("[11] Mengambil hasil untuk ditampilkan...")
result_pd, fetch_time = measure_time(combined_sorted.toPandas)
print(f"✅ Berhasil mengambil {len(result_pd):,} kelas dalam {fetch_time:.4f} detik")

[8] Menghitung distribusi 'model_domain_attack' di train...
[9] Menghitung distribusi 'model_domain_attack' di test...
[10] Menggabungkan distribusi train dan test...
[11] Mengambil hasil untuk ditampilkan...
✅ Berhasil mengambil 1,152 kelas dalam 14.8992 detik


In [12]:
# Display the per-label train/test counts collected in the previous cell.
result_pd

Unnamed: 0,model_domain_attack,train_count,test_count
0,llama-chat_books_alternative_spelling,4986,2138
1,llama-chat_books_synonym,4986,2138
2,mpt-chat_books_whitespace,4986,2138
3,mistral-chat_books_insert_paragraphs,4986,2138
4,mistral-chat_books_synonym,4986,2138
...,...,...,...
1147,human_reviews_perplexity_misspelling,660,283
1148,human_reviews_number,660,283
1149,human_reviews_upper_lower,660,283
1150,human_reviews_alternative_spelling,660,283


In [None]:
# Number of partitions each split is repartitioned into before it is written
NUM_PARTITIONS_TRAIN = 22
NUM_PARTITIONS_TEST = 10

output_train = "hdfs://namenode:8020/user/raid/train"
output_test = "hdfs://namenode:8020/user/raid/test"

# Write the train split as JSON, replacing any existing output at the path.
print(f"[12] Menyimpan train ke: {output_train} (dalam {NUM_PARTITIONS_TRAIN} partisi) ...")
train_writer = train_df.repartition(NUM_PARTITIONS_TRAIN).write.mode("overwrite")
_, save_train_time = measure_time(train_writer.json, output_train)
print(f"✅ Train disimpan dalam {save_train_time:.4f} detik")

# Write the test split the same way.
print(f"[13] Menyimpan test ke: {output_test} (dalam {NUM_PARTITIONS_TEST} partisi) ...")
test_writer = test_df.repartition(NUM_PARTITIONS_TEST).write.mode("overwrite")
_, save_test_time = measure_time(test_writer.json, output_test)
print(f"✅ Test disimpan dalam {save_test_time:.4f} detik")

[12] Menyimpan train ke: hdfs://namenode:8020/user/raid/train (dalam 22 partisi) ...
