In [1]:
import subprocess
import time
import os
from pathlib import Path

def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and report how long the call took.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, seconds)`` tuple: the value returned by ``func`` and the
        elapsed time as measured with ``time.perf_counter``.

    Any exception raised by ``func`` propagates unchanged; no timing is
    returned in that case.
    """
    t0 = time.perf_counter()
    value = func(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    return value, elapsed

def run_command(cmd: str, capture_output: bool = True, timeout: int = None):
    """Execute ``cmd`` through the system shell and return its result.

    Args:
        cmd: Command line, interpreted by the shell (``shell=True``).
        capture_output: When true, stdout and stderr are captured on the
            returned object; otherwise they are inherited from the parent.
        timeout: Seconds to wait before aborting, or ``None`` for no limit.

    Returns:
        The ``subprocess.CompletedProcess`` for the command, with text-mode
        ``stdout``/``stderr``.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero
            (``check=True``).
        subprocess.TimeoutExpired: if ``timeout`` elapses first.
    """
    run_options = {
        "shell": True,
        "capture_output": capture_output,
        "text": True,
        "timeout": timeout,
        "check": True,
    }
    return subprocess.run(cmd, **run_options)

# Trailing expression: shows both helper functions as the cell's output.
measure_time, run_command

(<function __main__.measure_time(func, *args, **kwargs)>,
 <function __main__.run_command(cmd: str, capture_output: bool = True, timeout: int = None)>)

In [3]:
# 1. Create the target HDFS directory
print("Membuat direktori HDFS /user/raid/filtered/ ...")
try:
    _, t = measure_time(
        run_command,
        "docker exec namenode hdfs dfs -mkdir -p /user/raid/filtered"
    )
    print(f"âœ… Direktori dibuat dalam {t:.4f} detik")
except subprocess.CalledProcessError as e:
    # stderr is None when the command's output was not captured; treat that
    # as an empty message instead of failing with a TypeError on `in`.
    if "File exists" in (e.stderr or ""):
        print("âœ… Direktori sudah ada, lanjut...")
    else:
        # Bare raise re-raises the active exception as-is.
        raise

Membuat direktori HDFS /user/raid/filtered/ ...
âœ… Direktori dibuat dalam 1.2770 detik


In [4]:
# 2. Set permissions (optional, but makes sure the directory is writable)
# NOTE(review): mode 777 makes the tree world-writable — confirm this is
# acceptable outside a local/dev cluster.
print("Mengatur permission HDFS ke 777 ...")
_, t = measure_time(
    run_command,
    cmd="docker exec namenode hdfs dfs -chmod -R 777 /user/raid/filtered",
)
print(f"âœ… Permission diatur dalam {t:.4f} detik")

Mengatur permission HDFS ke 777 ...
âœ… Permission diatur dalam 1.4256 detik


In [None]:
# 3. Upload every .jsonl file from splits-filtered/ into HDFS
local_filtered_dir = "dataset/splits-filtered"
hdfs_target = "/user/raid/filtered"


def _upload_to_hdfs(local_path, filename, hdfs_dir):
    """Stage one local file in the namenode container and put it into HDFS.

    Every step runs as an argument list (no shell), so file names containing
    spaces or shell metacharacters are passed through verbatim instead of
    being re-parsed. The staged copy under /tmp is removed on success and,
    best-effort, when the HDFS put fails.

    Args:
        local_path: Path of the file on the host.
        filename: Base name used for the staged copy inside the container.
        hdfs_dir: Destination directory in HDFS.

    Raises:
        subprocess.CalledProcessError: if ``docker cp``, ``hdfs dfs -put`` or
            the final cleanup exits non-zero.
    """
    staged_path = f"/tmp/{filename}"
    run_opts = {"capture_output": True, "text": True}
    cleanup_argv = ["docker", "exec", "namenode", "rm", "-f", staged_path]

    subprocess.run(
        ["docker", "cp", local_path, f"namenode:{staged_path}"],
        check=True, **run_opts
    )
    try:
        # -f overwrites an existing HDFS file so re-running this cell does
        # not abort on files uploaded by a previous run.
        subprocess.run(
            ["docker", "exec", "namenode", "hdfs", "dfs", "-put", "-f",
             staged_path, f"{hdfs_dir}/"],
            check=True, **run_opts
        )
    except BaseException:
        # Do not leave the staged file behind; check=False so a cleanup
        # failure never masks the original upload error.
        subprocess.run(cleanup_argv, check=False, **run_opts)
        raise
    subprocess.run(cleanup_argv, check=True, **run_opts)


print(f"Mengunggah file dari {local_filtered_dir} ke HDFS {hdfs_target} ...")

# Sorted so the upload order (and the progress log) is deterministic.
jsonl_files = sorted(f for f in os.listdir(local_filtered_dir) if f.endswith(".jsonl"))
print(f"âœ… Ditemukan {len(jsonl_files)} file .jsonl")

total_uploaded = 0
for filename in jsonl_files:
    local_path = os.path.join(local_filtered_dir, filename)

    # The timing covers the whole copy -> put -> cleanup sequence for one file.
    _, t = measure_time(_upload_to_hdfs, local_path, filename, hdfs_target)
    total_uploaded += 1
    print(f"  âœ… [{total_uploaded}/{len(jsonl_files)}] {filename} â†’ {t:.4f} detik")

print(f"\nðŸŽ‰ Selesai! Total {total_uploaded} file berhasil diunggah ke HDFS.")

Mengunggah file dari dataset/splits-filtered ke HDFS /user/raid/filtered ...
âœ… Ditemukan 75 file .jsonl
  âœ… [1/75] raid-filtered_001.jsonl â†’ 12.4227 detik
  âœ… [2/75] raid-filtered_002.jsonl â†’ 11.4627 detik
  âœ… [3/75] raid-filtered_003.jsonl â†’ 10.5477 detik
  âœ… [4/75] raid-filtered_004.jsonl â†’ 10.8762 detik
  âœ… [5/75] raid-filtered_005.jsonl â†’ 9.9841 detik
  âœ… [6/75] raid-filtered_006.jsonl â†’ 10.2216 detik
  âœ… [7/75] raid-filtered_007.jsonl â†’ 16.4124 detik
  âœ… [8/75] raid-filtered_008.jsonl â†’ 10.4239 detik
  âœ… [9/75] raid-filtered_009.jsonl â†’ 10.2373 detik
  âœ… [10/75] raid-filtered_010.jsonl â†’ 9.9789 detik
  âœ… [11/75] raid-filtered_011.jsonl â†’ 13.0619 detik
  âœ… [12/75] raid-filtered_012.jsonl â†’ 12.4811 detik
  âœ… [13/75] raid-filtered_013.jsonl â†’ 10.8879 detik
  âœ… [14/75] raid-filtered_014.jsonl â†’ 11.1933 detik
  âœ… [15/75] raid-filtered_015.jsonl â†’ 11.5199 detik
  âœ… [16/75] raid-filtered_016.jsonl â†’ 12.3110 detik
  âœ… [17