In [1]:
import numpy as np
import os, re, ast, datetime, json
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# =========================
# CONFIGURATION
# =========================
base_path = "data/"
inputs_txt = "week10/inputs.txt"       # file hasil minggu sebelumnya
outputs_txt = "week10/outputs.txt"
n_candidates = 4000
log_folder = "week11_logs"
os.makedirs(log_folder, exist_ok=True)

# =========================
# LOAD INPUTS (ambil batch terakhir)
# =========================
def load_inputs(path):
    with open(path, "r") as f:
        text = f.read().strip()
    batches = re.split(r"\]\s*\n\s*\[", text)
    last_batch = "[" + batches[-1].strip().lstrip("[").rstrip("]") + "]"
    last_batch = re.sub(r'array\(', '', last_batch).replace(')', '')
    chunks = re.findall(r'\[([^\[\]]+)\]', last_batch)
    return [np.array([float(x) for x in ch.split(",") if x.strip()]) for ch in chunks]

def load_outputs(path):
    with open(path, "r") as f:
        text = f.read().strip()
    batches = re.split(r"\]\s*\n\s*\[", text)
    last_batch = batches[-1].strip()
    last_batch = last_batch.replace("np.float64(", "").replace(")", "")
    last_batch = last_batch.replace("[", "").replace("]", "")
    last_batch = re.sub(r"[^\deE\-\.\,\s]", "", last_batch)
    numbers = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", last_batch)
    return np.array([float(x) for x in numbers], dtype=float)

# =========================
# LOAD NEW DATA
# =========================
new_inputs = load_inputs(inputs_txt)
new_outputs = load_outputs(outputs_txt)
print(f"âœ… Parsed {len(new_inputs)} input vectors, {len(new_outputs)} outputs.")

# =========================
# TRAIN SURROGATE & CLUSTER-AWARE QUERYING
# =========================
queries_out = []
cluster_info = {}

for i in range(1, 9):
    folder = os.path.join(base_path, f"function_{i}")
    X_prev = np.load(os.path.join(folder, "week10_inputs.npy"))
    y_prev = np.load(os.path.join(folder, "week10_outputs.npy"))

    # Gabungkan data lama + batch baru
    X_combined = np.vstack([X_prev, new_inputs[i-1].reshape(1, -1)])
    y_combined = np.append(y_prev, new_outputs[i-1])

    np.save(os.path.join(folder, "week11_inputs.npy"), X_combined)
    np.save(os.path.join(folder, "week11_outputs.npy"), y_combined)

    dim = X_combined.shape[1]
    print(f"\n=== Function {i} ({dim}D) ===")
    print(f"  Data size: {len(X_combined)}, Output range: [{y_combined.min():.4f}, {y_combined.max():.4f}]")

    # Build surrogate
    model = make_pipeline(
        StandardScaler(),
        MLPRegressor(hidden_layer_sizes=(512, 256, 128),
                     activation='relu',
                     solver='adam',
                     alpha=3e-4,
                     learning_rate_init=0.001,
                     max_iter=3000,
                     random_state=42)
    )
    model.fit(X_combined, y_combined)

    # Predict on candidate grid
    candidates = np.random.uniform(0, 1, (n_candidates, dim))
    preds = model.predict(candidates)

    # --- CLUSTER ANALYSIS ---
    k_opt = min(4, len(X_combined)//2)  # avoid too many clusters
    kmeans = KMeans(n_clusters=k_opt, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_combined)
    sil_score = silhouette_score(X_combined, labels) if len(np.unique(labels)) > 1 else 0
    cluster_means = [np.mean(y_combined[labels == k]) for k in range(k_opt)]
    best_cluster = np.argmax(cluster_means)
    centroid = kmeans.cluster_centers_[best_cluster]
    cluster_pred = model.predict(centroid.reshape(1, -1))[0]

    # Combine local exploitation (cluster) + global exploration
    best_idx = np.argmax(preds)
    best_query = np.clip(
        0.7 * centroid + 0.3 * candidates[best_idx], 0.0, 1.0
    )
    query_str = "-".join([f"{x:.6f}" for x in best_query])

    print(f"  Best predicted output: {preds[best_idx]:.4f}")
    print(f"  Cluster-based centroid output: {cluster_pred:.4f} (silhouette={sil_score:.3f})")
    print(f"  Query to submit: {query_str}")

    # --- PLOT & LOG ---
    plt.figure(figsize=(5, 5))
    plt.scatter(y_combined, model.predict(X_combined), c=labels, cmap="tab10", edgecolor='k')
    plt.plot([y_combined.min(), y_combined.max()],
             [y_combined.min(), y_combined.max()], 'r--', lw=2)
    plt.title(f"Function {i} - Actual vs Predicted (Week 11)")
    plt.xlabel("Actual y"); plt.ylabel("Predicted y")
    plt.savefig(os.path.join(log_folder, f"function_{i}_fit.png"))
    plt.close()

    cluster_info[f"Function_{i}"] = {
        "data_points": len(X_combined),
        "clusters": k_opt,
        "silhouette": sil_score,
        "cluster_means": cluster_means,
        "best_cluster": int(best_cluster),
        "centroid": centroid.tolist(),
        "centroid_predicted_output": float(cluster_pred),
        "selected_query": best_query.tolist()
    }

    queries_out.append(f"Function {i}: {query_str}")

# =========================
# SAVE RESULTS
# =========================
with open("week11_queries.txt", "w") as f:
    f.write("\n".join(queries_out))

with open(os.path.join(log_folder, "week11_cluster_info.json"), "w") as f:
    json.dump(cluster_info, f, indent=4)

print("\nðŸ’¾ Saved all week11 queries to week11_queries.txt")
print("ðŸ“Š Cluster analysis logs saved under week11_logs/")

âœ… Parsed 8 input vectors, 8 outputs.

=== Function 1 (2D) ===
  Data size: 20, Output range: [-0.0036, 64.0000]
  Best predicted output: 15.7323
  Cluster-based centroid output: 10.9567 (silhouette=0.620)
  Query to submit: 0.140392-0.564707

=== Function 2 (2D) ===
  Data size: 20, Output range: [-0.0656, 3.1124]
  Best predicted output: 0.7477
  Cluster-based centroid output: 0.7155 (silhouette=0.582)
  Query to submit: 0.748738-0.731142

=== Function 3 (3D) ===
  Data size: 25, Output range: [-0.3989, 71.0000]
  Best predicted output: 24.0944
  Cluster-based centroid output: 14.8215 (silhouette=0.401)
  Query to submit: 0.255475-0.643362-0.830987

=== Function 4 (4D) ===
  Data size: 40, Output range: [-32.6257, 64.0000]
  Best predicted output: 3.3990
  Cluster-based centroid output: -3.1873 (silhouette=0.304)
  Query to submit: 0.217797-0.347217-0.281739-0.367782

=== Function 5 (4D) ===
  Data size: 30, Output range: [0.1129, 4440.5227]
  Best predicted output: 3215.9968
  Clus