In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    pairwise_distances_argmin_min
)

from tqdm.notebook import tqdm
from scipy.ndimage import gaussian_filter1d


In [2]:
# ==== 0) Parâmetros ====
ARQ_IN    = "Matrix.csv"        # saída do passo anterior (pivot pronto)
OUT_W     = "W_frames.csv"      # pesos por frame (frames × K)
OUT_H     = "H_padroes.csv"     # padrões (K × células)
OUT_TOP   = "frames_tipicos.csv"
SEED      = 42
MAX_ITER  = 2000

# ==== 1) Ler matriz ====
X = pd.read_csv(ARQ_IN, index_col=0)
X = X.clip(lower=0).astype(float)


In [3]:
print(f"Dimensões da matriz X: {X.shape}")  # (n_frames, n_cells)


Dimensões da matriz X: (1016, 1075)


In [4]:
# ==== 4) Optimização com Optuna (multi-objetivo: KL + Calinski) ====
import optuna
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import calinski_harabasz_score

def objective(trial):
    # pesquisar nº de componentes (fatores latentes)
    n_components = trial.suggest_int("n_components", 2, min(40, X.shape[1]))

    alpha_H = trial.suggest_float("alpha_H", 1e-5, 1e-2, log=True)
    alpha_W = trial.suggest_float("alpha_W", 1e-5, 1e-2, log=True)
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    nmf = NMF(
        n_components=n_components,
        init="nndsvda",
        solver="mu",
        beta_loss="kullback-leibler",
        max_iter=MAX_ITER,
        random_state=SEED,
        alpha_H=alpha_H,
        alpha_W=alpha_W,
        l1_ratio=l1_ratio
    )

    W = nmf.fit_transform(X.values)
    H = nmf.components_
    X_hat = W @ H

    # ----------------
    # 1. Divergência KL ↓
    # ----------------
    mask = X.values > 0
    kl = np.sum(
        X.values[mask] * np.log(X.values[mask] / np.maximum(X_hat[mask], 1e-10))
        - X.values[mask] + X_hat[mask]
    )

    # ----------------
    # 2. Calinski-Harabasz ↑
    # ----------------
    scaler = StandardScaler()
    W_scaled = scaler.fit_transform(W)

    kmeans = KMeans(n_clusters=n_components, random_state=SEED)
    labels = kmeans.fit_predict(W_scaled)

    try:
        calinski = calinski_harabasz_score(W_scaled, labels)
    except Exception:
        calinski = 0

    return kl, calinski


# ==== Multi-Objective Study ====
study = optuna.create_study(
    directions=["minimize", "maximize"],  # KL ↓, Calinski ↑
    sampler=optuna.samplers.NSGAIISampler(seed=SEED)
)

study.optimize(objective, n_trials=100)

# ==== Resultados ====
print("Número de soluções no Pareto front:", len(study.best_trials))
print("\n➡️ Soluções ótimas (compromissos KL vs Calinski):")
for t in study.best_trials:
    vals = t.values
    print(f"KL={vals[0]:.2f}, Calinski={vals[1]:.2f}, Params={t.params}")

# ---- Selecionar 1 solução do Pareto front ----
chosen_trial = study.best_trials[0]
best_params = chosen_trial.params
print("\n➡️ Solução mais equilibrada escolhida automaticamente:")
print(f"KL={chosen_trial.values[0]:.2f}, Calinski={chosen_trial.values[1]:.2f}")
print(f"Parâmetros: {best_params}")


[I 2025-10-10 17:25:22,710] A new study created in memory with name: no-name-3abcb4cc-c7a3-4de3-b091-f567ed9b79e6
[I 2025-10-10 17:25:23,995] Trial 0 finished with values: [157641.71067088004, 136.697319114797] and parameters: {'n_components': 16, 'alpha_H': 0.0071144760093434225, 'alpha_W': 0.001570297088405539, 'l1_ratio': 0.5986584841970366}.
[I 2025-10-10 17:25:24,868] Trial 1 finished with values: [178964.86607051388, 357.3095678187624] and parameters: {'n_components': 8, 'alpha_H': 2.9375384576328295e-05, 'alpha_W': 1.493656855461762e-05, 'l1_ratio': 0.8661761457749352}.
[I 2025-10-10 17:25:26,181] Trial 2 finished with values: [137581.08300856443, 69.3927894927399] and parameters: {'n_components': 25, 'alpha_H': 0.001331121608073689, 'alpha_W': 1.1527987128232396e-05, 'l1_ratio': 0.9699098521619943}.
[I 2025-10-10 17:25:27,417] Trial 3 finished with values: [124275.44069970748, 54.42699829067487] and parameters: {'n_components': 34, 'alpha_H': 4.335281794951564e-05, 'alpha_W': 3

Número de soluções no Pareto front: 43

➡️ Soluções ótimas (compromissos KL vs Calinski):
KL=157641.71, Calinski=136.70, Params={'n_components': 16, 'alpha_H': 0.0071144760093434225, 'alpha_W': 0.001570297088405539, 'l1_ratio': 0.5986584841970366}
KL=178964.87, Calinski=357.31, Params={'n_components': 8, 'alpha_H': 2.9375384576328295e-05, 'alpha_W': 1.493656855461762e-05, 'l1_ratio': 0.8661761457749352}
KL=162861.22, Calinski=173.21, Params={'n_components': 13, 'alpha_H': 0.00037520558551242813, 'alpha_W': 0.00019762189340280086, 'l1_ratio': 0.2912291401980419}
KL=137190.12, Calinski=76.79, Params={'n_components': 25, 'alpha_H': 1.3783237455007187e-05, 'alpha_W': 0.0006647135865318024, 'l1_ratio': 0.17052412368729153}
KL=187549.82, Calinski=524.56, Params={'n_components': 6, 'alpha_H': 0.0003058656666978527, 'alpha_W': 1.2681352169084594e-05, 'l1_ratio': 0.9093204020787821}
KL=192338.78, Calinski=708.69, Params={'n_components': 5, 'alpha_H': 3.872118032174584e-05, 'alpha_W': 1.36672729

In [5]:
nmf_best = NMF(
    n_components=best_params["n_components"],
    init="nndsvda",
    solver="mu",
    beta_loss="kullback-leibler",
    max_iter=MAX_ITER,
    random_state=SEED,
    alpha_H=best_params["alpha_H"],
    alpha_W=best_params["alpha_W"],
    l1_ratio=best_params["l1_ratio"]
)

W_best = nmf_best.fit_transform(X.values)
H_best = nmf_best.components_

# Guardar em DataFrames
df_W = pd.DataFrame(W_best, index=X.index, columns=[f"padrao_{i+1}" for i in range(W_best.shape[1])])
df_H = pd.DataFrame(H_best, columns=X.columns, index=[f"padrao_{i+1}" for i in range(H_best.shape[0])])

print(f"✅ df_W e df_H prontos com Optuna (K={best_params['n_components']})")


✅ df_W e df_H prontos com Optuna (K=16)


In [6]:
# ==== Reconstrução final ====
X_hat = W_best @ H_best

# ==== Métricas de erro (NMF final, KL) ====

# Divergência KL
mask = X.values > 0
kl_div = np.sum(
    X.values[mask] * np.log(X.values[mask] / np.maximum(X_hat[mask], 1e-10))
    - X.values[mask] + X_hat[mask]
)

# RMSE e MAE auxiliares
diff = X.values - X_hat
rmse = np.sqrt((diff ** 2).mean())
mae  = np.abs(diff).mean()

print(f"Divergência KL: {kl_div:.6f}")
print(f"RMSE: {rmse:.6f}, MAE: {mae:.6f}")


Divergência KL: 157641.710671
RMSE: 0.761435, MAE: 0.124900


In [7]:
df_W.head()

Unnamed: 0_level_0,padrao_1,padrao_2,padrao_3,padrao_4,padrao_5,padrao_6,padrao_7,padrao_8,padrao_9,padrao_10,padrao_11,padrao_12,padrao_13,padrao_14,padrao_15,padrao_16
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1900-01-01 00:00:00.852,8.795910000000001e-175,0.0,7.816144e-113,3.312757e-105,2.41746,0.0,1.842526e-295,2.031529e-93,8.003952e-111,2.1149850000000002e-156,0.5547209,1.653891e-22,0.0,0.0,0.0,9.030050000000001e-60
1900-01-01 00:00:03.401,2.008556e-118,0.0,1.043267e-65,1.591409e-48,3.0715609999999996e-63,0.0,1.0112270000000001e-114,7.682763e-40,0.8999235,0.2643491,4.876895e-76,0.0,2.4808799999999996e-44,0.0,1.7086600000000001e-103,1.659116
1900-01-01 00:00:05.246,1.765179e-110,5.42712e-160,7.141552e-164,2.0867569999999998e-134,1.303124e-48,3.142231,0.0,1.855168e-58,0.4940969,5.084288e-57,3.468446e-154,4.577991e-16,0.0,0.0,1.813613e-93,1.218121e-78
1900-01-01 00:00:07.974,0.0,0.0,1.647845,1.380885e-72,2.770991e-98,0.0,1.89533e-107,0.0,1.461044e-114,2.536591e-79,2.9116249999999997e-26,1.563063e-13,7.55363e-298,1.764106e-109,9.031715e-95,8.373649e-152
1900-01-01 00:00:08.614,0.6948457,1.020781e-31,0.0,0.3975726,0.0,5.382900999999999e-26,0.3525701,5.852338e-118,1.581834e-101,0.0,1.249196e-55,1.842912e-79,0.3746755,0.0,0.4203163,0.0


In [None]:
df_W.to_csv("nmf_W.csv")
