In [None]:
import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
import re

import time

from pathlib import Path
from scipy.stats import norm
from scipy.stats import gamma
from scipy.stats import beta
from scipy.stats import truncnorm

In [None]:
# Requires the Google Colab runtime: mounts Google Drive at /content/drive.
from google.colab import drive

drive.mount('/content/drive')

# Generación

### Funciones auxiliares

In [None]:
def parametros(dist, *args, shift=0, inv=0, lim_inf=None, lim_sup=None):
    """Pack a distribution specification into a plain dictionary.

    Args:
        dist: Distribution identifier, stored under ``"dist"``.
        *args: Positional distribution parameters; the i-th one (1-based) is
            stored under ``"param{i}"``.
        shift: Accepted but not used by this function.
        inv: Accepted but not used by this function.
        lim_inf: Value stored under ``"lim_inf"`` (keyword-only).
        lim_sup: Value stored under ``"lim_sup"`` (keyword-only).

    Returns:
        dict: keys in the order ``dist``, ``param1`` .. ``paramN``,
        ``lim_inf``, ``lim_sup``.
    """
    numerados = {f"param{pos}": valor for pos, valor in enumerate(args, start=1)}
    return {"dist": dist, **numerados, "lim_inf": lim_inf, "lim_sup": lim_sup}

In [None]:
def dist_muestras(paramsY, var='Y', n_muestras=1000, random_seed = None):
    """Draw ``n_muestras`` values from the distribution described by ``paramsY``.

    Args:
        paramsY: Mapping with a ``'dist'`` key (``'uniforme'``, ``'normal'``,
            ``'halfnormal'``, ``'gamma'`` or ``'beta'``) and the numeric
            parameters under ``'param1'`` / ``'param2'``.
        var: Name of the single column of the returned frame.
        n_muestras: Number of values to draw.
        random_seed: Forwarded to ``np.random.default_rng``.

    Returns:
        pandas.DataFrame with one column named ``var``.

    Raises:
        ValueError: If ``paramsY['dist']`` is not one of the supported names.
    """
    familia = paramsY.get('dist')
    p1 = paramsY.get('param1')
    p2 = paramsY.get('param2')
    generador = np.random.default_rng(seed=random_seed)

    # Draws are deferred behind lambdas so only the requested family consumes
    # random numbers from the generator.
    muestreadores = {
        'uniforme': lambda: generador.uniform(low=p1, high=p2, size=n_muestras),
        'normal': lambda: generador.normal(loc=p1, scale=p2, size=n_muestras),
        # Absolute value of a normal draw centred at param1 with scale param2.
        'halfnormal': lambda: np.abs(generador.normal(loc=p1, scale=p2, size=n_muestras)),
        'gamma': lambda: generador.gamma(shape=p1, scale=p2, size=n_muestras),
        'beta': lambda: generador.beta(a=p1, b=p2, size=n_muestras),
    }

    if familia not in muestreadores:
        raise ValueError(f"Distribución '{familia}' no contemplada.")

    return pd.DataFrame({var: muestreadores[familia]()})

In [None]:
def muestreo(paramsY, var='Y', n_muestras=1000, random_seed = None):
  """Sample ``n_muestras`` values, redrawing those outside ``[lim_inf, lim_sup]``.

  Args:
    paramsY: Distribution spec passed to ``dist_muestras``; its optional
      ``'lim_inf'`` / ``'lim_sup'`` entries bound the accepted values. A missing
      (None) bound is replaced by the min / max of the first draw, so it never
      rejects anything.
    var: Name of the sample column.
    n_muestras: Number of values in the returned frame.
    random_seed: Anything accepted by ``np.random.default_rng``.

  Returns:
    pandas.DataFrame with one column ``var``.

  Notes:
    Rejected values are redrawn until none is left, so the loop does not
    terminate if the distribution has no mass inside the bounds.
  """
  # One generator for the whole call. ``dist_muestras`` forwards its seed argument to
  # ``np.random.default_rng``, which hands back an existing Generator unchanged, so
  # each redraw continues the stream. Re-passing the raw seed replayed the first
  # values of the original sample, producing duplicates and, when the first value
  # was out of bounds, an endless loop.
  rng = np.random.default_rng(random_seed)

  df = dist_muestras(paramsY, var, n_muestras, rng)

  a = paramsY.get('lim_inf')
  b = paramsY.get('lim_sup')
  if a is None:
    a = df[var].min()
  if b is None:
    b = df[var].max()

  # NOTE(review): when the bounds are not ordered (lim_inf > lim_sup, or either is NaN)
  # the sample has always been returned untouched: the former branch for this case
  # looped on a condition that cannot hold. Confirm whether an "excluded interval"
  # was intended.
  if not a <= b:
    return df

  fuera = (df[var] < a) | (df[var] > b)
  while fuera.any():
    df_aux = dist_muestras(paramsY, var, int(fuera.sum()), rng)
    df = pd.concat([df[~fuera], df_aux], ignore_index=True)
    fuera = (df[var] < a) | (df[var] > b)

  return df

### Selección de muestras registradas

In [None]:
# modificar para incluir límites
def R_mid_Y(data, paramsR, var='Y', random_seed = None) -> np.ndarray:
    """Evaluate the registration probability P(R=1 | Y) for every row of ``data``.

    The probability is a density of the family named in ``paramsR['dist']``
    evaluated at ``data[var]`` and divided by its largest value over the sample
    (except ``'uniforme'``, which yields a constant inside an interval).

    Args:
        data: Frame holding the values in column ``var``.
        paramsR: Mapping with ``'dist'`` plus optional ``'param1'``, ``'param2'``,
            ``'param3'``, ``'lim_inf'``, ``'lim_sup'``; missing parameters fall
            back to the defaults coded in each branch.
        var: Column of ``data`` to evaluate.
        random_seed: Accepted for signature compatibility; the computation is
            deterministic and does not use it.

    Returns:
        numpy.ndarray with one value per row of ``data``.

    Raises:
        ValueError: If ``paramsR['dist']`` is not supported.
    """
    dist = paramsR.get('dist')
    x = data[var].values

    if dist == 'uniforme':
        # Constant probability param3 inside [param1, param2], zero outside.
        a = paramsR.get('param1', x.min())
        b = paramsR.get('param2', x.max())
        p = paramsR.get('param3', 1.0)
        probs = np.where((x >= a) & (x <= b), p, 0.0)

    elif dist == 'normal':
        loc = paramsR.get('param1', np.mean(x))
        scale = paramsR.get('param2', np.std(x) + 1e-9)
        a = paramsR.get('lim_inf', None)
        b = paramsR.get('lim_sup', None)
        if a is None and b is None:
            probs = norm.pdf(x, loc=loc, scale=scale)
        else:
            if a is None:
                a = x.min()
            if b is None:
                b = x.max()
            # truncnorm takes its bounds in standardized units, but loc/scale must be
            # given as well; without them the density was evaluated on the standard
            # scale at the raw x values.
            probs = truncnorm.pdf(x, a=(a - loc) / scale, b=(b - loc) / scale,
                                  loc=loc, scale=scale)
        # Rescale so the largest value is ~1, as in the other density branches; a raw
        # density can exceed 1 and would not be a valid Bernoulli probability.
        probs = probs / (probs.max() + 1e-9)

    elif dist == 'halfnormal':
        scale = paramsR.get('param1', np.std(x) + 1e-9)
        probs = norm.pdf(x, loc=0, scale=scale)
        probs = probs / (probs.max() + 1e-9)
        probs[x < 0] = 0

    elif dist == 'beta':
        a = paramsR.get('param1', 2)
        b = paramsR.get('param2', 5)
        # Map the sample onto [0, 1) before evaluating the beta density.
        x_norm = (x - x.min()) / (x.max() - x.min() + 1e-9)
        probs = beta.pdf(x_norm, a, b)
        probs = probs / (probs.max() + 1e-9)

    elif dist == 'gamma':
        shape = paramsR.get('param1', 2.0)   # k
        scale = paramsR.get('param2', 1.0)   # theta
        probs = gamma.pdf(x, a=shape, scale=scale)
        probs = probs / (probs.max() + 1e-9)

    else:
        raise ValueError(f"Distribución '{dist}' no contemplada.")

    return probs

In [None]:
# Muestras observadas
def datos_seleccion(dist_original, dist_registro, var='Y', n_muestras=1000, random_seed=None):
    """Simulate data under the selection mechanism: draw Y, then R given Y.

    Args:
        dist_original: Distribution spec for Y, passed to ``muestreo``.
        dist_registro: Spec of the registration probability, passed to ``R_mid_Y``.
        var: Name of the Y column.
        n_muestras: Number of rows to generate.
        random_seed: Anything accepted by ``np.random.default_rng``.

    Returns:
        pandas.DataFrame with columns ``var`` and ``"R"`` (1 = registered).
    """
    # A single generator feeds both stages. Seeding two generators with the same value
    # made the Bernoulli draws a function of the same random bits that produced Y.
    # ``muestreo`` forwards this object to ``np.random.default_rng``, which returns an
    # existing Generator as is.
    rng = np.random.default_rng(random_seed)

    # Original Y values.
    datos_originales = muestreo(dist_original, var, n_muestras, rng)
    # P(R=1 | Y); `var` must be forwarded, otherwise a column other than 'Y' is not found.
    func_registro = R_mid_Y(datos_originales, dist_registro, var)
    # Registration pattern: one Bernoulli draw per row.
    indicatriz = rng.binomial(n=1, p=func_registro)

    return pd.DataFrame({var: datos_originales[var], "R": indicatriz})

### Mezcla de muestras registradas y no registradas

In [None]:
def datos_mezcla(ProbR, var, dist_R, dist_noR, n_muestras=1000, random_seed=None):
    """Simulate data under the pattern-mixture mechanism: draw R, then Y given R.

    Args:
        ProbR: Probability that a row is registered (R = 1).
        var: Name of the Y column.
        dist_R: Distribution spec of Y for registered rows (passed to ``muestreo``).
        dist_noR: Distribution spec of Y for non-registered rows.
        n_muestras: Number of rows to generate.
        random_seed: Anything accepted by ``np.random.default_rng``.

    Returns:
        pandas.DataFrame with columns ``var`` and ``"R"``.
    """
    # One generator for the indicator and both components. Re-seeding each stage with
    # the same value made two components of the same family share their underlying
    # random draws. ``muestreo`` forwards this object to ``np.random.default_rng``,
    # which returns an existing Generator as is.
    rng = np.random.default_rng(random_seed)
    indicatriz = rng.binomial(n=1, p=ProbR, size=n_muestras)

    registrado = indicatriz == 1
    n_R = int(registrado.sum())
    n_noR = int((~registrado).sum())

    dist_YR = muestreo(dist_R, var, n_R, rng)[var].values
    dist_YnoR = muestreo(dist_noR, var, n_noR, rng)[var].values

    # Mask assignment fills the k-th registered slot with the k-th registered draw
    # (and likewise for the rest), i.e. the same interleaving as a row-by-row loop.
    datos_originales = np.empty(len(indicatriz), dtype=float)
    datos_originales[registrado] = dist_YR
    datos_originales[~registrado] = dist_YnoR

    return pd.DataFrame({var: datos_originales, "R": indicatriz})

### Ejecución con datos de .csv

In [None]:
def genera_datos(input_path=None, input_dict=None, df=None, pct=0.15, n_muestras=1000, indicador_R=0):
    """Generate simulated (Y, R) datasets from one or more simulation specs.

    Each spec row needs ``sim_id``, ``method`` ('mezcla' or 'seleccion', any case),
    ``dist_1``, ``dist_1_param_1``, ``dist_1_param_2``, ``dist_2``,
    ``dist_2_param_1``, ``dist_2_param_2`` and, for 'mezcla', ``P_R``.
    ``lim_inf`` / ``lim_sup`` are optional bounds (missing or NaN means none).

    Args:
        input_path: CSV with the specs. When given, results are written next to it
            under ``Datos_generados`` and ``Datos_observados``.
        input_dict: Single spec, used when neither ``df`` nor ``input_path`` is given.
        df: Specs already loaded as a DataFrame (takes precedence for the specs).
        pct: Fraction of non-registered rows flagged as followed up (``S = 1``).
        n_muestras: Rows per simulated dataset.
        indicador_R: In-memory mode only; 0 returns the generated frame as is, any
            other value returns its ``['Y', 'R']`` columns.

    Returns:
        Without ``input_path``: the frame generated for the first spec (nothing is
        written). With ``input_path``: files are written for every spec and the
        ``['Y', 'R']`` frame of the first spec is returned (None if there are no specs).

    Raises:
        ValueError: If no spec source is given or ``method`` is not supported.
    """
    def _limite(valor):
        # Empty CSV cells arrive as NaN, while the sampling helpers test bounds with
        # `is None`.
        return None if pd.isna(valor) else valor

    if df is None:
        if input_path is not None:
            df = pd.read_csv(input_path)
        elif input_dict is not None:
            df = pd.DataFrame([input_dict])
        else:
            raise ValueError("Se requiere 'input_path', 'input_dict' o 'df'.")

    # Output folders depend only on input_path, so they are prepared even when the
    # specs arrive through `df` (they used to be undefined in that case).
    if input_path is not None:
        input_path = Path(input_path)
        output_dir = input_path.parent / 'Datos_generados'
        output_dir.mkdir(parents=True, exist_ok=True)
        observed_dir = input_path.parent / 'Datos_observados'
        observed_dir.mkdir(parents=True, exist_ok=True)

    primer_resultado = None

    for _, row in df.iterrows():

        sim_id = row['sim_id']
        method = row['method'].lower()

        # The limits are keyword-only in `parametros`; passed positionally they were
        # stored as param3/param4 and the bounds stayed unset.
        lim_inf = _limite(row.get('lim_inf'))
        lim_sup = _limite(row.get('lim_sup'))
        dist_1 = parametros(row['dist_1'], row['dist_1_param_1'], row['dist_1_param_2'],
                            lim_inf=lim_inf, lim_sup=lim_sup)
        dist_2 = parametros(row['dist_2'], row['dist_2_param_1'], row['dist_2_param_2'],
                            lim_inf=lim_inf, lim_sup=lim_sup)

        if method == 'mezcla':
            YR = datos_mezcla(row['P_R'], 'Y', dist_1, dist_2, n_muestras=n_muestras)
        elif method == 'seleccion':
            YR = datos_seleccion(dist_1, dist_2, n_muestras=n_muestras)
        else:
            raise ValueError(f"Método '{method}' no contemplado.")

        # In-memory mode: hand back the first simulation without touching the disk.
        if input_path is None:
            return YR if indicador_R == 0 else YR[['Y', 'R']]

        # Attach the spec to every generated row.
        for col, value in row.items():
            YR[col] = value

        YR.to_csv(output_dir / f"{sim_id}.csv", index=False)

        # Follow-up indicator S: starts as a copy of R, then a fraction `pct` of the
        # non-registered rows is switched to 1.
        YR_seg = YR.copy()
        idx_R = YR_seg.columns.get_loc("R")
        YR_seg.insert(idx_R + 1, "S", YR_seg["R"])
        idx_aus = YR_seg.index[YR_seg["S"] == 0]
        idx_flip = np.random.choice(idx_aus, size=int(pct * len(idx_aus)), replace=False)
        YR_seg.loc[idx_flip, "S"] = 1
        YR_seg.to_csv(output_dir / f"{sim_id}_seg.csv", index=False)

        # Observed data: Y of registered rows, and Y of the followed-up rows.
        YR_obs = YR.loc[YR["R"] == 1, "Y"]
        YR_obs.to_csv(observed_dir / f"{sim_id}.csv", index=False)

        YR_obs_seg = YR_seg.loc[(YR_seg["R"] == 0) & (YR_seg["S"] == 1), "Y"]
        YR_obs_seg.to_csv(observed_dir / f"{sim_id}_seg.csv", index=False)

        # The return used to sit inside the loop, so only the first spec was ever
        # generated. Every spec is processed now; the returned value is unchanged.
        if primer_resultado is None:
            primer_resultado = YR[['Y', 'R']]

    return primer_resultado

# Modelado



## Modelado de la distribución conjunta $(Y,R)$

In [None]:
def prior_theta(nombre, prior):
    """Create the PyMC prior named ``nombre`` described by the mapping ``prior``.

    ``prior['dist']`` selects the family. Parameters are read by name, falling
    back to the positional ``param1`` / ``param2`` entries:

    * ``"flat"``: no parameters.
    * ``"uniforme"``: ``lower`` (or ``param1``) and ``upper`` (or ``param2``).
    * ``"halfnormal"``: ``sigma`` (or ``param1``).
    * ``"normal"``: ``mu`` (or ``param1``) and ``sigma`` (or ``param2``).

    Must be called inside a ``pm.Model`` context.

    Raises:
        ValueError: If a required parameter is missing or the family is unknown.
    """
    familia = prior.get('dist')

    def _campo(clave, alias):
        # Named entry first, positional alias second.
        return prior.get(clave, prior.get(alias))

    # Non-informative prior.
    if familia == "flat":
        return pm.Flat(nombre)

    # Bounded parameter.
    if familia == "uniforme":
        lower = _campo('lower', 'param1')
        upper = _campo('upper', 'param2')
        if lower is None or upper is None:
            raise ValueError(f"Uniform prior '{nombre}' requires 'lower' and 'upper'")
        return pm.Uniform(nombre, lower=lower, upper=upper)

    # Strictly positive parameter.
    if familia == "halfnormal":
        sigma = _campo('sigma', 'param1')
        if sigma is None:
            raise ValueError(f"HalfNormal prior '{nombre}' requires 'sigma'")
        return pm.HalfNormal(nombre, sigma=sigma)

    # Rough idea of the expected value.
    if familia == "normal":
        mu = _campo('mu', 'param1')
        sigma = _campo('sigma', 'param2')
        if mu is None or sigma is None:
            raise ValueError(f"Normal prior '{nombre}' requires 'mu' and 'sigma'")
        return pm.Normal(nombre, mu=mu, sigma=sigma)

    raise ValueError(f"Distribución a priori no reconocida: {familia}")

### Modelo de mezcla de patrones

In [None]:
def Dist_Y(valores, var, hipY, theta1=None, theta2=None, lim_inf=None, lim_sup=None):
  """Register the observed variable ``var`` with the likelihood family ``hipY``.

  Args:
    valores: Observed values.
    var: Name of the observed variable in the model.
    hipY: One of 'uniforme', 'normal', 'TruncNormal', 'HalfNormal', 'gamma', 'beta'.
    theta1: First parameter (lower / mu / sigma / alpha, depending on the family).
    theta2: Second parameter (upper / sigma / gamma scale / beta).
    lim_inf: Lower truncation bound, used by 'TruncNormal' only.
    lim_sup: Upper truncation bound, used by 'TruncNormal' only.

  Must be called inside a ``pm.Model`` context.

  Raises:
    ValueError: If ``hipY`` is not one of the supported names.
  """
  # Constructors are deferred so only the requested family is added to the model.
  constructores = {
    'uniforme': lambda: pm.Uniform(var, lower=theta1, upper=theta2, observed=valores),
    'normal': lambda: pm.Normal(var, mu=theta1, sigma=theta2, observed=valores),
    'TruncNormal': lambda: pm.TruncatedNormal(var, mu=theta1,sigma=theta2, lower=lim_inf, upper=lim_sup, observed=valores),
    'HalfNormal': lambda: pm.HalfNormal(var, sigma=theta1, observed=valores),
    # theta2 is a scale, hence the rate 1/theta2.
    'gamma': lambda: pm.Gamma(var, alpha=theta1, beta=1/theta2, observed=valores),
    'beta': lambda: pm.Beta(var, alpha=theta1, beta=theta2, observed=valores),
  }

  if hipY not in constructores:
    raise ValueError(f"Distribución {hipY} no contemplada")

  return constructores[hipY]()

In [None]:
def EstimaMezcla(df, hipYobs, hipYaus, dicPriors, dicPosteriors=None,
                 var='Y', lim_inf=None, lim_sup=None, seguimiento=0, df_seg=None, show=0):
    """Fit the pattern-mixture model and return the posterior draws.

    Args:
        df: Frame with the registered observations in column ``var``.
        hipYobs: Likelihood family for the registered values (see ``Dist_Y``).
        hipYaus: Likelihood family for the followed-up values (``seguimiento=1`` only).
        dicPriors: Ordered mapping ``name -> prior``. A number is used as a constant;
            anything else is passed to ``prior_theta``. The first two entries
            parameterize the registered component, the next two the followed-up one.
        dicPosteriors: Optional draws from a previous fit. Either a mapping
            ``name -> object with .values`` or the result of ``pm.sample``; a
            parameter found here is resampled from those draws instead of its prior.
        var: Column holding the values in ``df`` / ``df_seg``.
        lim_inf, lim_sup: Truncation bounds forwarded to ``Dist_Y``.
        seguimiento: 0 fits the registered data only (first two priors); 1 also
            fits ``df_seg`` with the third and fourth parameters.
        df_seg: Followed-up observations; required when ``seguimiento == 1``.
        show: Unused.

    Returns:
        The InferenceData produced by ``pm.sample``.

    Raises:
        ValueError: If ``seguimiento == 1`` and ``df_seg`` is None.
    """
    if seguimiento == 1 and df_seg is None:
        raise ValueError("seguimiento=1 requiere 'df_seg' con las muestras de seguimiento.")

    # NOTE(review): callers pass the object returned by pm.sample. An InferenceData is
    # indexed by group name, not by variable name, so the lookup below never matched and
    # the previous posterior was ignored. Use its `posterior` group when present.
    if hasattr(dicPosteriors, "posterior"):
        dicPosteriors = dicPosteriors.posterior

    if seguimiento == 0:
        priors_iter = list(dicPriors.items())[:2]
    else:
        priors_iter = list(dicPriors.items())

    with pm.Model() as modelo:

        theta = []

        for nombre, prior in priors_iter:
            if isinstance(prior, (int, float, np.number)):
                theta_i = prior  # Treat as constant
            elif dicPosteriors and nombre in dicPosteriors:
                # Empirical prior: pick one of the previous draws uniformly at random.
                muestras_aux = dicPosteriors[nombre].values.reshape(-1)
                muestras_i = pm.math.constant(muestras_aux)
                aux_i = pm.Categorical(f'{nombre}_aux', p=np.ones(len(muestras_aux)) / len(muestras_aux))
                theta_i = pm.Deterministic(nombre, muestras_i[aux_i])
            else:
                theta_i = prior_theta(nombre, prior)

            theta.append(theta_i)

        # Likelihood terms, depending on whether follow-up data is used.
        if seguimiento == 0:
            Dist_Y(df[var].values, var, hipYobs, theta1=theta[0], theta2=theta[1],
                   lim_inf=lim_inf, lim_sup=lim_sup)

        elif seguimiento == 1:
            Dist_Y(df[var].values, 'Y_obs', hipYobs, theta1=theta[0], theta2=theta[1],
                   lim_inf=lim_inf, lim_sup=lim_sup)
            Dist_Y(df_seg[var].values, 'Y_aus', hipYaus, theta1=theta[2], theta2=theta[3],
                   lim_inf=lim_inf, lim_sup=lim_sup)

        # Sample posterior
        muestras_posterior = pm.sample(init="adapt_diag", progressbar=True, return_inferencedata=True, tune=1000, draws = 1000)

    return muestras_posterior

### Modelo de selección

In [None]:
# Hipótesis sobre P(Y)
def logDist_Y(Y, hipY, param_1, param_2):
  """Log-density of ``Y`` under the assumed family ``hipY``.

  ``hipY`` is one of 'uniforme' (lower, upper), 'normal' (mu, sigma),
  'gamma' (alpha, scale) or 'beta' (alpha, beta); ``param_1`` and ``param_2``
  are read in that order.

  Raises:
    ValueError: If ``hipY`` is not supported.
  """
  if hipY == 'uniforme':
    supuesta = pm.Uniform.dist(lower=param_1, upper=param_2)
  elif hipY == 'normal':
    supuesta = pm.Normal.dist(mu=param_1, sigma=param_2)
  elif hipY == 'gamma':
    # param_2 is a scale, hence the rate 1/param_2.
    supuesta = pm.Gamma.dist(alpha=param_1, beta=1/param_2)
  elif hipY == 'beta':
    supuesta = pm.Beta.dist(alpha=param_1, beta=param_2)
  else:
    raise ValueError(f"Unknown hipY: {hipY}")

  return pm.logp(supuesta, Y)

# Hipótesis sobre P(R|Y)
def logDist_RmidY(R, Y, hipR, param_1, param_2):
  """Log-probability of the registration indicator ``R`` given ``Y``.

  The density of family ``hipR`` ('normal', 'beta' or 'gamma') evaluated at ``Y``
  is clipped to ``[1e-9, 1 - 1e-9]`` and used as P(R=1 | Y); the result is its
  log where ``R == 1`` and the log of its complement elsewhere.

  Raises:
    ValueError: If ``hipR`` is not supported.
  """
  # Constructors are deferred so an unknown family fails before anything is built.
  fabricas = {
    "normal": lambda: pm.Normal.dist(mu=param_1, sigma=param_2),
    "beta": lambda: pm.Beta.dist(alpha=param_1, beta=param_2),
    # param_2 is a scale, hence the rate 1/param_2.
    "gamma": lambda: pm.Gamma.dist(alpha=param_1, beta=1/param_2),
  }
  if hipR not in fabricas:
    raise ValueError(f"Unknown hipR: {hipR}")

  log_densidad = pm.logp(fabricas[hipR](), Y)
  # Keep the value strictly inside (0, 1) so both logs below are finite.
  acotada = pt.clip(pt.exp(log_densidad), 1e-9, 1 - 1e-9)
  es_registrado = pt.eq(pt.as_tensor_variable(R), 1)
  return pt.switch(es_registrado, pt.log(acotada), pt.log1p(-acotada))

In [None]:
def EstimaSeleccion(df, hipY, hipR, dicPriors, dicPosteriors=None, var='Y', seguimiento=0, df_seg=None, show=0):
    """Fit the selection model P(Y) * P(R | Y) and return the posterior draws.

    Args:
        df: Frame with the registered observations in column ``var``.
        hipY: Assumed family for Y (see ``logDist_Y``).
        hipR: Assumed family for the registration probability (see ``logDist_RmidY``).
        dicPriors: Ordered mapping ``name -> prior`` with at least four entries:
            the first two parameterize P(Y), the next two P(R | Y). A number is used
            as a constant; anything else is passed to ``prior_theta``.
        dicPosteriors: Optional draws from a previous fit. Either a mapping
            ``name -> object with .values`` or the result of ``pm.sample``.
        var: Column holding the values in ``df`` / ``df_seg``.
        seguimiento: 0 uses the registered data only (all rows with R = 1); any other
            value also uses ``df_seg`` as rows with R = 0.
        df_seg: Followed-up observations; required when ``seguimiento != 0``.
        show: Unused.

    Returns:
        The InferenceData produced by ``pm.sample``.

    Raises:
        ValueError: If ``seguimiento != 0`` and ``df_seg`` is None.
    """
    if seguimiento != 0 and df_seg is None:
        raise ValueError("El seguimiento requiere 'df_seg' con las muestras de seguimiento.")

    dicPosteriors = {} if dicPosteriors is None else dicPosteriors
    # NOTE(review): callers pass the object returned by pm.sample. An InferenceData is
    # indexed by group name, not by variable name, so the lookup below never matched and
    # the previous posterior was ignored. Use its `posterior` group when present.
    if hasattr(dicPosteriors, "posterior"):
        dicPosteriors = dicPosteriors.posterior

    with pm.Model() as modelo:

        theta = []

        for nombre, prior in dicPriors.items():
            if isinstance(prior, (int, float, np.number)):
                theta_i = prior
            elif dicPosteriors and nombre in dicPosteriors:
                # Empirical prior: pick one of the previous draws uniformly at random.
                muestras_aux = dicPosteriors[nombre].values.reshape(-1)
                muestras_i = pm.math.constant(muestras_aux)
                aux_i = pm.Categorical(f'{nombre}_aux', p=np.ones(len(muestras_aux)) / len(muestras_aux))
                theta_i = pm.Deterministic(nombre, muestras_i[aux_i])
            else:
                theta_i = prior_theta(nombre, prior)

            theta.append(theta_i)

        # Both terms are reduced to scalars before being added. In follow-up mode the
        # R|Y term was already a scalar while the Y term was a per-row vector, so the
        # sum broadcast it and counted it once per row.
        if seguimiento == 0:
            registro = df[var].values
            logY = pm.math.sum(logDist_Y(registro, hipY, theta[0], theta[1]))
            logRmidY = pm.math.sum(logDist_RmidY(1, registro, hipR, theta[2], theta[3]))
        else:
            registro_obs = df[var].values
            registro_aus = df_seg[var].values
            registro_total = pd.concat([df[var], df_seg[var]], ignore_index=True).values

            logY = pm.math.sum(logDist_Y(registro_total, hipY, theta[0], theta[1]))
            logRmidY = (
                pm.math.sum(logDist_RmidY(1, registro_obs, hipR, theta[2], theta[3])) +
                pm.math.sum(logDist_RmidY(0, registro_aus, hipR, theta[2], theta[3])))

        # Joint log-likelihood enters the model as a Potential.
        pm.Potential("likelihood", logY + logRmidY)

        # Sampling
        muestras_posterior = pm.sample(init="adapt_diag", progressbar=True, return_inferencedata=True, tune=1000, draws = 1000)

    return muestras_posterior

## Definición de hipótesis / priors

In [None]:
def get_prior(row):
    """Translate one row of the hypothesis table into a prior.

    Reads ``row.prior_dist``, ``row.param1`` and ``row.param2`` (missing values
    become None, the rest are cast to float).

    Returns:
        * ``"const"``: the first parameter itself (a float, or None).
        * ``"flat"``: ``parametros("flat")``.
        * otherwise: ``parametros(dist, param1)`` when the second parameter is
          missing, else ``parametros(dist, param1, param2)``.
    """
    def _numero(valor):
        return None if pd.isna(valor) else float(valor)

    familia = row.prior_dist
    primero = _numero(row.param1)
    segundo = _numero(row.param2)

    if familia == "const":
        return primero
    if familia == "flat":
        return parametros("flat")

    argumentos = (primero,) if segundo is None else (primero, segundo)
    return parametros(familia, *argumentos)

In [None]:
def get_hypothesis(sim, method):
    """Collect the hypotheses and prior sets defined for ``method``.

    Rows of ``sim`` whose ``method`` column equals ``method`` are grouped by
    ``hyp_id`` and, inside each hypothesis, by ``prior_id``.

    Returns:
        A tuple of three lists aligned by hypothesis:

        * ``(dist_1, dist_2)`` taken from the first row of the hypothesis;
        * a list with one dict per ``prior_id``: ``{'sim_id': ..., theta: prior}``
          where each prior comes from ``get_prior``;
        * ``[lim_inf, lim_sup]`` taken from the first row of the hypothesis.
    """
    filas_metodo = sim[sim.method == method]
    assumed_dists, chosen_priors, lims_list = [], [], []

    for _, grupo_hip in filas_metodo.groupby("hyp_id"):
        assumed_dists.append((grupo_hip["dist_1"].iloc[0], grupo_hip["dist_2"].iloc[0]))
        lims_list.append([grupo_hip["lim_inf"].iloc[0], grupo_hip["lim_sup"].iloc[0]])

        priors_hip = []
        for _, grupo_prior in grupo_hip.groupby("prior_id"):
            priors = {'sim_id': grupo_prior["sim_id"].iloc[0]}
            for _, fila in grupo_prior.iterrows():
                priors[fila["theta"]] = get_prior(fila)
            priors_hip.append(priors)
        chosen_priors.append(priors_hip)

    return assumed_dists, chosen_priors, lims_list

## Ejecución modelos

### Modelo base

In [None]:
def Estima_thetas(dir_sim, label, idx_iter=None):
    """Fit every mixture and selection hypothesis to each observed dataset.

    Reads ``Hip_{label}.csv`` from ``dir_sim``, fits each (hypothesis, prior set)
    pair to every CSV in ``dir_sim / "Datos_observados"`` whose name does not end in
    ``seg.csv``, and writes one summary row per fit.

    Args:
        dir_sim: ``pathlib.Path`` of the simulation folder.
        label: Suffix of the hypothesis file and of the output file.
        idx_iter: Optional run index. When given, results go to
            ``Ejecuciones/Estimaciones_theta`` and the file name gets
            ``_{idx_iter+1}``; otherwise they go to ``Estimaciones_theta``.

    Returns:
        pandas.DataFrame with one row per fit (also written to
        ``Estim_{label}[_{idx_iter+1}].csv``).
    """
    def _limite(valor):
        # Empty CSV cells arrive as NaN; the model expects None for "no bound".
        return None if pd.isna(valor) else valor

    def _theta(posterior_df, priors, nombre):
        # Posterior mean when the parameter was sampled; otherwise the constant (or the
        # first parameter of its prior), as the follow-up variants of this function do.
        if nombre in posterior_df.columns:
            return posterior_df[nombre].mean()
        previo = priors.get(nombre)
        if isinstance(previo, (int, float, np.number)):
            return float(previo)
        if isinstance(previo, dict) and previo.get("param1") is not None:
            return float(previo["param1"])
        return np.nan

    def _registro(sim_label, method, dist_1, dist_2, posterior, priors, lims, segundos):
        # One output row summarising a fit.
        posterior_df = az.extract(posterior, group="posterior").to_dataframe()
        return {"sim_id": sim_label,
                "method": method,
                "P_R": 0.5,
                "dist_1": dist_1,
                "dist_1_param_1": _theta(posterior_df, priors, "theta_0"),
                "dist_1_param_2": _theta(posterior_df, priors, "theta_1"),
                "dist_2": dist_2,
                "dist_2_param_1": _theta(posterior_df, priors, "theta_2"),
                "dist_2_param_2": _theta(posterior_df, priors, "theta_3"),
                "lim_inf": lims[0],
                "lim_sup": lims[1],
                "time": segundos}

    modelos = []

    hyp_path = pd.read_csv(dir_sim / f"Hip_{label}.csv")
    assumed_dists_m, chosen_priors_m, lims_m = get_hypothesis(hyp_path, "Mezcla")
    assumed_dists_s, chosen_priors_s, lims_s = get_hypothesis(hyp_path, "Seleccion")

    datos_dir = dir_sim / f"Datos_observados"
    datos_dir.mkdir(parents=True, exist_ok=True)

    if idx_iter is None:
        thetas_dir = dir_sim / f"Estimaciones_theta"
    else:
        thetas_dir = dir_sim / f"Ejecuciones/Estimaciones_theta"
    thetas_dir.mkdir(parents=True, exist_ok=True)

    ind_iter = f"_{idx_iter+1}" if idx_iter is not None else ""

    for path in datos_dir.glob("*.csv"):
        # Follow-up files are handled by the follow-up variant of this function.
        if path.name.endswith("seg.csv"):
            continue

        df = pd.read_csv(path)
        name = path.stem

        for ((hipYobs, hipYaus), priors_list, lims) in zip(assumed_dists_m, chosen_priors_m, lims_m):
            for dicPriors in priors_list:

                dicPriors_local = dicPriors.copy()
                sim_id = dicPriors_local.pop("sim_id")

                tic = time.time()
                # The hypothesis limits are forwarded to the model; they were only copied
                # to the output before, so a truncated hypothesis was fitted unbounded.
                muestras_posterior = EstimaMezcla(df, hipYobs, hipYaus, dicPriors_local,
                                                  lim_inf=_limite(lims[0]), lim_sup=_limite(lims[1]))
                tac = time.time()

                modelos.append(_registro(f'{name}_{sim_id}', "Mezcla", hipYobs, hipYaus,
                                         muestras_posterior, dicPriors_local, lims, tac - tic))

        for ((dist_Y, dist_RmidY), priors_list, lims) in zip(assumed_dists_s, chosen_priors_s, lims_s):
            for dicPriors in priors_list:

                dicPriors_local = dicPriors.copy()
                sim_id = dicPriors_local.pop("sim_id")

                tic = time.time()
                muestras_posterior = EstimaSeleccion(df, dist_Y, dist_RmidY, dicPriors_local)
                tac = time.time()

                modelos.append(_registro(f'{name}_{sim_id}', "Seleccion", dist_Y, dist_RmidY,
                                         muestras_posterior, dicPriors_local, lims, tac - tic))

    modelos_df = pd.DataFrame(modelos)
    modelos_df.to_csv(thetas_dir/f"Estim_{label}{ind_iter}.csv", index=False)

    return modelos_df


### Seguimiento

In [None]:
def Estima_thetas_seguimiento(dir_sim, label, idx_iter=None, pct=0.15):
    """Fit every hypothesis using the registered data plus its follow-up samples.

    For each CSV in ``dir_sim / "Datos_observados"`` that is not itself a follow-up
    file, every file matching ``*{name}_*seg.csv`` is used as follow-up data. The
    text between ``{name}_`` and the trailing ``seg`` is read as the follow-up
    fraction (``pct`` when empty).

    Args:
        dir_sim: ``pathlib.Path`` of the simulation folder.
        label: Suffix of the hypothesis file (``Hip_{label}.csv``) and of the output.
        idx_iter: Optional run index. When given, results go to
            ``Ejecuciones/Estimaciones_theta_seg`` and the file name gets
            ``_{idx_iter+1}``; otherwise they go to ``Estimaciones_theta_seg``.
        pct: Follow-up fraction assumed when the file name carries none.

    Returns:
        pandas.DataFrame with one row per fit (also written to
        ``Estim_{label}_seg[_{idx_iter+1}].csv``).
    """
    modelos = []

    def get_theta_value(posterior_df, dicPriors_local, theta_name):
        # Posterior mean when sampled; otherwise the constant or the prior's param1.
        if theta_name in posterior_df.columns:
            return posterior_df[theta_name].mean()

        prior_value = dicPriors_local.get(theta_name)
        if isinstance(prior_value, (int, float, np.number)):
            return float(prior_value)
        if isinstance(prior_value, dict) and "param1" in prior_value:
            return float(prior_value["param1"])

        return np.nan

    def normalize_pct(raw_pct, default_pct):
        # Values above 1 are read as percentages; the result is a compact string
        # such as "0.15".
        if raw_pct is None or raw_pct == "":
            value = float(default_pct)
        else:
            value = float(raw_pct)
            if value > 1:
                value = value / 100
        return f"{value:.2f}".rstrip("0").rstrip(".")

    def extract_pct(seg_stem, name):
        # Take what follows the known dataset name instead of matching from the first
        # underscore of the stem: ids containing "_" used to leak into the number
        # (e.g. "sim_1_0.15seg" was read as 10.15) or make float() fail.
        sufijo = seg_stem.rsplit(f"{name}_", 1)[-1]
        if sufijo.endswith("seg"):
            sufijo = sufijo[:-len("seg")]
        return sufijo.strip("_")

    def limite(valor):
        # Empty CSV cells arrive as NaN; the model expects None for "no bound".
        return None if pd.isna(valor) else valor

    def build_record(sim_label, method, dist_1, dist_2, posterior, priors, lims, elapsed):
        # One output row summarising a fit.
        posterior_df = az.extract(posterior, group="posterior").to_dataframe()
        return {"sim_id": sim_label,
                "method": method,
                "P_R": 0.5,
                "dist_1": dist_1,
                "dist_1_param_1": get_theta_value(posterior_df, priors, "theta_0"),
                "dist_1_param_2": get_theta_value(posterior_df, priors, "theta_1"),
                "dist_2": dist_2,
                "dist_2_param_1": get_theta_value(posterior_df, priors, "theta_2"),
                "dist_2_param_2": get_theta_value(posterior_df, priors, "theta_3"),
                "lim_inf": lims[0],
                "lim_sup": lims[1],
                "time": elapsed}

    hyp_path = pd.read_csv(dir_sim / f"Hip_{label}.csv")
    assumed_dists_m, chosen_priors_m, lims_m = get_hypothesis(hyp_path, "Mezcla")
    assumed_dists_s, chosen_priors_s, lims_s = get_hypothesis(hyp_path, "Seleccion")

    datos_dir = dir_sim / f"Datos_observados"

    if idx_iter is None:
        thetas_dir = dir_sim / f"Estimaciones_theta_seg"
    else:
        thetas_dir = dir_sim / f"Ejecuciones/Estimaciones_theta_seg"
    thetas_dir.mkdir(parents=True, exist_ok=True)

    ind_iter = f"_{idx_iter+1}" if idx_iter is not None else ""

    for path in datos_dir.glob("*.csv"):
        if path.name.endswith("seg.csv"):
            continue

        df = pd.read_csv(path)

        name = path.stem
        for seg_path in datos_dir.glob(f"*{name}_*seg.csv"):

            pct_str = normalize_pct(extract_pct(Path(seg_path).stem, name), pct)
            df_seg = pd.read_csv(seg_path)

            for ((hipYobs, hipYaus), priors_list, lims) in zip(assumed_dists_m, chosen_priors_m, lims_m):
                for dicPriors in priors_list:

                    dicPriors_local = dicPriors.copy()
                    sim_id = dicPriors_local.pop("sim_id")

                    tic = time.time()
                    # Forward the hypothesis limits so truncated hypotheses are bounded.
                    muestras_posterior = EstimaMezcla(df, hipYobs, hipYaus, dicPriors_local,
                                                      lim_inf=limite(lims[0]), lim_sup=limite(lims[1]),
                                                      seguimiento=1, df_seg=df_seg)
                    tac = time.time()

                    modelos.append(build_record(f'{name}_{sim_id}_{pct_str}_seg', "Mezcla",
                                                hipYobs, hipYaus, muestras_posterior,
                                                dicPriors_local, lims, tac - tic))

            for ((dist_Y, dist_RmidY), priors_list, lims) in zip(assumed_dists_s, chosen_priors_s, lims_s):
                for dicPriors in priors_list:

                    dicPriors_local = dicPriors.copy()
                    sim_id = dicPriors_local.pop("sim_id")

                    tic = time.time()
                    muestras_posterior = EstimaSeleccion(df, dist_Y, dist_RmidY, dicPriors_local, seguimiento=1, df_seg=df_seg)
                    tac = time.time()

                    modelos.append(build_record(f'{name}_{sim_id}_{pct_str}_seg', "Seleccion",
                                                dist_Y, dist_RmidY, muestras_posterior,
                                                dicPriors_local, lims, tac - tic))

    modelos_df = pd.DataFrame(modelos)
    modelos_df.to_csv(thetas_dir/f"Estim_{label}_seg{ind_iter}.csv", index=False)

    return modelos_df

### Actualización priors

In [None]:
def Estima_thetas_updatepriors(dir_sim, label, n_iter=1, idx_iter=None, seguimiento=0, df_seg=None):
    """Fit every hypothesis ``n_iter`` times, feeding each fit the previous posterior.

    The first iteration uses the priors from ``Hip_{label}.csv``; each later one
    passes the result of the previous fit as ``dicPosteriors``. One summary row is
    produced per iteration, with ``sim_id`` ending in the 0-based iteration index.

    Args:
        dir_sim: ``pathlib.Path`` of the simulation folder.
        label: Suffix of the hypothesis file and of the output file.
        n_iter: Number of consecutive fits per (dataset, hypothesis, prior set).
        idx_iter: Optional run index. When given, results go under ``Ejecuciones/``
            and the file name gets ``_{idx_iter+1}``.
        seguimiento: 1 to load a follow-up file ``{name}_*seg.csv`` for each dataset
            (output folder gets the ``_seg`` suffix); otherwise ``df_seg`` is used as
            given.
        df_seg: Follow-up frame forwarded to the models when ``seguimiento != 1``.

    Returns:
        pandas.DataFrame with one row per fit (also written to
        ``Estim_{label}[_{idx_iter+1}].csv``).

    Raises:
        FileNotFoundError: If ``seguimiento == 1`` and a dataset has no follow-up file.
    """
    modelos = []

    def get_theta_value(posterior_df, dicPriors_local, theta_name):
        # Posterior mean when sampled; otherwise the constant or the prior's param1.
        if theta_name in posterior_df.columns:
            return posterior_df[theta_name].mean()

        prior_value = dicPriors_local.get(theta_name)
        if isinstance(prior_value, (int, float, np.number)):
            return float(prior_value)
        if isinstance(prior_value, dict) and "param1" in prior_value:
            return float(prior_value["param1"])

        return np.nan

    def limite(valor):
        # Empty CSV cells arrive as NaN; the model expects None for "no bound".
        return None if pd.isna(valor) else valor

    def build_record(sim_label, method, dist_1, dist_2, posterior, priors, lims, elapsed):
        # One output row summarising a fit.
        posterior_df = az.extract(posterior, group="posterior").to_dataframe()
        return {"sim_id": sim_label,
                "method": method,
                "P_R": 0.5,
                "dist_1": dist_1,
                "dist_1_param_1": get_theta_value(posterior_df, priors, "theta_0"),
                "dist_1_param_2": get_theta_value(posterior_df, priors, "theta_1"),
                "dist_2": dist_2,
                "dist_2_param_1": get_theta_value(posterior_df, priors, "theta_2"),
                "dist_2_param_2": get_theta_value(posterior_df, priors, "theta_3"),
                "lim_inf": lims[0],
                "lim_sup": lims[1],
                "time": elapsed}

    hyp_path = pd.read_csv(dir_sim / f"Hip_{label}.csv")
    assumed_dists_m, chosen_priors_m, lims_m = get_hypothesis(hyp_path, "Mezcla")
    assumed_dists_s, chosen_priors_s, lims_s = get_hypothesis(hyp_path, "Seleccion")

    datos_dir = dir_sim / f"Datos_observados"
    datos_dir.mkdir(parents=True, exist_ok=True)

    ind_seg = "_seg" if seguimiento == 1 else ""

    if idx_iter is None:
        thetas_dir = dir_sim / f"Estimaciones_theta_updatepriors{ind_seg}"
    else:
        thetas_dir = dir_sim / f"Ejecuciones/Estimaciones_theta_updatepriors{ind_seg}"
    thetas_dir.mkdir(parents=True, exist_ok=True)

    ind_iter = f"_{idx_iter+1}" if idx_iter is not None else ""

    for path in datos_dir.glob("*.csv"):
        if path.name.endswith("seg.csv"):
            continue

        df = pd.read_csv(path)
        name = path.stem

        if seguimiento == 1:
            # Sorted so the file picked does not depend on directory listing order.
            seg_candidates = sorted(datos_dir.glob(f"{name}_*seg.csv"))
            if not seg_candidates:
                raise FileNotFoundError(f"No se encontro seguimiento para {name} en {datos_dir}")
            df_seg_local = pd.read_csv(seg_candidates[0])
        else:
            df_seg_local = df_seg

        for ((hipYobs, hipYaus), priors_list, lims) in zip(assumed_dists_m, chosen_priors_m, lims_m):
            for dicPriors in priors_list:

                dicPriors_local = dicPriors.copy()
                sim_id = dicPriors_local.pop("sim_id")

                muestras_posterior = None

                for iter_idx in range(n_iter):
                    tic = time.time()
                    # `muestras_posterior` is None on the first pass (plain priors) and the
                    # previous fit afterwards. The hypothesis limits are forwarded so that
                    # truncated hypotheses are actually bounded.
                    muestras_posterior = EstimaMezcla(df, hipYobs, hipYaus, dicPriors_local, muestras_posterior,
                                                      lim_inf=limite(lims[0]), lim_sup=limite(lims[1]),
                                                      seguimiento=seguimiento, df_seg=df_seg_local)
                    tac = time.time()

                    modelos.append(build_record(f"{name}_{sim_id}_{iter_idx}", "Mezcla",
                                                hipYobs, hipYaus, muestras_posterior,
                                                dicPriors_local, lims, tac - tic))

        for ((dist_Y, dist_RmidY), priors_list, lims) in zip(assumed_dists_s, chosen_priors_s, lims_s):
            for dicPriors in priors_list:

                dicPriors_local = dicPriors.copy()
                sim_id = dicPriors_local.pop("sim_id")

                muestras_posterior = None
                for iter_idx in range(n_iter):

                    tic = time.time()
                    muestras_posterior = EstimaSeleccion(df, dist_Y, dist_RmidY, dicPriors_local, muestras_posterior,
                                                         seguimiento=seguimiento, df_seg=df_seg_local)
                    tac = time.time()

                    modelos.append(build_record(f"{name}_{sim_id}_{iter_idx}", "Seleccion",
                                                dist_Y, dist_RmidY, muestras_posterior,
                                                dicPriors_local, lims, tac - tic))

    modelos_df = pd.DataFrame(modelos)
    modelos_df.to_csv(thetas_dir / f"Estim_{label}{ind_iter}.csv", index=False)

    return modelos_df

### Sampleo Gibbs

In [None]:
def genera_datos_gibbs(input_path=None, input_dict=None, df=None,
                       pct=0.15, n_muestras=1000, P_R=0.5):
    """Generate synthetic (Y, R) samples from one or more model specifications.

    The specifications come from ``df`` if given, else from the CSV at
    ``input_path``, else from the single record ``input_dict``. Each record must
    provide ``sim_id``, ``method`` ('Mezcla' or 'Seleccion', case-insensitive),
    ``dist_1``/``dist_2`` with their two parameters, ``lim_inf`` and ``lim_sup``
    (and ``P_R`` for the mixture method).

    Parameters
    ----------
    input_path : path-like, optional
        CSV with one specification per row. When given, results are also written
        next to it under ``Datos_generados`` and ``Datos_observados``.
    input_dict : dict, optional
        A single specification; used only when ``df`` and ``input_path`` are None.
    df : pandas.DataFrame, optional
        Specifications already loaded in memory.
    pct : float
        Fraction of the missing rows (R == 0) flagged as followed up (S = 1) in
        the ``*_seg.csv`` outputs.
    n_muestras : int
        Number of rows kept after resampling; ten times as many are generated first.
    P_R : float or None
        Target share of observed rows (R == 1) after resampling with replacement.
        ``None`` disables the resampling step.

    Returns
    -------
    pandas.DataFrame
        Without ``input_path``: the data generated for the first specification.
        With ``input_path``: the ``Y`` and ``R`` columns of the last specification.

    Raises
    ------
    ValueError
        If no input is supplied, ``P_R`` is outside [0, 1], the method is unknown,
        a class needed for resampling is empty, or there are no specifications.
    """

    # Validate up front so a bad P_R fails before any (expensive) generation.
    if P_R is not None and not 0 <= P_R <= 1:
        raise ValueError("P_R debe estar en el intervalo [0,1]")

    if df is None:
        if input_path is not None:
            df = pd.read_csv(input_path)
        elif input_dict is not None:
            df = pd.DataFrame([input_dict])
        else:
            raise ValueError("Se requiere input_path, input_dict o df.")

    if input_path is not None:
        input_path = Path(input_path)
        output_dir = input_path.parent / 'Datos_generados'
        output_dir.mkdir(parents=True, exist_ok=True)

        observed_dir = input_path.parent / 'Datos_observados'
        observed_dir.mkdir(parents=True, exist_ok=True)

    def _limite(valor):
        # Blank CSV cells arrive as NaN; `muestreo` treats None as "no bound".
        return None if pd.isna(valor) else valor

    YR = None
    for _, row in df.iterrows():

        sim_id = row['sim_id']
        method = row['method'].lower()

        # `parametros` declares lim_inf/lim_sup as keyword-only (they follow *args).
        # Passed positionally they would be stored as param3/param4 and the
        # truncation limits would be silently dropped.
        lim_inf = _limite(row['lim_inf'])
        lim_sup = _limite(row['lim_sup'])
        dist_1 = parametros(row['dist_1'], row['dist_1_param_1'], row['dist_1_param_2'],
                            lim_inf=lim_inf, lim_sup=lim_sup)
        dist_2 = parametros(row['dist_2'], row['dist_2_param_1'], row['dist_2_param_2'],
                            lim_inf=lim_inf, lim_sup=lim_sup)

        # Oversample (x10) so that both classes have enough rows to resample from.
        if method == 'mezcla':
            YR = datos_mezcla(row['P_R'], 'Y', dist_1, dist_2, n_muestras=n_muestras*10)
        elif method == 'seleccion':
            YR = datos_seleccion(dist_1, dist_2, n_muestras=n_muestras*10)
        else:
            raise ValueError(f"Método '{row['method']}' no contemplado.")

        if P_R is not None:
            idx0 = YR.index[YR["R"] == 0]
            idx1 = YR.index[YR["R"] == 1]

            # Derive one count from the other so they always add up to n_muestras
            # (two independent int() truncations can lose a row).
            n_obs = int(round(n_muestras * P_R))
            n_aus = n_muestras - n_obs

            if (n_aus > 0 and len(idx0) == 0) or (n_obs > 0 and len(idx1) == 0):
                raise ValueError(
                    f"No hay filas suficientes con R=0 o R=1 para remuestrear '{sim_id}'."
                )

            idx_aus_sampled = np.random.choice(idx0, size=n_aus, replace=True)
            idx_obs_sampled = np.random.choice(idx1, size=n_obs, replace=True)

            # Observed rows first, then missing rows, with a fresh 0..n-1 index.
            YR = pd.concat([YR.loc[idx_obs_sampled], YR.loc[idx_aus_sampled]], ignore_index=True)

        # In-memory mode: only the first specification is generated and returned.
        if input_path is None:
            return YR

        # Attach the generating specification to every row of the data set.
        for col, value in row.items():
            YR[col] = value

        output_path = output_dir / f"{sim_id}.csv"
        YR.to_csv(output_path, index=False)

        # Follow-up indicator S: starts equal to R, then a fraction `pct` of the
        # missing rows is switched to 1.
        YR_seg = YR.copy()
        idx_R = YR_seg.columns.get_loc("R")
        YR_seg.insert(idx_R + 1, "S", YR_seg["R"])

        idx_aus = YR_seg.index[YR_seg["S"] == 0]
        len_aus = len(idx_aus)
        idx_flip = np.random.choice(idx_aus, size=int(pct * len_aus), replace=False)
        YR_seg.loc[idx_flip, "S"] = 1

        seguimiento_path = output_dir / f"{sim_id}_seg.csv"
        YR_seg.to_csv(seguimiento_path, index=False)

        YR_obs = YR.loc[YR["R"] == 1, "Y"]
        YR_obs.to_csv(observed_dir / f"{sim_id}.csv", index=False)

        YR_obs_seg = YR_seg.loc[(YR_seg["R"] == 0) & (YR_seg["S"] == 1), "Y"]
        YR_obs_seg.to_csv(observed_dir / f"{sim_id}_seg.csv", index=False)

    if YR is None:
        raise ValueError("No hay especificaciones de modelo para generar datos.")

    return YR[['Y', 'R']]

In [None]:
def _marca_seguimiento(YR, pct):
    """Add the follow-up indicator S to one data set and split out its observed values."""
    YR_seg = YR.copy()

    # S is inserted right after R and starts as a copy of it.
    idx_R = YR_seg.columns.get_loc("R")
    YR_seg.insert(idx_R + 1, "S", YR_seg["R"])

    # Switch a fraction `pct` of the missing rows (S == 0) to followed-up (S = 1).
    idx_aus = YR_seg.index[YR_seg["S"] == 0]
    idx_flip = np.random.choice(idx_aus, size=int(pct * len(idx_aus)), replace=False)
    if len(idx_flip) > 0:
        YR_seg.loc[idx_flip, "S"] = 1

    YR_obs = YR.loc[YR["R"] == 1, "Y"]
    YR_obs_seg = YR_seg.loc[(YR_seg["R"] == 0) & (YR_seg["S"] == 1), "Y"]
    return YR_seg, YR_obs, YR_obs_seg


def genera_seguimiento(YR=None, data_dir=None, pct=0.15, n_muestras=1000):
    """Simulate follow-up of missing responses for one data set or a whole directory.

    Parameters
    ----------
    YR : pandas.DataFrame, optional
        Data with at least the columns ``Y`` and ``R`` (and ``sim_id`` if results
        are written to disk). When None, every ``*.csv`` in ``data_dir`` whose
        name does not end in ``seg.csv`` is processed.
    data_dir : path-like, optional
        Directory with generated data. When given, the follow-up file is written
        there as ``{sim_id}_{pct}seg.csv`` and the observed values are written to
        the sibling directory ``Datos_observados``.
    pct : float
        Fraction of missing rows (R == 0) that get followed up. Files with the
        ``seg`` suffix are only written when ``pct > 0``.
    n_muestras : int
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    pandas.Series
        ``Y`` values of the followed-up rows (R == 0 and S == 1) of the last data
        set processed.

    Raises
    ------
    ValueError
        If neither ``YR`` nor ``data_dir`` is given.
    FileNotFoundError
        If ``data_dir`` contains no data file to process.
    """

    if YR is None and data_dir is None:
        raise ValueError("Se requiere YR o data_dir.")

    observed_dir = None
    if data_dir is not None:
        data_dir = Path(data_dir)
        observed_dir = data_dir.parent / 'Datos_observados'
        observed_dir.mkdir(parents=True, exist_ok=True)

    if YR is not None:
        frames = [YR]
    else:
        # Snapshot the listing first: this function writes new files into data_dir.
        rutas = [p for p in list(data_dir.glob("*.csv")) if not p.name.endswith("seg.csv")]
        if not rutas:
            raise FileNotFoundError(f"No se encontraron datos en {data_dir}")
        frames = (pd.read_csv(p) for p in rutas)

    YR_obs_seg = None
    # Every data set is processed and saved; previously only the last file read
    # from the directory reached this point.
    for frame in frames:
        YR_seg, YR_obs, YR_obs_seg = _marca_seguimiento(frame, pct)

        if data_dir is not None:
            sim_id = YR_seg['sim_id'].iloc[0]
            if pct > 0:
                YR_seg.to_csv(data_dir / f"{sim_id}_{pct}seg.csv", index=False)
            YR_obs.to_csv(observed_dir / f"{sim_id}.csv", index=False)
            if pct > 0:
                YR_obs_seg.to_csv(observed_dir / f"{sim_id}_{pct}seg.csv", index=False)

    return YR_obs_seg

In [None]:
def _cadena_gibbs(estimador, metodo, df_inicial, df_seg, dist_a, dist_b, dicPriors_local,
                  lims, thetas_posterior, id_base, n_iter, P_R, seguimiento, datos_dir, sufijo):
    """Run one estimate -> regenerate-data chain for a single (model, prior) pair.

    Returns a list with one record (dict) per iteration. ``thetas_posterior`` names
    the parameters read from the posterior mean; the remaining ``theta_*`` values
    are taken from ``dicPriors_local`` unchanged.
    """
    # Each chain starts from its own copy of the observed data, so the synthetic
    # data produced by a previous chain never leaks into this one.
    local_df = df_inicial.copy()
    local_obs = None
    local_aus = None
    muestras_posterior = None
    modelo_bucle = []

    for idx in range(n_iter):
        tic = time.time()
        if idx == 0:
            muestras_posterior = estimador(local_df, dist_a, dist_b, dicPriors_local,
                                           seguimiento=seguimiento, df_seg=df_seg)
        elif (seguimiento == 1) and (not local_obs.empty) and (not local_aus.empty):
            # Follow-up variant: observed part plus the regenerated missing part.
            muestras_posterior = estimador(local_obs, dist_a, dist_b, dicPriors_local, muestras_posterior,
                                           seguimiento=1, df_seg=local_aus)
        else:
            muestras_posterior = estimador(local_df, dist_a, dist_b, dicPriors_local, muestras_posterior,
                                           seguimiento=0)
        tac = time.time()

        posterior_df = az.extract(muestras_posterior, group="posterior").to_dataframe()
        theta = [posterior_df[clave].mean() if clave in thetas_posterior else dicPriors_local[clave]
                 for clave in ("theta_0", "theta_1", "theta_2", "theta_3")]

        modelo_local = {"sim_id": f"{id_base}_gibbs_{idx}",
                        "method": metodo,
                        "P_R": P_R,
                        "dist_1": dist_a,
                        "dist_1_param_1": theta[0],
                        "dist_1_param_2": theta[1],
                        "dist_2": dist_b,
                        "dist_2_param_1": theta[2],
                        "dist_2_param_2": theta[3],
                        "lim_inf": lims[0],
                        "lim_sup": lims[1],
                        "time": tac - tic}
        modelo_bucle.append(modelo_local)

        # Regenerate a complete data set from the current estimate; it feeds the
        # next iteration of this chain.
        local_df = genera_datos_gibbs(input_dict=modelo_local, P_R=P_R)
        local_obs = local_df.loc[local_df["R"] == 1, ['Y']]
        local_aus = genera_seguimiento(YR=local_df, pct=1, n_muestras=1000).to_frame()

        local_obs.to_csv(datos_dir / f'{id_base}_gibbs{sufijo}_obs_{idx}.csv', index=False)
        local_aus.to_csv(datos_dir / f'{id_base}_gibbs{sufijo}_aus_{idx}.csv', index=False)

    return modelo_bucle


def Estima_thetas_gibbs(dir_sim, label, n_iter=1, idx_iter=None, P_R=0.5, seguimiento=0, df_seg=None):
    """Estimate model parameters by alternating estimation and data regeneration.

    For every observed data file in ``dir_sim / 'Datos_observados'`` and every
    (model, prior) hypothesis read from ``Hip_{label}.csv``, a chain of ``n_iter``
    iterations is run and the last iteration of each chain is collected.

    Parameters
    ----------
    dir_sim : pathlib.Path
        Simulation directory containing ``Hip_{label}.csv`` and ``Datos_observados``.
    label : str
        Scenario label used in the hypothesis and output file names.
    n_iter : int
        Iterations per chain.
    idx_iter : int or None
        Zero-based run index. When given, outputs go under ``Ejecuciones/`` and
        file names get the suffix ``_{idx_iter+1}``.
    P_R : float
        Share of observed rows in each regenerated data set.
    seguimiento : int
        1 to use follow-up data (loaded from ``Datos_observados``), 0 otherwise.
    df_seg : optional
        Currently ignored: the follow-up data is always loaded per data file.
        Kept so existing calls remain valid.

    Returns
    -------
    pandas.DataFrame
        One row per chain with the estimates of its final iteration. Also written
        to ``Estim_{label}_gibbs_{n_iter}_iteraciones{suffix}.csv``.

    Raises
    ------
    FileNotFoundError
        If ``seguimiento == 1`` and a data file has no follow-up file.
    ValueError
        If no chain produced any result.
    """

    hipotesis = pd.read_csv(dir_sim / f"Hip_{label}.csv")
    orig_dir = dir_sim / "Datos_observados"
    assumed_dists_m, chosen_priors_m, lims_m = get_hypothesis(hipotesis, "Mezcla")
    assumed_dists_s, chosen_priors_s, lims_s = get_hypothesis(hipotesis, "Seleccion")

    ind_seg = "_seg" if seguimiento == 1 else ""
    ind_iter = f"_{idx_iter+1}" if idx_iter is not None else ""

    if idx_iter is None:
        gibbs_dir = dir_sim / f"Estimaciones_theta_gibbs{ind_seg}"
    else:
        gibbs_dir = dir_sim / f"Ejecuciones/Estimaciones_theta_gibbs{ind_seg}"
    gibbs_dir.mkdir(parents=True, exist_ok=True)

    inter_dir = gibbs_dir / f"Estimaciones_intermedias{ind_seg}"
    inter_dir.mkdir(parents=True, exist_ok=True)

    datos_dir = gibbs_dir / f"Datos_generados_gibbs{ind_seg}"
    datos_dir.mkdir(parents=True, exist_ok=True)

    # (filter key, estimator, method label, distributions, priors, limits,
    #  parameters taken from the posterior). The mixture model only re-estimates
    # theta_0/theta_1; theta_2/theta_3 keep the values given in the priors.
    familias = (
        ("mezcla", EstimaMezcla, "Mezcla", assumed_dists_m, chosen_priors_m, lims_m,
         ("theta_0", "theta_1")),
        ("selec", EstimaSeleccion, "Seleccion", assumed_dists_s, chosen_priors_s, lims_s,
         ("theta_0", "theta_1", "theta_2", "theta_3")),
    )

    modelos = []

    for path in orig_dir.glob("*.csv"):

        if path.name.endswith("seg.csv"):
            continue

        df = pd.read_csv(path)
        name = path.stem

        if seguimiento == 1:
            # Sorted so the chosen follow-up file does not depend on directory order.
            seg_candidates = sorted(orig_dir.glob(f"{name}_*seg.csv"))
            if not seg_candidates:
                raise FileNotFoundError(f"No se encontro seguimiento para {name} en {orig_dir}")
            df_seg_local = pd.read_csv(seg_candidates[0])
        else:
            df_seg_local = None

        # Data sets named Pm*/Ps* are only fitted with their own method and with
        # hypotheses whose sim_id contains the matching tag.
        allowed_methods = None
        allowed_model_tag = None
        if name.startswith("Pm"):
            allowed_methods = {"mezcla"}
            allowed_model_tag = "M00"
        elif name.startswith("Ps"):
            allowed_methods = {"selec"}
            allowed_model_tag = "S00"

        for clave, estimador, metodo, dists, priors, lims_all, thetas_posterior in familias:
            if allowed_methods is not None and clave not in allowed_methods:
                continue

            for ((dist_a, dist_b), priors_list, lims) in zip(dists, priors, lims_all):
                for dicPriors in priors_list:

                    dicPriors_local = dicPriors.copy()
                    sim_id = dicPriors_local.pop("sim_id")
                    if allowed_model_tag and allowed_model_tag not in str(sim_id):
                        continue

                    modelo_bucle = _cadena_gibbs(
                        estimador, metodo, df, df_seg_local, dist_a, dist_b, dicPriors_local,
                        lims, thetas_posterior, f"{name}_{sim_id}", n_iter, P_R, seguimiento,
                        datos_dir, f"{ind_seg}{ind_iter}",
                    )

                    pd.DataFrame(modelo_bucle).to_csv(
                        inter_dir / f'{name}_{sim_id}{ind_seg}{ind_iter}.csv', index=False)

                    # Only the final iteration of each chain goes into the summary.
                    if modelo_bucle:
                        modelos.append(modelo_bucle[-1])

    modelos_df = pd.DataFrame(modelos)
    if modelos_df.empty:
        raise ValueError("No se generaron modelos Gibbs; revisa filtros de metodo y sim_id.")

    modelos_df.to_csv(gibbs_dir / f"Estim_{label}_gibbs_{n_iter}_iteraciones{ind_iter}.csv", index=False)

    return modelos_df

## Iterativo

In [None]:
def resultados_to_csv(resultados, out_path, copy_fields, mean_fields):
    """Collapse the results of several runs into one row per ``sim_id`` and save them.

    ``resultados`` is a list of DataFrames (one per run). Columns in ``mean_fields``
    are averaged across runs; columns in ``copy_fields`` keep the value of the first
    run. The output CSV has the columns ``sim_id``, then ``copy_fields``, then
    ``mean_fields``.
    """
    # Named aggregations; a field listed in both groups ends up as "first".
    agregaciones = {campo: (campo, "mean") for campo in mean_fields}
    agregaciones.update({campo: (campo, "first") for campo in copy_fields})

    todas_las_ejecuciones = pd.concat(resultados, ignore_index=True)
    por_modelo = todas_las_ejecuciones.groupby("sim_id", as_index=False).agg(**agregaciones)

    columnas = ["sim_id"] + copy_fields + mean_fields
    por_modelo[columnas].to_csv(out_path, index=False)

In [None]:
def Estima_iterativo(dir, label, n_ejecuciones, n_iteraciones, base=1, seguimiento=1, updatepriors=1, gibbs=1, seg_updatepriors=1, seg_gibbs=1, P_R=0, pct=0.15):
  """Run every enabled estimation strategy ``n_ejecuciones`` times and summarise each.

  A strategy runs only when its flag equals 1. For each enabled strategy the
  per-run results are aggregated by ``resultados_to_csv`` into
  ``Simulaciones_{label}/Ejecuciones/Resumen_{n_ejecuciones}_ejecuciones*.csv``.
  ``n_iteraciones`` is forwarded as ``n_iter`` to the updatepriors and gibbs
  strategies, ``pct`` to the follow-up strategy and ``P_R`` to the gibbs ones.
  """
  dir_sim = Path(dir) / f"Simulaciones_{label}"
  dir_iter = dir_sim / "Ejecuciones"
  for carpeta in (dir_sim, dir_iter):
    carpeta.mkdir(parents=True, exist_ok=True)

  mean_fields = ["dist_1_param_1","dist_1_param_2","dist_2_param_1","dist_2_param_2","lim_inf","lim_sup","time"]
  copy_fields = ["method","P_R","dist_1","dist_2"]

  # (flag, progress message, summary-file suffix, one-run callable), in run order.
  # The lambdas defer the lookup of each estimator until its stage actually runs.
  etapas = [
    (base, 'Estimando thetas', '',
     lambda i: Estima_thetas(dir_sim, label, idx_iter=i)),
    (seguimiento, 'Estimando thetas con seguimiento', '_seguimiento',
     lambda i: Estima_thetas_seguimiento(dir_sim, label, idx_iter=i, pct=pct)),
    (updatepriors, 'Estimando thetas con updatepriors', '_updatepriors',
     lambda i: Estima_thetas_updatepriors(dir_sim, label, n_iter=n_iteraciones, idx_iter=i)),
    (seg_updatepriors, 'Estimando thetas con updatepriors y datos de seguimiento', '_updatepriors_seg',
     lambda i: Estima_thetas_updatepriors(dir_sim, label, n_iter=n_iteraciones, idx_iter=i, seguimiento=1)),
    (gibbs, 'Estimando thetas con gibbs', '_gibbs',
     lambda i: Estima_thetas_gibbs(dir_sim, label, n_iter=n_iteraciones, idx_iter=i, P_R=P_R)),
    (seg_gibbs, 'Estimando thetas con gibbs y datos de seguimiento', '_gibbs_seg',
     lambda i: Estima_thetas_gibbs(dir_sim, label, n_iter=n_iteraciones, seguimiento=1, idx_iter=i, P_R=P_R)),
  ]

  for activa, mensaje, sufijo, ejecuta in etapas:
    if activa != 1:
      continue
    resumen = []
    for i in range(n_ejecuciones):
      print(f'{mensaje} - Ejecución {i+1}')
      resumen.append(ejecuta(i))
    resultados_to_csv(resumen, dir_iter / f'Resumen_{n_ejecuciones}_ejecuciones{sufijo}.csv', copy_fields, mean_fields)

# Análisis

In [None]:
def tabla_0(df_model, dicGen, row):
  """Build the parameter-error table for models fitted with the generating method.

  Parameters
  ----------
  df_model : pandas.DataFrame
      Summary rows with the columns ``model_method``, ``gen_method``, ``gen_data``,
      ``ID_modelo``, ``ID_prior``, ``time`` and the estimated ``dist_*_param_*``.
  dicGen : dict
      Maps a data-set id (the ``gen_data`` value) to the dict of generating
      parameters (``method``, ``dist_1_param_1``, ...).
  row : pandas.Series
      Current summary row; only ``row["ID_modelo"]`` is read.

  Returns
  -------
  pandas.DataFrame or None
      One row per matching model with absolute and relative errors of theta_0 and
      theta_1 (plus theta_2 and theta_3 when the generating method is
      'Seleccion'), or None when no row has ``model_method == gen_method``.

  Notes
  -----
  Relative errors are measured against the true (generating) value, the same
  convention used by ``compute_mean_std_err``; they are reported as 0 when the
  true value is 0. For theta_2/theta_3 they are additionally reported as 0
  unless ``row["ID_modelo"] == "S0"``.
  """
  df_thetas = df_model.loc[df_model["model_method"] == df_model["gen_method"]]
  if df_thetas.empty:
      return None

  # The generating parameters are looked up using the first matching row.
  orig_method = df_thetas["gen_data"].astype(str).iloc[0]
  dicData = dicGen[orig_method]

  def _errores(verdadero, estimado, activo=True):
      # Absolute error per row, and relative error w.r.t. the true value.
      err_abs = (verdadero - estimado).abs()
      if activo and verdadero != 0:
          err_rel = err_abs / abs(verdadero)
      else:
          err_rel = pd.Series(0.0, index=estimado.index)
      return err_abs, err_rel

  tabla0 = pd.DataFrame({'Dataset': orig_method,
                         'model': df_thetas['ID_modelo'],
                         'priors': df_thetas['ID_prior'],
                         't_ejecucion': df_thetas['time']})

  err_abs_theta_0, err_rel_theta_0 = _errores(dicData['dist_1_param_1'], df_thetas["dist_1_param_1"])
  err_abs_theta_1, err_rel_theta_1 = _errores(dicData['dist_1_param_2'], df_thetas["dist_1_param_2"])
  tabla0 = tabla0.assign(err_abs_theta_0=err_abs_theta_0, err_rel_theta_0=err_rel_theta_0,
                         err_abs_theta_1=err_abs_theta_1, err_rel_theta_1=err_rel_theta_1)

  if dicData["method"] == 'Seleccion':
    es_s0 = row["ID_modelo"] == "S0"
    err_abs_theta_2, err_rel_theta_2 = _errores(dicData['dist_2_param_1'], df_thetas["dist_2_param_1"], activo=es_s0)
    err_abs_theta_3, err_rel_theta_3 = _errores(dicData['dist_2_param_2'], df_thetas["dist_2_param_2"], activo=es_s0)
    tabla0 = tabla0.assign(err_abs_theta_2=err_abs_theta_2, err_rel_theta_2=err_rel_theta_2,
                           err_abs_theta_3=err_abs_theta_3, err_rel_theta_3=err_rel_theta_3)

  return tabla0

In [None]:
def compute_mean_std_err(media, std, n_muestras, t_ejec, orig_media, orig_std, id):
    """Return one table record comparing a mean and a std against reference values.

    The record holds the identifier, the execution time, the mean and std with
    their absolute and relative errors, and the sample size. Relative errors are
    measured against the reference value and are 0.0 when that reference is 0.
    """

    def _abs_y_rel(valor, referencia):
        absoluto = abs(valor - referencia)
        if referencia != 0:
            return absoluto, absoluto / abs(referencia)
        return absoluto, 0.0

    err_abs_media, err_rel_media = _abs_y_rel(media, orig_media)
    err_abs_std, err_rel_std = _abs_y_rel(std, orig_std)

    registro = {"ID": id, "t_ejecucion": t_ejec}
    registro.update({"media": media, "err_abs_media": err_abs_media, "err_rel_media": err_rel_media})
    registro.update({"std": std, "err_abs_std": err_abs_std, "err_rel_std": err_rel_std})
    registro["n_muestras"] = n_muestras
    return registro

In [None]:
def tabla_1(data_props, row, pct_seg=0.00, n_muestras=1000, n_iter=100):
    """Compare data regenerated from a fitted model against the original data set.

    ``data_props`` maps ``<id>``, ``<id>_obs`` and ``<id>_aus`` to dicts with the
    keys ``media`` and ``std``; ``<id>`` is the first two characters of
    ``row["sim_id"]``. The model in ``row`` is used to generate ``n_iter`` data
    sets via ``genera_datos``; the mean, std and size of Y, of the observed part
    (R == 1) and of the missing part (R == 0) are averaged over those data sets
    and compared with the reference values.

    Returns a DataFrame indexed by ``ID`` with three rows (Y, Yobs, Yaus).
    """
    props = pd.DataFrame.from_dict(data_props, orient="index")

    orig_method = str(row["sim_id"])[0:2]
    subconjuntos = ("Y", "Yobs", "Yaus")
    clave_ref = {"Y": orig_method, "Yobs": f'{orig_method}_obs', "Yaus": f'{orig_method}_aus'}
    # (reference mean, reference std) for each subset.
    referencia = {s: (props.loc[clave_ref[s], "media"], props.loc[clave_ref[s], "std"])
                  for s in subconjuntos}

    medias = {s: [] for s in subconjuntos}
    stds = {s: [] for s in subconjuntos}
    lens = {s: [] for s in subconjuntos}

    for _ in range(n_iter):
        generados = genera_datos(df=row.to_frame().T, n_muestras=n_muestras)
        partes = {"Y": generados["Y"],
                  "Yobs": generados.loc[generados["R"] == 1, "Y"],
                  "Yaus": generados.loc[generados["R"] == 0, "Y"]}
        for s in subconjuntos:
            medias[s].append(partes[s].mean())
            stds[s].append(partes[s].std())
            lens[s].append(len(partes[s]))

    t_ejec = row["time"]
    pct_seg_str = f"{float(pct_seg):.2f}"

    # The reported std is the average of the per-data-set stds.
    rows = [compute_mean_std_err(np.mean(medias[s]), np.mean(stds[s]), np.mean(lens[s]), t_ejec,
                                 referencia[s][0], referencia[s][1],
                                 f"{row['ID']}_{s}_{pct_seg_str}_seg")
            for s in subconjuntos]

    return pd.DataFrame(rows).set_index("ID")

In [None]:
def Analiza(dir, resumen_path, out_dir, label, pct_seg=0.00):
    """Build the analysis tables for one summary file of estimated models.

    Parameters
    ----------
    dir : path-like
        Root directory containing ``Simulaciones_{label}``.
    resumen_path : path-like
        Summary CSV with one row per fitted model.
    out_dir : path-like
        Directory where the tables are written (created if missing).
    label : str
        Scenario label; selects ``Simulaciones_{label}/Gen_{label}.csv`` and
        ``Simulaciones_{label}/Datos_generados``.
    pct_seg : float
        Follow-up fraction recorded in the tables. It is only used when the
        summary file name contains "seguimiento" or "_seg"; otherwise 0.00.

    Writes ``{id_model}_tabla1.csv`` per (method, model id) group,
    ``{id_model}_tabla0.csv`` for the model ids "S0"/"M0" when ``tabla_0`` returns
    a table, and ``tabla_datos.csv`` with the statistics of the original data.

    Raises
    ------
    FileNotFoundError
        If the generation file, the data directory or the summary file is missing.
    """

    path = Path(dir)
    dir_sim = path / f"Simulaciones_{label}"
    gen_path = dir_sim / f"Gen_{label}.csv"
    data_dir = dir_sim / "Datos_generados"
    resumen_path = Path(resumen_path)
    tablas_dir = Path(out_dir)

    for p in [gen_path, data_dir, resumen_path]:
        if not p.exists():
            raise FileNotFoundError(f"{p} not found")

    tablas_dir.mkdir(parents=True, exist_ok=True)

    resumen_name = resumen_path.name.lower()
    es_seguimiento = ("seguimiento" in resumen_name) or ("_seg" in resumen_name)
    pct_seg_val = float(pct_seg) if es_seguimiento else 0.00
    pct_seg_str = f"{pct_seg_val:.2f}"

    # Mean/std/size of Y for every original data set: whole, observed, missing.
    data_props = {}
    for csv_file in data_dir.glob("*.csv"):
        # Skip follow-up files. Besides "<id>_seg.csv" this also covers names such
        # as "<id>_0.2seg.csv", which do not contain "_seg" in their stem.
        if "_seg" in csv_file.stem or csv_file.name.endswith("seg.csv"):
            continue

        df = pd.read_csv(csv_file)

        datos = df["Y"]
        datos_obs = df.loc[df["R"] == 1, "Y"]
        datos_aus = df.loc[df["R"] == 0, "Y"]

        data_props[f'{csv_file.stem}'] = {"n_muestras":len(datos), "media": datos.mean(),"std": datos.std()}
        data_props[f'{csv_file.stem}_obs'] = {"n_muestras":len(datos_obs), "media": datos_obs.mean(),"std": datos_obs.std()}
        data_props[f'{csv_file.stem}_aus'] = {"n_muestras":len(datos_aus), "media": datos_aus.mean(),"std": datos_aus.std()}

    gen_csv = pd.read_csv(gen_path)
    dicGen = {row["sim_id"]: row.to_dict() for _, row in gen_csv.iterrows()}

    # Decode the identifiers packed into sim_id by character position.
    resumen = pd.read_csv(resumen_path)
    resumen["model_method"] = resumen["method"]
    resumen["gen_data"]   = resumen["sim_id"].astype(str).str[0:2]
    resumen["gen_method"] = resumen["sim_id"].astype(str).str[1:2].map({"m": "Mezcla", "s": "Seleccion"})
    resumen["ID_modelo"] = resumen["sim_id"].astype(str).str[3:5]
    resumen["ID_prior"]  = resumen["sim_id"].astype(str).str[5:6]
    resumen["ID"]        = resumen["sim_id"].astype(str).str[0:6]

    for (method, id_model), df_model in resumen.groupby(["model_method", "ID_modelo"]):

        df_model = df_model.reset_index(drop=True)

        # tabla_0 depends on the group only (every row shares ID_modelo), so it is
        # computed and written once per group instead of once per row.
        if id_model in ("S0", "M0"):
            tabla0 = tabla_0(df_model, dicGen, df_model.iloc[0])
            if tabla0 is not None:
                tabla0.insert(1, "pct_seg", pct_seg_str)
                tabla0.to_csv(tablas_dir / f"{id_model}_tabla0.csv", index=False)

        tablas = [tabla_1(data_props, row, pct_seg_str) for _, row in df_model.iterrows()]

        tabla1 = pd.concat(tablas, ignore_index=False)
        tabla1.to_csv(tablas_dir / f"{id_model}_tabla1.csv", index=True)

    data_props = pd.DataFrame.from_dict(data_props, orient="index")
    data_props.to_csv(tablas_dir / "tabla_datos.csv", index=True)

    return

# Ejecución

In [None]:
def ejecuta_completo(dir, label, n_ejecuciones=1, n_iteraciones=1, genera=0, pct=0.15, base=1, seguimiento = 1, updatepriors=1, gibbs=1, seg_updatepriors=1, seg_gibbs=1, P_R=0):
  """Run the whole pipeline for one scenario: generate, estimate and analyse.

  Parameters
  ----------
  dir : path-like
      Root directory containing (or that will contain) ``Simulaciones_{label}``.
  label : str
      Scenario label.
  n_ejecuciones, n_iteraciones : int
      Forwarded to ``Estima_iterativo``.
  genera : int
      1 to generate the data from ``Simulaciones_{label}/Gen_{label}.csv`` first.
  pct : float
      Follow-up fraction, forwarded to data generation, estimation and analysis.
  base, seguimiento, updatepriors, gibbs, seg_updatepriors, seg_gibbs : int
      Strategy flags forwarded to ``Estima_iterativo``.
  P_R : float
      Forwarded to ``Estima_iterativo``.

  Every ``*.csv`` found directly in ``Simulaciones_{label}/Ejecuciones`` is then
  analysed into ``Simulaciones_{label}/Analisis/<file name without "Resumen_">``.
  """
  import inspect

  sim_dir = Path(dir) / f"Simulaciones_{label}"
  datos_path = sim_dir / f"Gen_{label}.csv"

  if genera == 1:
    genera_datos(datos_path, pct=pct)
    print('Datos generados')

  iter_dir = sim_dir / "Ejecuciones"
  iter_dir.mkdir(parents=True, exist_ok=True)

  # Older definitions of Estima_iterativo/Analiza may not accept pct/pct_seg.
  # Inspect the signature instead of catching TypeError: catching could swallow a
  # genuine TypeError raised deep inside the run and silently repeat all the work.
  opciones = dict(base=base, seguimiento=seguimiento, updatepriors=updatepriors, gibbs=gibbs,
                  seg_updatepriors=seg_updatepriors, seg_gibbs=seg_gibbs, P_R=P_R)
  if "pct" in inspect.signature(Estima_iterativo).parameters:
    opciones["pct"] = pct
  Estima_iterativo(dir, label, n_ejecuciones, n_iteraciones, **opciones)

  analisis_dir = sim_dir/"Analisis"
  analisis_dir.mkdir(parents=True, exist_ok=True)

  analiza_acepta_pct = "pct_seg" in inspect.signature(Analiza).parameters

  for path in iter_dir.glob("*.csv"):
    resumen_path = str(path)
    # The output directory keeps the file name (including ".csv") minus "Resumen_".
    out_path = str(path.name).replace("Resumen_", "")
    out_dir = analisis_dir / out_path
    out_dir.mkdir(parents=True, exist_ok=True)

    if analiza_acepta_pct:
      Analiza(dir, resumen_path, out_dir, label, pct_seg=pct)
    else:
      Analiza(dir, resumen_path, out_dir, label)

# Simulaciones

In [None]:
# Scenario 'A': regenerate the data (genera=1), then run every estimation
# strategy 10 times with 10 iterations each and analyse the summaries.
dir = '/content/drive/MyDrive/TFM/Simulaciones'
ejecuta_completo(
    dir,
    label='A',
    n_ejecuciones=10,
    n_iteraciones=10,
    genera=1,
    pct=0.20,
    P_R=0.5,
)

In [None]:
# Scenario 'B': regenerate the data (genera=1), then run every estimation
# strategy 10 times with 10 iterations each and analyse the summaries.
dir = '/content/drive/MyDrive/TFM/Simulaciones'
ejecuta_completo(
    dir,
    label='B',
    n_ejecuciones=10,
    n_iteraciones=10,
    genera=1,
    pct=0.20,
    P_R=0.5,
)