In [8]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.special import logit
from statsmodels.genmod.families import Binomial, Gaussian
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from scipy import stats  # For norm.cdf
import warnings

# Ignore convergence warnings for cleaner output
warnings.simplefilter('ignore', ConvergenceWarning)

# ------------ .verify_args ------------------
def verify_args(Y, A, W, Delta):
    """
    Verify that the input arguments Y, A, W, Delta are valid.

    Parameters:
    - Y: Outcome vector
    - A: Treatment indicator vector
    - W: Covariate matrix/DataFrame
    - Delta: Missingness indicator vector

    Returns:
    - Boolean indicating if arguments are valid
    """
    ok1 = len(Y) == len(A) and len(A) == W.shape[0]
    ok2 = np.all(np.isin(A[~np.isnan(A)], [0, 1]))
    if not ok1:
        print("Warning: 'Y', 'A', 'W' must contain the same number of observations")
    if not ok2:
        print("Warning: 'A' must be binary (0,1)")
    return ok1 and ok2

# ----------- .set_DSAargs ----------------
def set_DSAargs(DSAargs, wts):
    """
    Set default DSA arguments if they are not provided.

    Parameters:
    - DSAargs: Dictionary of DSA arguments
    - wts: Weights vector

    Returns:
    - Updated DSAargs dictionary
    """
    if 'maxsumofpow' not in DSAargs or DSAargs['maxsumofpow'] is None:
        DSAargs['maxsumofpow'] = 2
    if 'maxorderint' not in DSAargs or DSAargs['maxorderint'] is None:
        DSAargs['maxorderint'] = 2
    if 'maxsize' not in DSAargs or DSAargs['maxsize'] is None:
        DSAargs['maxsize'] = 15  # Arbitrary limit
    if 'Dmove' not in DSAargs or DSAargs['Dmove'] is None:
        DSAargs['Dmove'] = False
    if 'Smove' not in DSAargs or DSAargs['Smove'] is None:
        DSAargs['Smove'] = False
    if 'vfold' not in DSAargs or DSAargs['vfold'] is None:
        DSAargs['vfold'] = 5
    if 'formula' not in DSAargs or DSAargs['formula'] is None:
        DSAargs['formula'] = 'Y ~ A'
    if 'family' not in DSAargs or DSAargs['family'] is None:
        DSAargs['family'] = 'gaussian'
    if 'wts' not in DSAargs or DSAargs['wts'] is None:
        DSAargs['wts'] = np.tile(wts, (DSAargs['vfold'] + 1, 1))
    if 'nsplits' not in DSAargs or DSAargs['nsplits'] is None:
        DSAargs['nsplits'] = 1
    if 'silent' not in DSAargs or DSAargs['silent'] is None:
        DSAargs['silent'] = -1
    return DSAargs

# ----------- function logit_trunc ---------
def logit_trunc(x):
    """
    Convert probabilities to logit scale with truncation to avoid infinities.

    Parameters:
    - x: Array-like probabilities

    Returns:
    - Logit-transformed array
    """
    x = np.clip(x, 0, 1)
    # Avoid log(0) by setting extreme values to a small epsilon
    epsilon = 1e-10
    x = np.clip(x, epsilon, 1 - epsilon)
    return np.log(x / (1 - x))

# ----------- estimate_Q ----------------
def estimate_Q(Q, DSAargs, Y, A, W, Delta, family, wts, id):
    """
    Estimate the Q function (E[Y | A, W], E[Y | A=1, W], E[Y | A=0, W]).

    Parameters:
    - Q: Either a matrix of values, a formula string, or None
    - DSAargs: Dictionary of DSA arguments
    - Y: Outcome vector
    - A: Treatment indicator vector
    - W: Covariate matrix/DataFrame
    - Delta: Missingness indicator vector
    - family: Family for GLM ('gaussian' or 'binomial')
    - wts: Weights vector
    - id: ID vector for repeated measures

    Returns:
    - Dictionary containing Q matrix, coefficients, and model type
    """
    print("Estimating Q...")
    m = None
    coef = np.nan
    if isinstance(Q, np.ndarray) and Q.ndim == 2:
        if family == 'binomial':
            Q = logit_trunc(Q)
        # coef remains NA
    else:
        if Q is None:
            # DSA not available, using GLM with main terms
            print("DSA not found, running main terms regression for 'Q' using GLM")
            # Create DataFrame
            df = pd.DataFrame(np.column_stack((Y, A, W)), columns=['Y', 'A'] + [f'W{i+1}' for i in range(W.shape[1])])
            df = df[Delta == 1]
            weights = wts[Delta == 1]
            # Create formula 'Y ~ A + W1 + W2 + W3'
            formula = 'Y ~ ' + ' + '.join(df.columns[2:])
            if family == 'binomial':
                fam = Binomial()
            else:
                fam = Gaussian()
            # Fit GLM with freq_weights
            model = sm.GLM.from_formula(formula, data=df, family=fam, freq_weights=weights)
            m = model.fit()
        else:
            # Q is a formula
            try:
                formula = Q
                print(f"Using user-supplied formula for Q: {formula}")
                df = pd.DataFrame(np.column_stack((Y, A, W)), columns=['Y', 'A'] + [f'W{i+1}' for i in range(W.shape[1])])
                df = df[Delta == 1]
                weights = wts[Delta == 1]
                if family == 'binomial':
                    fam = Binomial()
                else:
                    fam = Gaussian()
                model = sm.GLM.from_formula(formula, data=df, family=fam, freq_weights=weights)
                m = model.fit()
            except Exception as e:
                print("Warning: invalid formula supplied, running main terms regression for 'Q' using GLM")
                df = pd.DataFrame(np.column_stack((Y, A, W)), columns=['Y', 'A'] + [f'W{i+1}' for i in range(W.shape[1])])
                df = df[Delta == 1]
                weights = wts[Delta == 1]
                # Create main terms formula
                formula = 'Y ~ ' + ' + '.join(df.columns[2:])
                if family == 'binomial':
                    fam = Binomial()
                else:
                    fam = Gaussian()
                model = sm.GLM.from_formula(formula, data=df, family=fam, freq_weights=weights)
                m = model.fit()
        # Now predict QAW, Q1W, Q0W
        # Create a full DataFrame with original Y, A, W
        df_new = pd.DataFrame(np.column_stack((Y, A, W)), columns=['Y', 'A'] + [f'W{i+1}' for i in range(W.shape[1])])
        QAW = m.predict(df_new)
        # Create df with A=1
        df_new_A1 = df_new.copy()
        df_new_A1['A'] = 1
        Q1W = m.predict(df_new_A1)
        # Create df with A=0
        df_new_A0 = df_new.copy()
        df_new_A0['A'] = 0
        Q0W = m.predict(df_new_A0)
        # Combine into matrix
        Q = np.column_stack((QAW, Q1W, Q0W))
        coef = m.params
    print("Q estimation completed.")
    return {'Q': Q, 'coef': coef, 'type': type(m).__name__ if m else 'None'}

# ----------- estimate_g ----------------
def estimate_g(g, DSAargs, A, W, Delta, wts, id):
    """
    Estimate the treatment mechanism g_A or missingness mechanism g_M.

    Parameters:
    - g: Either a numeric vector, a formula string, or None
    - DSAargs: Dictionary of DSA arguments
    - A: Treatment indicator vector (for g_A) or Delta vector (for g_M)
    - W: Covariate matrix/DataFrame
    - Delta: Missingness indicator vector
    - wts: Weights vector
    - id: ID vector for repeated measures

    Returns:
    - Dictionary containing g1W probabilities, coefficients, and model type
    """
    print("Estimating g...")
    m = None
    coef = np.nan
    if not isinstance(g, np.ndarray):
        if np.all(A == A[0]):
            g1W = np.ones(len(A))
            coef = np.nan
        else:
            if g is None:
                # DSA not available, using GLM with main terms
                print("DSA not found, running main terms regression for 'g' using GLM")
                # Create DataFrame
                df = pd.DataFrame(np.column_stack((A, W)), columns=['A'] + [f'W{i+1}' for i in range(W.shape[1])])
                df = df[Delta == 1]
                weights = wts[Delta == 1]
                # Create formula 'A ~ W1 + W2 + W3'
                formula = 'A ~ ' + ' + '.join(df.columns[1:])
                # Fit GLM with binomial family and freq_weights
                model = sm.GLM.from_formula(formula, data=df, family=Binomial(), freq_weights=weights)
                m = model.fit()
            else:
                # g is a formula
                try:
                    formula = g
                    print(f"Using user-supplied formula for g: {formula}")
                    df = pd.DataFrame(np.column_stack((A, W)), columns=['A'] + [f'W{i+1}' for i in range(W.shape[1])])
                    df = df[Delta == 1]
                    weights = wts[Delta == 1]
                    model = sm.GLM.from_formula(formula, data=df, family=Binomial(), freq_weights=weights)
                    m = model.fit()
                except Exception as e:
                    print("Warning: invalid formula supplied, running main terms regression for 'g' using GLM")
                    df = pd.DataFrame(np.column_stack((A, W)), columns=['A'] + [f'W{i+1}' for i in range(W.shape[1])])
                    df = df[Delta == 1]
                    weights = wts[Delta == 1]
                    # Create main terms formula 'A ~ W1 + W2 + W3'
                    formula = 'A ~ ' + ' + '.join(df.columns[1:])
                    model = sm.GLM.from_formula(formula, data=df, family=Binomial(), freq_weights=weights)
                    m = model.fit()
            # Now predict g1W on all data
            df_new = pd.DataFrame(np.column_stack((A, W)), columns=['A'] + [f'W{i+1}' for i in range(W.shape[1])])
            g1W = m.predict(df_new)
            coef = m.params
    else:
        g1W = g
        coef = np.nan
    print("g estimation completed.")
    return {'g1W': g1W, 'coef': coef, 'type': type(m).__name__ if m else 'None'}

# ------------------------------- tmle ----------------------------------------
def tmle(Y, A, W, Delta=None, id=None, Q=None, g_A=None, g_M=None, wts=None, DSAargs=None, family='gaussian', epsilon=None):
    """
    Estimate the marginal treatment effect using TMLE.

    Parameters:
    - Y: Outcome vector
    - A: Binary treatment indicator vector (1-treatment, 0-control)
    - W: Covariate matrix/DataFrame
    - Delta: Indicator of missing outcome or treatment assignment (1-observed, 0-missing)
    - id: ID vector identifying repeated measures
    - Q: Either a matrix of Q-values, a formula string, or None
    - g_A: Either a vector of P(A=1 | W), a formula string, or None
    - g_M: Either a vector of P(Delta=1 | W), a formula string, or None
    - wts: Weights vector
    - DSAargs: Dictionary of DSA arguments
    - family: Family for GLM ('gaussian' or 'binomial')
    - epsilon: Epsilon value for targeting step (optional)

    Returns:
    - Dictionary containing TMLE estimates and related information
    """
    print("\nStarting TMLE computation...")
    if Delta is None:
        Delta = np.ones(len(Y))
    if id is None:
        id = np.arange(len(Y))
    if wts is None:
        wts = np.ones(len(Y))
    if DSAargs is None:
        DSAargs = {}
    psi_tmle = varIC = CI = pvalue = np.nan
    W = np.asarray(W)
    if verify_args(Y, A, W, Delta):
        # Estimate Q
        Q_est = estimate_Q(Q, DSAargs, Y, A, W, Delta, family, wts, id)
        # Set DSAargs formula to 'A ~ 1' for g_A estimation
        DSAargs['formula'] = 'A ~ 1'
        # Estimate g_A
        g_A_est = estimate_g(g_A, DSAargs, A, W, Delta, wts, id)
        # Estimate g_M, which is P(Delta=1 | W)
        # In R code, A is set to Delta, and Delta is set to 1
        # So in Python, set A parameter to Delta and Delta parameter to ones
        g_M_est = estimate_g(g_M, DSAargs, A=Delta, W=W, Delta=np.ones(W.shape[0]), wts=wts, id=id)
        g1W = g_A_est['g1W']
        # Calculate h, h1W, h0W
        h = h1W = (1 / g1W) * (Delta / g_M_est['g1W'])
        h0W = (-1 / (1 - g1W)) * (Delta / g_M_est['g1W'])
        h[A == 0] = h0W[A == 0]
        if epsilon is None:
            print("Calculating epsilon...")
            # Create DataFrame for epsilon estimation
            df = pd.DataFrame({'Y': Y, 'Q': Q_est['Q'][:, 0], 'h': h, 'Delta': Delta})
            df = df[df['Delta'] == 1]
            weights = wts[df.index]  # Correctly subset weights
            if family == 'binomial':
                fam = Binomial()
            else:
                fam = Gaussian()
            # Fit GLM with offset and no intercept (-1 + h)
            # Formula in R: Y ~ -1 + offset(Q) + h
            # In statsmodels, use 'Y ~ -1 + h', with offset parameter
            model = sm.GLM.from_formula('Y ~ -1 + h', data=df, family=fam, offset=df['Q'], freq_weights=weights)
            epsilon_model = model.fit()
            epsilon = epsilon_model.params['h']
            print(f"Calculated epsilon: {epsilon}")
        # Update QAW, Q1W, Q0W
        QAW = Q_est['Q'][:, 0] + epsilon * h
        Q1W = Q_est['Q'][:, 1] + epsilon * h1W
        Q0W = Q_est['Q'][:, 2] + epsilon * h0W
        # Back-transform if binomial
        if family == 'binomial':
            QAW = sm.families.links.logit().inverse(QAW)
            Q1W = sm.families.links.logit().inverse(Q1W)
            Q0W = sm.families.links.logit().inverse(Q0W)
        # Compute psi
        psi_tmle = np.mean(Q1W) - np.mean(Q0W)
        print(f"Estimated psi (treatment effect): {psi_tmle}")
        # Replace missing Y with QAW
        Y_copy = Y.copy()
        Y_copy[np.isnan(Y)] = QAW[np.isnan(Y)]
        # Compute influence curve
        IC = (Y_copy - QAW) * h * Delta + Q1W - Q0W - psi_tmle
        # Group by id and take mean
        IC = pd.Series(IC).groupby(id).mean().values
        # Replace NaN or Inf with Inf
        IC[np.isnan(IC) | np.isinf(IC)] = np.inf
        # Variance
        varIC = np.var(IC)
        var_psi = varIC / len(np.unique(id))
        # Confidence Interval
        CI = [psi_tmle - 1.96 * np.sqrt(var_psi), psi_tmle + 1.96 * np.sqrt(var_psi)]
        # P-value
        pvalue = 2 * (1 - stats.norm.cdf(abs(psi_tmle / np.sqrt(var_psi))))
        # Counterfactual Q
        Qcounter = np.column_stack((Q1W, Q0W))
        print("TMLE computation completed.\n")
        return {
            'psi': psi_tmle,
            'var': var_psi,
            'pvalue': pvalue,
            'CI': CI,
            'epsilon': epsilon,
            'Q': Q_est,
            'g_A': g_A_est,
            'g_M': g_M_est,
            'Qcounterfactual': Qcounter
        }


# ---------------------------- Sample calls to tmle function ------------------------
# Note: Generate data before running the examples!

# ------------ generate data --------------
np.random.seed(10)
n = 500
W = np.random.normal(size=(n, 3))
A = np.random.binomial(1, 1 / (1 + np.exp(-(.1 * W[:, 0] - .1 * W[:, 1] + .5 * W[:, 2]))))
Y = A + 2 * W[:, 0] + W[:, 2] + W[:, 1] ** 2 + np.random.normal(size=n)
# Column names for pandas DataFrame
W_columns = ['W1', 'W2', 'W3']

# --------------------------------------------------------
# Example 1: Default function invocation
# Invokes GLM to estimate Q, g_A, g_M,
# because Delta argument is not supplied, assumes (Y,A) observed for all obs
result1 = tmle(Y, A, W)

# --------------------------------------------------------
# Example 2: Binary outcome, GLM estimates Q
# Known g_A = 0.5 is user-supplied,
A_ex2 = np.random.binomial(1, 0.5, size=n)
Y_ex2 = A_ex2 + 2 * W[:, 0] + W[:, 2] + W[:, 1] ** 2 + np.random.normal(size=n)
result2 = tmle(Y=Y_ex2, A=A_ex2, W=W, g_A=np.full(len(Y_ex2), 0.5))

# --------------------------------------------------------
# Example 3: Supplying an indicator for observations missing the outcome
# Set Delta to 1 for obs where Y is observed, 0 when Y is missing
# In this example, Delta is set to indicate 20% missing values, MCAR
Delta = np.random.binomial(1, 0.8, size=n)
result3 = tmle(Y, A, W, Delta=Delta)

# --------------------------------------------------------
# Example 4: User-supplied (misspecified) model for Q, GLM estimates for g_A, g_M
# Approx. 20% missing, MAR
Delta = np.random.binomial(1, 1 / (1 + np.exp(-(1.7 - 1 * W[:, 0]))), size=n)
formula_Q = 'Y ~ A + W1 + W2 + W3'
result4 = tmle(Y, A, W, Delta=Delta, Q=formula_Q)

# --------------------------------------------------------
# Example 5: User-supplied models for g_A and missingness mechanism g_M,
# GLM estimates Q.
# 100 unique IDs supplied
# Usage note: use "A" for dependent variable name in the formula for g_M
Delta = np.random.binomial(1, 1 / (1 + np.exp(-(1.6 - 1 * W[:, 0]))), size=n)
id_array = np.repeat(np.arange(1, 101), repeats=n // 100 + 1)[:n]
formula_g_A = 'A ~ W1 + W2 + W3'
formula_g_M = 'A ~ W1'  # Note: 'A' is used as the dependent variable in g_M
result5 = tmle(Y, A, W, Delta=Delta, g_A=formula_g_A, g_M=formula_g_M, id=id_array)

# --------------------------------------------------------
# Function to display summaries of the results
def summary(result):
    """
    Print a summary of the TMLE results.

    Parameters:
    - result: Dictionary returned by the tmle function
    """
    if result is None:
        print("No result to display.")
        return
    print("psi (treatment effect estimate):", result['psi'])
    print("p-value:", result['pvalue'])
    print("Confidence Interval:", result['CI'])
    print("Variance:", result['var'])
    print("Epsilon:", result['epsilon'])
    print("Coefficients for Q_n^0(A,W):")
    print(result['Q']['coef'])
    print("Coefficients for g_A(1,W):")
    print(result['g_A']['coef'])
    print("Coefficients for g_M(1,A,W):")
    print(result['g_M']['coef'])

# Displaying the results
print("Result 1:")
summary(result1)
print("\nResult 2:")
summary(result2)
print("\nResult 3:")
summary(result3)
print("\nResult 4:")
summary(result4)
print("\nResult 5:")
summary(result5)



Starting TMLE computation...
Estimating Q...
DSA not found, running main terms regression for 'Q' using GLM
Q estimation completed.
Estimating g...
DSA not found, running main terms regression for 'g' using GLM
g estimation completed.
Estimating g...
g estimation completed.
Calculating epsilon...
Calculated epsilon: 0.24959117128400143
Estimated psi (treatment effect): 0.5871825884426727
TMLE computation completed.


Starting TMLE computation...
Estimating Q...
DSA not found, running main terms regression for 'Q' using GLM
Q estimation completed.
Estimating g...
g estimation completed.
Estimating g...
g estimation completed.
Calculating epsilon...
Calculated epsilon: 0.2562278791444066
Estimated psi (treatment effect): 0.5165554043551235
TMLE computation completed.


Starting TMLE computation...
Estimating Q...
DSA not found, running main terms regression for 'Q' using GLM
Q estimation completed.
Estimating g...
DSA not found, running main terms regression for 'g' using GLM
g estimati