In [None]:
def backdoor_adjustment_opt(df, Y, y, A, a, Z):
    """Estimate P(Y=y | do(A=a)) with the back-door adjustment formula.

    Computes sum_z P(Y=y | A=a, Z=z) * P(Z=z) over the strata of ``Z``
    observed in ``df``. Strata in which no row has ``A == a`` cannot
    contribute a conditional probability, so they are skipped and the
    remaining stratum weights are renormalised to sum to 1.

    Args:
        df: DataFrame holding the observations.
        Y: Name of the outcome column.
        y: Outcome value whose probability is estimated.
        A: Name of the intervened (treatment) column.
        a: Intervention value.
        Z: List-like of adjustment (confounder) column names. May be empty,
            in which case the estimate reduces to P(Y=y | A=a).

    Returns:
        The estimated probability, or 0 when no row has ``A == a``.
    """
    # Integer indicators so that group-wise sums are plain counts.
    is_treated = (df[A] == a).astype(int)
    is_hit = ((df[A] == a) & (df[Y] == y)).astype(int)

    # With nothing to adjust for, the formula collapses to P(Y=y | A=a).
    if len(Z) == 0:
        n_treated = is_treated.sum()
        if n_treated == 0:
            return 0
        return is_hit.sum() / n_treated

    # One pass over the data: per-stratum row count, A=a count, and
    # (A=a and Y=y) count. sort=False keeps strata in first-seen order.
    keys = [df[column] for column in Z]
    stratum_size = is_treated.groupby(keys, sort=False).size()
    treated_count = is_treated.groupby(keys, sort=False).sum()
    hit_count = is_hit.groupby(keys, sort=False).sum()

    # Only strata where A=a was actually observed are usable.
    observed = treated_count > 0
    relevant_rows = stratum_size[observed].sum()
    if relevant_rows == 0:
        return 0

    p_y_given_a_z = hit_count[observed] / treated_count[observed]
    # Weighting by stratum size and dividing by the number of rows in the
    # usable strata is P(Z=z) renormalised over those strata.
    return (p_y_given_a_z * stratum_size[observed]).sum() / relevant_rows


In [None]:
def get_prob_backdoor_opt_edit2(G, df, k, update_vars, target_column, condition, opt, row_indexes):
    """Return the product, over ``row_indexes``, of per-tuple threshold sums.

    The updated dataframe comes from ``ranking_funcs.get_ranking_query``;
    its target column supplies the thresholds (theta_2.. and theta_1..).
    For every (Y value, X value) pair an interventional probability is
    computed with ``ranking_funcs.backdoor_adjustment_opt`` using the first
    back-door set of the updated variable. For each tuple, the first
    ``k - 1`` threshold terms are summed for that tuple's X value, and the
    per-tuple sums are multiplied together.

    G: causal graph
    df: dataframe
    k: the top k (``k - 1`` thresholds are used)
    update_vars: mapping whose first key is the updated variable
    target_column: the column the ranking is based on
    condition: the condition for updating
    opt: update operation passed through to ``get_ranking_query``
    row_indexes: index labels (in the updated dataframe) of the tuples
    """
    n_rows = len(df)
    updated_df = ranking_funcs.get_ranking_query(G, df, n_rows, update_vars, target_column, condition, opt)
    node = list(update_vars.keys())[0]

    ranked_y = updated_df[target_column]
    # theta_2 .. theta_n and theta_1 .. theta_n, positionally sliced
    theta_lst = ranked_y[1:n_rows + 1].tolist()
    prev_theta_lst = ranked_y[0:n_rows].tolist()

    bd_set = ranking_funcs.find_backdoor_sets_opt(G, target_column, node)[0]
    dom_y = ranked_y.unique()
    dom_node = updated_df[node].unique()

    def _prob_record(y_val, x_val):
        # One row of the probability table: Pr(Y = y_val | do(X) = x_val)
        adjusted = ranking_funcs.backdoor_adjustment_opt(
            updated_df, target_column, y_val, node, x_val, list(bd_set)
        )
        return {
            'Y': target_column,
            'Y_value': y_val,
            'X': node,
            'X_value': x_val,
            'Z': ', '.join(bd_set),
            'prob': adjusted,
        }

    prob_df = pd.DataFrame([_prob_record(y_val, x_val) for y_val in dom_y for x_val in dom_node])

    def _theta_term(i, x_value):
        # Mass with Y > theta_i, minus the part that also has Y <= theta_(i-1),
        # restricted to rows of the table whose X value is x_value.
        above = prob_df[prob_df['Y_value'] > theta_lst[i]]
        band = above[above['Y_value'] <= prev_theta_lst[i]]
        above_mass = above.loc[above['X_value'] == x_value, 'prob'].sum()
        band_mass = band.loc[band['X_value'] == x_value, 'prob'].sum()
        return above_mass - band_mass

    prob_groups = []
    for row_index in row_indexes:
        x_value = updated_df.loc[row_index][node]
        prob_groups.append(sum(_theta_term(i, x_value) for i in range(k - 1)))

    # Product of the per-tuple sums
    return m.prod(prob_groups)

In [None]:
def get_prob_backdoor_opt_pred(G, df, k, update_vars, target_column, condition, opt):
    """Score every size-``k`` combination of tuples and return the best one.

    G: causal graph
    df: dataframe
    k: the top k (size of each tuple combination)
    update_vars: the variable used for update (its first key is used)
    target_column: the column the ranking is based on
    condition: the condition for updating
    opt: must be one of 'fix','add','subs','multiply_by' or 'divided_by'

    For every combination of ``k`` index labels of ``df`` and every
    threshold theta_i, the per-tuple threshold terms are multiplied
    together; those products are then summed over all thresholds to give
    the combination's score.

    Returns a one-row DataFrame with columns ``ranking_combos`` and
    ``prob_in_top_k`` holding the highest-scoring combination (empty when
    there are no combinations).
    """
    ### get the updated dataframe
    updated_df = ranking_funcs.get_ranking_query(G, df, len(df), update_vars, target_column, condition, opt)
    ### the updated variable
    node = list(update_vars.keys())[0]
    ### theta 2 to theta n
    theta_lst = updated_df[target_column][1:len(df) + 1].tolist()
    ### theta 1 to theta n-1 list
    prev_theta_lst = updated_df[target_column][0:len(df)].tolist()
    ### first back-door set of the updated variable
    bd_set = ranking_funcs.find_backdoor_sets_opt(G, target_column, node)[0]
    ### all ranking tuple combinations, n choose k
    tupple_combs = list(combinations(df.index, k))
    #### unique values of Y and of the updated variable
    dom_y = updated_df[target_column].unique()
    dom_node = updated_df[node].unique()

    ### Pr(Y = y | do(X) = x) for every (y, x) pair via back-door adjustment
    results = []
    for d_y in dom_y:
        for d_n in dom_node:
            adjusted_prob = ranking_funcs.backdoor_adjustment_opt(
                updated_df, target_column, d_y, node, d_n, list(bd_set)
            )
            results.append({
                'Y': target_column,
                'Y_value': d_y,
                'X': node,
                'X_value': d_n,
                'Z': ', '.join(bd_set),
                'prob': adjusted_prob,
            })
    prob_df = pd.DataFrame(results)

    # The threshold term depends only on (theta index, X value), not on the
    # combination, so it is memoised instead of being re-filtered per combo.
    term_cache = {}

    def _theta_term(i, x_value):
        key = (i, x_value)
        if key not in term_cache:
            above = prob_df[prob_df['Y_value'] > theta_lst[i]]
            band = above[above['Y_value'] <= prev_theta_lst[i]]
            term_cache[key] = (
                above[above['X_value'] == x_value]['prob'].sum()
                - band[band['X_value'] == x_value]['prob'].sum()
            )
        return term_cache[key]

    prob_groups = []
    for row_indexes in tupple_combs:
        x_values = [updated_df.loc[row_index][node] for row_index in row_indexes]
        # Collect one product per theta. The list must live outside the
        # theta loop so the final sum covers every theta, not just the last.
        pro_prod = []
        for i in range(len(df) - 1):
            prob_product = 1
            for x_value in x_values:
                prob_product *= _theta_term(i, x_value)
            pro_prod.append(prob_product)
        ### sum over all the thetas
        prob_groups.append(sum(pro_prod))

    # A plain dict has no sort_values; build a DataFrame before sorting.
    ranking_prob_df = pd.DataFrame(
        {'ranking_combos': tupple_combs, 'prob_in_top_k': prob_groups}
    ).sort_values(by='prob_in_top_k', ascending=False)
    ### the highest-scoring combination
    return ranking_prob_df.head(1)

In [None]:
def predict_backdoor_opt(G, df, k, update_vars, target_column, condition, opt):
    """Rank rows by the expected target value under the update; return top k.

    For each value x of the updated variable, the expected value is
    sum_y y * Pr(Y = y | do(X) = x), with the probabilities taken from
    ``ranking_funcs.backdoor_adjustment_opt`` on the updated dataframe.
    Every row of the updated dataframe is assigned the expected value of
    its own X value (0 when no entry exists for it).

    G: causal graph
    df: dataframe
    k: number of rows to return
    update_vars: the variable used for update (its first key is used)
    target_column: the column the ranking is based on
    condition: the condition for updating
    opt: update operation passed through to ``get_ranking_query``

    Returns a DataFrame with columns ``row_index`` and ``expected_value``,
    sorted by ``expected_value`` descending and truncated to ``k`` rows.
    """
    ### get the updated dataframe
    updated_df = ranking_funcs.get_ranking_query(G, df, len(df), update_vars, target_column, condition, opt)
    ### the updated variable
    node = list(update_vars.keys())[0]
    ### first back-door set of the updated variable
    bd_set = ranking_funcs.find_backdoor_sets_opt(G, target_column, node)[0]
    dom_y = updated_df[target_column].unique()
    dom_node = updated_df[node].unique()

    results = []
    for d_y in dom_y:
        for d_n in dom_node:
            adjusted_prob = ranking_funcs.backdoor_adjustment_opt(
                updated_df, target_column, d_y, node, d_n, list(bd_set)
            )
            results.append({
                'Y': target_column,
                'Y_value': d_y,
                'X': node,
                'X_value': d_n,
                'Z': ', '.join(bd_set),
                'prob': adjusted_prob,
            })
    ## probability table; its value columns are 'Y_value' and 'X_value'
    prob_df = pd.DataFrame(results)
    prob_df['expected_value'] = prob_df['prob'] * prob_df['Y_value']
    # Expected Y per X value, as a plain dict for cheap per-row lookup.
    expected_by_x = prob_df.groupby('X_value')['expected_value'].sum().to_dict()

    # Rows whose X value has no entry fall back to 0.
    expected_values = [expected_by_x.get(x_value, 0) for x_value in updated_df[node]]

    result_df = pd.DataFrame({'row_index': updated_df.index, 'expected_value': expected_values})
    return result_df.sort_values(by='expected_value', ascending=False).head(k)


In [None]:
def predict_backdoor_opt2(G, df, k, update_vars, target_column, condition, opt):
    """
    Use P(Y|do(X),Z) to estimate the expected target value per row and
    return the top ``k`` rows.

    G: causal graph
    df: dataframe
    k: number of rows to return
    update_vars: the variable used for update (its first key is used)
    target_column: the column the ranking is based on
    condition: the condition for updating
    opt: update operation passed through to ``get_ranking_query``

    NOTE(review): ``backdoor_adjustment_opt2`` is not defined in the visible
    part of this file; from its use here it is assumed to return a
    DataFrame containing the back-door columns, the updated variable and
    an 'expected_value' column -- confirm against its definition.

    Returns a DataFrame with columns ``row_index`` and ``expected_value``,
    sorted by ``expected_value`` descending and truncated to ``k`` rows.
    """
    updated_df = get_ranking_query(G, df, len(df), update_vars, target_column, condition, opt)
    node = list(update_vars.keys())[0]
    results = []
    bd_set = ranking_funcs.find_backdoor_sets_opt(G, target_column, node)[0]
    dom_y = updated_df[target_column].unique()
    dom_node = updated_df[node].unique()
    for d_y in dom_y:
        for d_n in dom_node:
            result_df = backdoor_adjustment_opt2(updated_df, target_column, d_y, node, d_n, list(bd_set))
            if not result_df.empty:
                results.append(result_df)

    merged_df = pd.concat(results, ignore_index=True)
    # Group on the adjustment columns that were passed to the helper, plus
    # the updated variable itself.
    flat_bd_sets = list(bd_set) + [node]
    grouped_df = merged_df.groupby(flat_bd_sets).agg({'expected_value': 'sum'}).reset_index()

    expected_values = []
    for row_index, row in updated_df.iterrows():
        match_conditions = {col: row[col] for col in flat_bd_sets}
        matched_row = grouped_df[(grouped_df[list(match_conditions)] == pd.Series(match_conditions)).all(axis=1)]
        if not matched_row.empty:
            expected_value = matched_row['expected_value'].values[0]
        else:
            # No matching stratum: use 0 rather than a stale value from the
            # previous row.
            expected_value = 0
        expected_values.append(expected_value)

    result_df = pd.DataFrame({'row_index': updated_df.index, 'expected_value': expected_values})
    return result_df.sort_values(by='expected_value', ascending=False).head(k)


In [None]:
def get_prob_backdoor_opt2(G, df, k, update_vars, target_column, condition, opt, row_indexes, theta):
    """
    Use P(Y|do(X),Z) to estimate, for each tuple in ``row_indexes``, the
    probability mass of Y values >= ``theta``, and return the product of
    those per-tuple masses.

    G: causal graph
    df: dataframe
    k: accepted for signature parity with the sibling functions; not used here
    update_vars: the variable used for update (its first key is used)
    target_column: the column the ranking is based on
    condition: the condition for updating
    opt: update operation passed through to ``get_ranking_query``
    row_indexes: index labels (in the updated dataframe) of the tuples
    theta: threshold applied to the 'Y_value' column

    NOTE(review): ``backdoor_adjustment_opt2`` is not defined in the visible
    part of this file; from its use here it is assumed to return a
    DataFrame containing the back-door columns, the updated variable, and
    'Y_value' and 'probs' columns -- confirm against its definition.
    """
    updated_df = get_ranking_query(G, df, len(df), update_vars, target_column, condition, opt)
    node = list(update_vars.keys())[0]
    results = []
    bd_set = ranking_funcs.find_backdoor_sets_opt(G, target_column, node)[0]
    dom_y = updated_df[target_column].unique()
    dom_node = updated_df[node].unique()
    for d_y in dom_y:
        for d_n in dom_node:
            result_df = backdoor_adjustment_opt2(updated_df, target_column, d_y, node, d_n, list(bd_set))
            if not result_df.empty:
                results.append(result_df)
    merged_df = pd.concat(results, ignore_index=True)
    # Group on the adjustment columns that were passed to the helper, plus
    # the updated variable itself.
    flat_bd_sets = list(bd_set) + [node]
    filtered_merged_df = merged_df[merged_df['Y_value'] >= theta]
    prob_df = filtered_merged_df.groupby(flat_bd_sets).agg({'probs': 'sum'}).reset_index()

    total_probs = []
    for row_index in row_indexes:
        row = updated_df.loc[row_index]
        match_conditions = {col: row[col] for col in flat_bd_sets}
        matched_row = prob_df[(prob_df[list(match_conditions)] == pd.Series(match_conditions)).all(axis=1)]
        if not matched_row.empty:
            total_prob = matched_row['probs'].values[0]
        else:
            # No stratum at or above theta for this tuple
            total_prob = 0
        total_probs.append(total_prob)

    result_df = pd.DataFrame({'row_index': row_indexes, 'total_probs': total_probs})
    return result_df['total_probs'].prod()