In [13]:
import bnlearn as bn
import pandas as pd
import numpy as np

import logging

# Reduce the console logging level for these libraries
logging.getLogger("bnlearn").setLevel(logging.ERROR)


import numpy as np

class Data:
    """Minimal data stand-in that only records the number of variables.

    Attributes:
        n: The variable count, stored exactly as it was passed in.
    """

    def __init__(self, n) -> None:
        # No validation or conversion: ``obj.n`` reads back the given value.
        self.n = n


class GobnilpScores:
    """
    Scores-like adapter around the output of ``parse_gobnilp_jkl``.

    It exposes the attributes and methods this file's candidate-parent
    pipeline reads (``n``, ``data.n``, ``maxid``, ``local``, ``sum``,
    ``all_candidate_restricted_scores``, ``clear_cache``).

    Attributes:
        n: Number of nodes, taken as ``max(node id) + 1``.
        data: A ``Data`` instance carrying the same ``n``.
        local_scores: ``{node: {sorted_parents_tuple: score}}``.
        maxid: Maximum parent-set size; ``-1`` means "not known / unlimited".
    """

    def __init__(self, parsed_scores):
        """
        Args:
            parsed_scores (dict):
                A dict of { node: [ (score, (parents...)), ... ], ... }

        Raises:
            ValueError: If ``parsed_scores`` is empty, because the node count
                cannot be derived from it.
        """
        if not parsed_scores:
            raise ValueError("parsed_scores is empty: no local scores to wrap")

        self.n = max(parsed_scores) + 1
        self.data = Data(self.n)

        # Parent sets are keyed by their sorted tuple so that lookups do not
        # depend on the order in which parents are listed. If a parent set
        # appears twice for a node, the later entry wins.
        self.local_scores = {
            node: {tuple(sorted(parents)): score for score, parents in sp_list}
            for node, sp_list in parsed_scores.items()
        }

        # If you do not have a known maximum parent set size, keep this -1
        self.maxid = -1

    def local(self, v, parents):
        """
        Return the stored local score of node ``v`` with parent set ``parents``.

        Sumu calls 'scores.local(...)' in the candidate generation, so the
        method name must stay exactly this.

        Args:
            v: Node id.
            parents: Iterable of parent node ids, in any order.

        Returns:
            The stored score, or ``-inf`` when that parent set has no entry.

        Raises:
            KeyError: If ``v`` has no entry in ``local_scores`` at all.
        """
        p_sorted = tuple(sorted(parents))
        return self.local_scores[v].get(p_sorted, float("-inf"))

    def all_candidate_restricted_scores(self, C):
        """
        Collect, per node, the scores of stored parent sets that lie within C[node].

        Args:
            C: Mapping or sequence indexed by ``0 .. len(C) - 1``; ``C[i]`` is
                an iterable of the candidate parents of node ``i``.

        Returns:
            A float array of shape ``(len(C), max_i 2 ** len(C[i]))``. Row ``i``
            holds the scores of node ``i``'s stored parent sets that are
            subsets of ``C[i]``, in storage order, padded with ``-inf``.
            An empty ``C`` yields an array of shape ``(0, 0)``.

        NOTE(review): column ``j`` is the j-th matching parent set in storage
        order, NOT a bitmask encoding of the subset. If the consumer expects
        bit-encoded columns, this ordering must be confirmed/adapted.
        """
        V = len(C)  # number of nodes

        # Every row needs the same width, so use the largest possible number
        # of subsets over all nodes; ``default`` covers an empty C.
        max_subset_count = max((2 ** len(C[i]) for i in range(V)), default=0)

        arr = np.full((V, max_subset_count), float("-inf"), dtype=float)

        for i in range(V):
            allowed = set(C[i])
            kept = [
                sc
                for parents_tuple, sc in self.local_scores[i].items()
                if allowed.issuperset(parents_tuple)
            ]
            for j, sc in enumerate(kept):
                arr[i, j] = sc
            # Columns beyond len(kept) keep their -inf padding.

        return arr

    def sum(self, v, U, T):
        """
        Log-sum-exp of node ``v``'s local scores over every subset of ``U ∪ T``.

        Args:
            v: Node id.
            U: Iterable of node ids.
            T: Iterable of node ids.

        Returns:
            ``log(sum(exp(score)))`` over all subsets of the union of ``U``
            and ``T``; parent sets without a stored score contribute ``-inf``
            (i.e. nothing). The result is ``-inf`` if no subset is stored.

        NOTE(review): ``U`` and ``T`` are treated identically here (only their
        union matters). Verify against the contract the caller expects, e.g.
        whether ``T`` should instead restrict the sum to sets intersecting it.
        """
        from itertools import combinations

        # Accept any iterables, not only sets.
        combined_parents = set(U) | set(T)

        total_score = float("-inf")

        # Enumerate every subset size from the empty set to the full union.
        for k in range(len(combined_parents) + 1):
            for parent_set in combinations(combined_parents, k):
                total_score = np.logaddexp(total_score, self.local(v, parent_set))

        return total_score

    def clear_cache(self):
        """If your scoring logic uses caching, clear it here; otherwise do nothing."""
        pass
    


def _parse_jkl_score_line(parts):
    """Return ``(score, parents)`` for a tokenised score line, or None if malformed."""
    if len(parts) < 2:
        return None
    try:
        score = float(parts[0])  # First part is the score
        num_parents = int(parts[1])  # Second part is the number of parents
        parent_nodes = tuple(map(int, parts[2:]))
    except ValueError:
        return None
    # The declared parent count must match the parents actually listed.
    if len(parent_nodes) != num_parents:
        return None
    return score, parent_nodes


def parse_gobnilp_jkl(file_path):
    """
    Parse the Gobnilp .jkl file format.

    The parser recognises three kinds of non-empty lines:
      * a single integer (e.g. "8"): metadata, skipped;
      * a node header "<node> <count>" (e.g. "6 64");
      * a score line "<score> <num_parents> <parent> ...".

    The ``<count>`` of a header is used to read the following lines as score
    lines, so a score line such as "0.5 0" is not mistaken for a header. If a
    line inside that counted run is not a valid score line, the count is
    dropped and the line is classified by shape instead.

    Args:
    - file_path (str): Path to the .jkl file.

    Returns:
    - dict: ``{node_id: [(score, parent_nodes), ...]}`` where ``node_id`` is an
      int, ``score`` a float and ``parent_nodes`` a tuple of ints, in file order.

    Malformed lines are reported with ``print`` and skipped.
    """
    scores = {}
    current_node = None
    remaining = 0  # score lines still expected for current_node per its header

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            parts = line.split()

            if remaining > 0:
                entry = _parse_jkl_score_line(parts)
                if entry is not None:
                    remaining -= 1
                    scores[current_node].append(entry)
                    continue
                # The header's count did not match the file contents; stop
                # trusting it and classify this line by its shape below.
                remaining = 0

            if len(parts) == 1 and parts[0].isdigit():
                # Metadata line (e.g., "8"), skip it
                continue
            elif len(parts) == 2 and not line.startswith("-"):
                # Node header line (e.g., "6 64")
                try:
                    current_node = int(parts[0])  # Node ID
                except ValueError:
                    print(f"Unexpected node header: {line}")
                else:
                    scores[current_node] = []  # Initialize an empty list for this node
                    remaining = int(parts[1]) if parts[1].isdigit() else 0
            elif current_node is not None and len(parts) >= 2:
                # Score line outside a counted run
                entry = _parse_jkl_score_line(parts)
                if entry is None:
                    print(f"Invalid score or parent set line: {line}")
                else:
                    scores[current_node].append(entry)
            else:
                print(f"Unrecognized line: {line}")

    return scores









###########################
# 1. Coverage Fraction
###########################
def coverage_fraction(candidate_parents, sampled_dags):
    """
    Compute the share of sampled DAGs that respect the candidate parent sets.

    A DAG (a mapping ``node -> set of parents``) counts as covered when, for
    every node in it, the parents are a subset of ``candidate_parents[node]``.
    Returns 0.0 when ``sampled_dags`` is empty.
    """
    total = len(sampled_dags)

    # all() stops at the first node whose parents fall outside the candidates.
    count_covered = sum(
        1
        for dag in sampled_dags
        if all(
            parents.issubset(candidate_parents[node])
            for node, parents in dag.items()
        )
    )

    if total > 0:
        return count_covered / total
    return 0.0
from sumu.candidates import candidate_parent_algorithm as cpa
def top_heuristic(scores,K):
    """
    Run the "top" entry of sumu's candidate parent algorithms.

    The algorithm is looked up as ``cpa["top"]`` and called with ``K``, the
    scoring object ``scores`` and the node count ``scores.data.n``; whatever
    it returns is passed back unchanged.
    """
    return cpa["top"](
        K,
        scores=scores,     # sumu-compatible scoring object
        n=scores.data.n,   # number of nodes
    )


###########################
# 2. Repeated Pipeline
###########################
def estimate_expected_coverage(
    n_reps,            # how many datasets to sample
    n_samples_data,    # how many data points in each dataset
    K=5,               # value passed as K to top_heuristic
):
    """
    Estimate the average coverage fraction over repeated data draws.

    For each of ``n_reps`` repetitions:
      1) Sample ``n_samples_data`` rows from the bnlearn 'asia' model.
      2) Rename the columns to "0", "1", ... and prepend a row holding the
         number of distinct values per column, then write the result to
         ``data/asia_dataset_variable_<rep>.csv`` and (space separated)
         ``data/asia_dataset_variable_<rep>.dat``.
      3) Run pygobnilp on the ``.dat`` file to produce
         ``data/asia_scores_variable_<rep>.jkl``; the command's stdout goes to
         ``command_output.txt`` (overwritten on every repetition).
      4) Parse the scores, compute candidate parents with
         ``top_heuristic(scores, K)`` and sample DAGs with
         ``generate_sampled_dags(rep)``.
      5) Compute ``coverage_fraction`` for that repetition.

    Args:
        n_reps: Number of datasets to sample.
        n_samples_data: Number of rows in each sampled dataset.
        K: Passed to ``top_heuristic`` as its ``K`` argument (default 5).

    Returns:
        ``np.mean`` of the per-repetition coverage fractions.

    Raises:
        subprocess.CalledProcessError: If the pygobnilp command exits with a
            non-zero status. The error is reported and re-raised so that a
            missing or stale score file is never parsed.
    """
    import os
    import subprocess

    # All per-repetition artefacts are written under data/.
    os.makedirs("data", exist_ok=True)

    asia_model = bn.import_DAG('asia')

    coverage_values = []
    for rep in range(n_reps):
        print(f"=== Data draw #{rep+1} of {n_reps} ===")

        dataset_stem = "data/asia_dataset_variable_"+str(rep)
        scores_path = "data/asia_scores_variable_"+str(rep)+".jkl"

        # Simulate a fresh dataset from the BN
        df = pd.DataFrame(bn.sampling(asia_model, n=n_samples_data))

        columns = [str(i) for i in range(len(df.columns))]
        print(columns)

        # One-row frame with the number of distinct values of each variable.
        arities = [df[col].nunique() for col in df.columns]
        df_arity = pd.DataFrame([arities], columns=columns)

        # Rename the sampled columns to the same numeric labels.
        df = df.rename(columns=dict(zip(df.columns, columns)))

        # Arity row on top of the samples; written once per output format.
        df_combined = pd.concat([df_arity, df], ignore_index=True)
        df_combined.to_csv(dataset_stem+".csv", index=False)
        df_combined.to_csv(dataset_stem+".dat", index=False, sep=' ')

        # Define the command as a list
        command = [
            "python3",
            "./pygobnilp-1.0/rungobnilp.py",
            "./"+dataset_stem+".dat",
            "--output_scores", scores_path,
            "--score", "BDeu",
            "--nopruning",
            "--end", "local scores"
        ]

        # Specify the output file
        output_file = "command_output.txt"

        # Run the command and write its output to a file
        with open(output_file, "w") as file:
            try:
                subprocess.run(command, check=True, text=True, stdout=file, stderr=subprocess.PIPE)
            except subprocess.CalledProcessError as e:
                print("An error occurred while executing the command.")
                print("Error message:", e.stderr)
                # Everything below depends on the score file of this run.
                raise
        print("Command executed successfully! Output written to", output_file)

        # Parse Gobnilp output and wrap it in the scoring adapter
        parsed_scores = parse_gobnilp_jkl(scores_path)
        scores = GobnilpScores(parsed_scores)

        candidate_parents = top_heuristic(scores, K)

        sampled_dags = generate_sampled_dags(rep)

        # Compute coverage fraction for this data draw's DAGs
        fraction = coverage_fraction(candidate_parents, sampled_dags)
        coverage_values.append(fraction)

    # Average coverage fraction across all data draws
    return np.mean(coverage_values)

###########################
# 3. Demo Heuristic / DAG Learning
###########################

# Suppose your “Back-forth candidates” mean that for node i,
# the candidate parents are exactly this set.




def coverage_fraction(candidate_parents, sampled_dags):
    """
    Fraction of the sampled DAGs in which every node's parent set is
    contained in ``candidate_parents[node]``; 0.0 for an empty sample.
    """
    total = len(sampled_dags)
    hits = 0

    for dag in sampled_dags:
        for node, parents in dag.items():
            if not parents.issubset(candidate_parents[node]):
                # One offending node is enough to rule this DAG out.
                break
        else:
            # Inner loop finished without a break: the DAG is covered.
            hits += 1

    return hits / total if total > 0 else 0.0



def _parse_dag_line(line: str) -> dict:
    """Parse one line such as "0 <- {}, 1 <- {3}" into ``{node: set_of_parents}``."""
    dag = {}

    # Split by "}," to separate each node's parent specification
    for chunk in line.split("},"):
        chunk = chunk.strip()
        if not chunk:
            continue  # skip empty pieces (can happen if line ends with "},")

        # The split removed the closing brace from all but the last chunk
        if not chunk.endswith("}"):
            chunk += "}"

        # Now chunk should look like, for example: "0 <- {}", or "1 <- {3}"
        node_str, arrow, parents_str = chunk.partition("<-")
        if not arrow:
            # Without "<-" the chunk is not a node specification; skip it.
            continue

        parents_str = parents_str.strip()  # e.g. "{3}" or "{}"

        # Remove the outer braces from parents_str
        if parents_str.startswith("{"):
            parents_str = parents_str[1:]
        if parents_str.endswith("}"):
            parents_str = parents_str[:-1]

        # e.g. "1, 3" -> {1, 3}; an empty string yields an empty set
        parents = {int(p) for p in parents_str.split(",") if p.strip()}

        # Node labels are parsed as integers
        dag[int(node_str.strip())] = parents

    return dag


def _parse_dag_file(file_path: str) -> list:
    """Parse every non-empty line of ``file_path`` into a DAG dict; return them as a list."""
    all_dags = []
    with open(file_path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # skip empty lines

            dag = _parse_dag_line(line)
            if dag:
                all_dags.append(dag)
            else:
                print(f"Warning: line {line_no} didn't parse to a DAG, skipping.")

    return all_dags


def generate_sampled_dags(rep, n_dags=10000):
    """
    Run the external DAG sampler on repetition ``rep``'s score file and parse its output.

    The command ``./modular-dag-sampling-master/sampler nonsymmetric
    data/asia_scores_variable_<rep>.jkl <n_dags>`` is executed with its stdout
    redirected to ``data/asia_sampled_variable_<rep>.txt``. Each non-empty
    line of that file is then parsed as one DAG.

    Args:
        rep: Repetition index used to build the input and output file names.
        n_dags: Value passed as the sampler's last argument (default 10000);
            presumably the number of DAGs to draw — confirm against the
            sampler's usage.

    Returns:
        list: One ``{node: set_of_parents}`` dict per parsed line.

    Raises:
        subprocess.CalledProcessError: If the sampler exits with a non-zero
            status. The error is reported and re-raised, because the output
            file was truncated on open and must not be parsed as a result.
    """
    import subprocess

    scores_file = "data/asia_scores_variable_"+str(rep)+".jkl"
    output_file = "data/asia_sampled_variable_"+str(rep)+".txt"

    # Define the command as a list of arguments
    command = ["./modular-dag-sampling-master/sampler", "nonsymmetric", scores_file, str(n_dags)]

    # Run the command and write its output to the file
    with open(output_file, "w") as file:
        try:
            subprocess.run(command, check=True, text=True, stdout=file, stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            print("An error occurred while executing the command.")
            print("Error message:", e.stderr)
            raise
    print("Command executed successfully! Output written to", output_file)

    return _parse_dag_file(output_file)
    

            
    


###########################
# 4. Run the Test
###########################
if __name__ == "__main__":
    # 50 repeated data draws, each sampling 500 data points.
    n_reps, n_samples_data = 50, 500

    # Run the repeated pipeline and report the averaged coverage.
    avg_coverage = estimate_expected_coverage(n_reps=n_reps, n_samples_data=n_samples_data)

    print("\n=== Result ===")
    print("Estimated coverage across multiple data draws:", avg_coverage)


[bnlearn] >Import <asia>
[bnlearn] >Loading bif file </home/gulce/.pyenv/versions/3.11.7/lib/python3.11/site-packages/datazets/data/asia.bif>
[bnlearn] >Check whether CPDs sum up to one.
=== Data draw #1 of 50 ===
['0', '1', '2', '3', '4', '5', '6', '7']
Command executed successfully! Output written to command_output.txt
Command executed successfully! Output written to data/asia_sampled_variable_0.txt
=== Data draw #2 of 50 ===
['0', '1', '2', '3', '4', '5', '6', '7']
Command executed successfully! Output written to command_output.txt
Command executed successfully! Output written to data/asia_sampled_variable_1.txt
=== Data draw #3 of 50 ===
['0', '1', '2', '3', '4', '5', '6', '7']
Command executed successfully! Output written to command_output.txt
Command executed successfully! Output written to data/asia_sampled_variable_2.txt
=== Data draw #4 of 50 ===
['0', '1', '2', '3', '4', '5', '6', '7']
Command executed successfully! Output written to command_output.txt
Command executed succes