In [6]:
import networkx as nx
import pandas as pd
from pgmpy.estimators import ConstraintBasedEstimator
from python.structure_learning.hybrid.mmhc import mmhc
from python.scores import *
from python.ci_tests import *
from python.discretize import *
from python.bnutils import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
######################################################   INPUT    ######################################################
# Notebook-level configuration. `runner` reads every name below as a global,
# so this cell must be executed before `runner` is called.
#
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without editing it. `pd.read_pickle` unpickles the file, so only
# load files from a trusted source.
data = pd.read_pickle('/home/code-base/gitrepo/Mixed/python/mixed_data/10nodes13edges/data_0.gpickle')
# Only consulted when `custom` is False: a column with more unique values than
# this is typed as continuous ('cont'), otherwise as discrete ('disc').
max_categories = 20
disc = 'quantile' # Strategy name forwarded to `discretizer`; per the original note, univariate only: quantile (default), uniform, kmeans
bins = 5 # Bin count forwarded to `discretizer`; per the original note, used by the univariate strategies only
alpha = 0.05 # Threshold forwarded to `mmhc`; original note: 0.05 for PC and MMPC, 0.1 for MDEPCS only
ci_test = chi_square_test # Conditional-independence test forwarded to `mmhc`; alternatives noted: fast_conditional_ind_test (default), chi_square_test
verbose = False
max_processes = 40
discretizer = unsupervised_discretization # Called on the continuous columns; alternatives noted: Data_Driven_Discretizer (for CAIM, LAIM, AMEVA), unsupervised_discretization
score = bdeu_score # Scoring function forwarded to `mmhc`; original note: hybrid only (bdeu_score, bic_score_discrete)
# Scoring function applied by `runner` to the final DAG and the returned data.
result_score = bic_score_discrete
custom = True  ## True: column types come from `cont_columns`; False: types are auto-selected using `max_categories`
cont_columns = [] ## Original names of the continuous columns (used when `custom` is True); while empty, every column is treated as discrete

In [3]:
def runner(data):
    """Learn a DAG from ``data`` via ``mmhc`` and score the result.

    The function reads its settings from notebook-level globals: ``custom``,
    ``cont_columns``, ``max_categories``, ``discretizer``, ``bins``, ``disc``,
    ``alpha``, ``ci_test``, ``verbose``, ``score``, ``max_processes`` and
    ``result_score``.

    Steps:
      1. Rename the columns with ``column_mapping(data)[0]``.
      2. Tag every column as ``'cont'`` or ``'disc'``. With ``custom`` set,
         a column is continuous when its original name is listed in
         ``cont_columns``; otherwise it is continuous when it has more than
         ``max_categories`` unique values.
      3. Replace the values of each discrete column by integer codes
         ``0..k-1`` in order of first appearance.
      4. Discretize the continuous columns with ``discretizer``.
      5. Run ``mmhc`` and convert its result with ``pdag_to_dag``.
      6. Restore the original column / node names via ``column_mapping(data)[1]``.

    Args:
        data: pandas DataFrame holding the observations. The caller's frame
            is left untouched; all renaming and recoding happens on copies.

    Returns:
        tuple ``(dag, data, score)``:
            dag   -- the graph returned by ``pdag_to_dag``, relabelled with
                     the original column names.
            data  -- the recoded / discretized frame with original column names.
            score -- ``result_score(dag, data)``.
    """
    ######################################################    DATA PREPROCESSING    ######################################################

    # mappers[0] is applied going in and mappers[1] coming out, so mappers[1]
    # is used as the inverse mapping (internal name -> original name).
    mappers = column_mapping(data)

    # Non-inplace rename: an inplace rename would leave the caller's DataFrame
    # with the internal column names and make the call cell non-idempotent.
    data = data.rename(columns=mappers[0])

    node_info = nx.DiGraph()
    columns = list(data.columns)
    node_info.add_nodes_from(columns)

    for col in columns:
        # Unique values in order of first appearance; computed once per column.
        uniques = pd.unique(data[col])
        categories = len(uniques)

        if custom:
            # cont_columns holds original names, hence the inverse lookup.
            is_continuous = mappers[1][col] in cont_columns
        else:
            is_continuous = categories > max_categories

        attrs = node_info.nodes[col]
        if is_continuous:
            attrs['type'] = 'cont'
            attrs['num_categories'] = 'NA'
        else:
            attrs['type'] = 'disc'
            attrs['num_categories'] = categories
            # Recode the discrete values as 0..categories-1.
            data = data.replace({col: uniques}, {col: list(range(categories))})

    # The estimator is built on the pre-discretization frame, as before; it is
    # only used here for its pdag_to_dag conversion.
    CBE = ConstraintBasedEstimator(data)

    cont_nodes = [name for name, info in node_info.nodes(data=True) if info['type'] == 'cont']
    data = discretizer(data, cont_nodes, bins, disc)
    dag = CBE.pdag_to_dag(mmhc(data, node_info.nodes(data=True), alpha, ci_test, verbose, score = score, max_process = max_processes))

    # Map columns and graph nodes back to the names the caller knows.
    data = data.rename(columns=mappers[1])
    nx.relabel_nodes(dag, mappers[1], copy=False)
    return (dag, data, result_score(dag, data))
        

In [4]:
# Run the pipeline on the loaded data; `result` is the (dag, data, score) tuple returned by `runner`.
result = runner(data)