In [1]:
import networkx as nx
import pandas as pd
import sys
from pgmpy.estimators import ConstraintBasedEstimator
sys.path.append('../python/structure_learning/constraint_based/')
from mdepcs import MDEPCS
from mmpc import MMPC
from pc import pcalg
sys.path.append('../python/structure_learning/score_based/')
from hc import hill_climb
from fges import fges
sys.path.append('../python/')
from scores import *
from ci_tests import *
from discretize import *
from bnutils import *
sys.path.append('../libraries/')
from caim_test import get_caim
import warnings
warnings.filterwarnings("ignore")

In [5]:
######################################################   INPUT    ######################################################
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file -- only load pickles from a trusted source.
# NOTE(review): absolute, machine-specific path; adjust to the local checkout before running.
data = pd.read_pickle('/home/code-base/gitrepo/Mixed/python/mixed_data/10nodes13edges/data_0.gpickle')
max_categories = 20  # only used when custom is False: columns with more unique values than this are typed 'cont'
constraint_based = MDEPCS  # MDEPCS(default- works with fcit only), MMPC
algo = 'hybrid' #'hybrid'(default), 'constraint', 'score'
# disc and bins are read by runner() whenever algo is 'constraint' or 'score', or when
# discretizer is unsupervised_discretization; leaving them undefined raises NameError on those paths.
disc = 'quantile' #For univariate only(quantile(default), uniform, kmeans)
bins = 5 #For univariate only: bin count handed to the univariate discretization (passed as n_bins to fges)
alpha = 0.1 # (0.05 for PC, MMPC) (0.1 for MDEPCS only)
ci_test = fast_conditional_ind_test # fast_conditional_ind_test(default), chi_square_test
verbose = False
max_processes = 40
discretizer = Data_Driven_Discretizer #Data_Driven_Discretizer(For CAIM, LAIM, AMEVA), unsupervised_discretization
disc_score = get_laim #get_mlameva(default), get_laim, get_caim, get_ameva
search_and_score = fges #For hybrid and score only,(fges(default)), hill_climb)
score = bdeu_score #For hybrid only(bdeu_score, bic_score_discrete)
result_score = bic_score_discrete  # score function applied to the final (dag, data) pair returned by runner()
custom = True  ## flag to specify type through cont_list or autoselect column types if false
cont_columns = [] ## specify list of continuous columns here

In [6]:
def runner(data):
    """Learn a DAG from ``data`` with the pipeline selected in the configuration cell.

    The function reads its settings from module-level names: ``algo``,
    ``custom``, ``cont_columns``, ``max_categories``, ``constraint_based``,
    ``alpha``, ``ci_test``, ``verbose``, ``max_processes``, ``discretizer``,
    ``disc_score``, ``search_and_score``, ``score``, ``result_score`` and, on
    the 'constraint' / 'score' / unsupervised-discretization paths, ``disc``
    and ``bins``.

    Args:
        data: pandas DataFrame of observations. It is not modified; all
            renaming and recoding happens on copies.

    Returns:
        tuple ``(dag, frame, graph_score)`` where
        dag : the result graph from the algo, relabelled with ``mappers[1]``
        frame : the data the graph was scored on, with columns renamed via
            ``mappers[1]`` (discretized data on the hybrid path; the
            integer-recoded input on the 'constraint' and 'score' paths)
        graph_score : ``result_score(dag, frame)``

    Raises:
        ValueError: on the hybrid path, if ``constraint_based`` is neither
            MDEPCS nor MMPC, or ``discretizer`` is neither
            unsupervised_discretization nor Data_Driven_Discretizer.
    """
    ######################################################    DATA PREPROCESSING    ######################################################

    args = nx.DiGraph()
    # presumably mappers[0] maps original -> internal column names and
    # mappers[1] maps them back; verify against column_mapping.
    mappers = column_mapping(data)
    # Rename on a copy: an in-place rename would leave the caller's frame with
    # the internal names, so a second runner(data) call would see different columns.
    data = data.rename(columns=mappers[0])
    args.add_nodes_from(list(data.columns))

    for col in data.columns:
        uniques = pd.unique(data[col])
        if custom:
            # Column types come from the user-supplied list of original names.
            is_continuous = mappers[1][col] in cont_columns
        else:
            # Auto-detect: many distinct values => treat as continuous.
            is_continuous = len(uniques) > max_categories

        if is_continuous:
            args.nodes[col]['type'] = 'cont'
            args.nodes[col]['num_categories'] = 'NA'
        else:
            args.nodes[col]['type'] = 'disc'
            args.nodes[col]['num_categories'] = len(uniques)
            # Recode the categories as 0..k-1 in order of first appearance.
            data = data.replace({col: uniques}, {col: list(range(uniques.shape[0]))})

    CBE = ConstraintBasedEstimator(data)

    def _finish(dag, frame):
        """Map graph nodes and frame columns through mappers[1] and score the result."""
        frame = frame.rename(columns=mappers[1])
        nx.relabel_nodes(dag, mappers[1], copy=False)
        return (dag, frame, result_score(dag, frame))

    if algo == 'constraint':
        dag = CBE.pdag_to_dag(pcalg(data, args.nodes(data=True), alpha, ci_test, disc, bins))
        return _finish(dag, data)
    elif algo == 'score':
        # NOTE(review): fges is hard-coded here although the configuration cell
        # documents search_and_score as applying to 'score' too -- confirm intent.
        dag = CBE.pdag_to_dag(fges(data, args.nodes(data=True), disc = disc, n_bins = bins, score = score))
        return _finish(dag, data)

    # Any other value of algo falls through to the hybrid pipeline below.
    # Fail fast on unsupported settings instead of after the skeleton search.
    if constraint_based not in (MDEPCS, MMPC):
        raise ValueError("constraint_based must be MDEPCS or MMPC, got %r" % (constraint_based,))
    if discretizer != unsupervised_discretization and discretizer != Data_Driven_Discretizer:
        raise ValueError(
            "discretizer must be unsupervised_discretization or Data_Driven_Discretizer, got %r"
            % (discretizer,)
        )

    ######################################################   SKELETON LEARNING   ######################################################
    disc_data = None
    if discretizer == unsupervised_discretization:
        cont_nodes = [name for name, attrs in args.nodes(data=True) if attrs['type'] == 'cont']
        disc_data = discretizer(data.copy(), cont_nodes, bins, disc)

    learner = constraint_based(data, args.nodes(data=True), alpha, ci_test, verbose, max_processes)
    if constraint_based == MDEPCS:
        skel = learner.mdepcs()
    else:
        skel = learner.mmpc()

    ######################################################   DISCRETIZATION   ######################################################

    if discretizer == Data_Driven_Discretizer:
        disc_data = discretizer(data.copy(), skel, args.nodes(data=True), alpha, max_processes, method=disc_score).discretize()

    ######################################################   SEARCH AND SCORE   ######################################################
    dag = CBE.pdag_to_dag(search_and_score(disc_data, args.nodes(data=True), score = score))
    return _finish(dag, disc_data)
        

In [None]:
# Run the pipeline with the settings from the configuration cell.
# result is the 3-tuple returned by runner: (dag, data, score).
result = runner(data)

Categorical []
# 0  GLOBAL CAIM  0.05354955699322432
Categorical []
# 0  GLOBAL CAIM  0.028294468085986825
Categorical []
# 0  GLOBAL CAIM  0.05579460993589193
Categorical []
# 0  GLOBAL CAIM  0.03037694369949622
Categorical []
# 0  GLOBAL CAIM  0.053089644498094994


In [9]:
# Report how many edges the learned graph (first element of result) has.
print(result[0].number_of_edges())

13
