In [1]:
import dowhy
from dowhy import CausalModel
import itertools
import glob
import os
from rpy2.robjects import r as R
%load_ext rpy2.ipython
import sqlite3
from cdt.causality.pairwise import CDS
import numpy as np
import pandas as pd
import graphviz
import networkx as nx
import itertools
from graphviz import Source
from spellchecker import SpellChecker
# Print NumPy floats with 3 digits of precision and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Seed NumPy's global (legacy) random generator; this only affects code that
# draws from that global generator.
np.random.seed(0)

In [6]:
def pairwise(inp):
    """Lazily yield every unordered pair of entries of ``inp`` as a 2-item dict.

    ``inp`` only needs an ``items()`` method returning ``(key, value)`` tuples
    (a plain dict works). Pairs come out in ``itertools.combinations`` order,
    i.e. following the order in which ``inp.items()`` lists the entries.
    """
    entries = inp.items()
    yield from map(dict, itertools.combinations(entries, 2))

In [2]:
def get_data(df, include_part=False, include_shape=False):
    """One-hot encode the concept annotations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``part``, ``primary_concept``, ``colors`` and
        ``y`` (and ``shapes`` when ``include_shape`` is true). The frame passed
        in is never modified.
    include_part : bool, default False
        When true, the ``part`` value is folded into ``primary_concept`` as
        ``"<part>_of_<primary_concept>"``; rows whose part is NaN keep the bare
        concept. Every comma in the combined string becomes ``_of_``.
    include_shape : bool, default False
        When true, ``shapes`` is one-hot encoded (prefix ``shape``) in addition
        to ``primary_concept`` and ``colors``. Columns that are not listed for
        encoding are passed through unchanged.

    Returns
    -------
    (data, df) : tuple of pandas.DataFrame
        ``data`` is the one-hot encoded frame with ``y`` converted to numeric;
        ``df`` is the (optionally part-merged) frame without the ``part`` column.
    """
    if include_part:
        # Work on a copy: the column assignments below would otherwise rewrite
        # the caller's DataFrame in place.
        df = df.copy()
        df['part'] = df['part'].replace(np.nan, '')
        df['primary_concept'] = df['part'] + ',' + df['primary_concept']
        # An empty part leaves a leading comma; strip it before substituting.
        df['primary_concept'] = df['primary_concept'].str.strip(',')
        df['primary_concept'] = df['primary_concept'].replace(',', '_of_', regex=True)

    df = df.drop(columns=['part'])
    # Empty strings become NaN so get_dummies does not emit a column for them.
    temp = df.replace('', np.nan)
    if include_shape:
        data = pd.get_dummies(temp, prefix=['primary_concept', 'color', 'shape'],
                              columns=['primary_concept', 'colors', 'shapes'])
    else:
        data = pd.get_dummies(temp, prefix=['primary_concept', 'color'],
                              columns=['primary_concept', 'colors'])

    data['y'] = pd.to_numeric(data['y'])
    return data, df

In [3]:
def gen_graph(df):
    """Turn a table of scored variable pairs into a directed graph.

    Each row of ``df`` supplies two node names (``pair1``, ``pair2``) and a
    signed ``causal`` score. A positive score orients the edge
    ``pair1 -> pair2`` and a negative score orients it ``pair2 -> pair1``,
    except that an edge touching the node ``'y'`` is always pointed *into*
    ``'y'``. Rows whose score is neither positive nor negative add no edge.

    Returns the resulting ``networkx.DiGraph``.
    """
    graph = nx.DiGraph()
    for _, record in df.iterrows():
        first = record['pair1']
        second = record['pair2']
        score = record['causal']
        if score > 0:
            # Forward direction, flipped only when the source would be 'y'.
            source, target = (second, first) if first == 'y' else (first, second)
        elif score < 0:
            if second == 'y':
                print('y<0')
                source, target = first, second
            else:
                source, target = second, first
        else:
            continue
        graph.add_edge(source, target)
    return graph

In [4]:
def _score_sort_key(entry):
    """Numeric sort key for a result row; unparseable or NaN scores rank last."""
    try:
        value = float(entry['score'])
    except (TypeError, ValueError):
        return float('-inf')
    return float('-inf') if np.isnan(value) else value


def causal_inference(data, filename_dot, G):
    """Estimate the effect of every graph node on ``'y'`` and rank the nodes.

    For each node of ``G`` other than ``'y'``, a DoWhy ``CausalModel`` is built
    on ``data`` with that node as treatment, ``'y'`` as outcome and the graph
    read from ``filename_dot``. The effect is estimated with
    ``backdoor.linear_regression`` for treatment value 1 versus control value 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; passed straight to ``CausalModel``.
    filename_dot : str
        Graph specification handed to ``CausalModel(graph=...)``.
    G : graph object exposing ``nodes()``
        Supplies the candidate treatment names.

    Returns
    -------
    pandas.DataFrame
        Columns ``treatment`` and ``score`` (the estimate rendered as a
        string), ordered by descending numeric score. The frame has these two
        columns even when there are no treatments.
    """
    results = []
    for col in G.nodes():
        # 'y' is the outcome. The literal backslash-n name presumably guards
        # against a stray node that DOT parsing can introduce -- TODO confirm.
        if col == 'y' or col == '\\n':
            continue
        model = CausalModel(
            data=data,
            treatment=col,
            outcome='y',
            graph=filename_dot,
            missing_nodes_as_confounders=False,
        )
        identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
        estimate = model.estimate_effect(
            identified_estimand,
            method_name="backdoor.linear_regression",
            control_value=0,
            treatment_value=1,
            confidence_intervals=False,
            test_significance=False,
        )
        results.append({'treatment': col, 'score': str(estimate.value)})

    # Scores are stored as strings; rank on their numeric value, because a
    # plain string sort would put '9.0' above '10.0' and misorder negatives.
    ranked = sorted(results, key=_score_sort_key, reverse=True)
    return pd.DataFrame(ranked, columns=['treatment', 'score'])

In [23]:
def _estimate_two_stage(model, estimand):
    """Run DoWhy's two-stage-regression mediation estimator with linear stages."""
    linear = dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator
    return model.estimate_effect(
        estimand,
        method_name="mediation.two_stage_regression",
        confidence_intervals=False,
        test_significance=False,
        # A fresh dict per call, so the two estimations never share state.
        method_params={
            'first_stage_model': linear,
            'second_stage_model': linear,
        },
    )


def _total_sort_key(entry):
    """Numeric sort key on ``entry['total']``; NaN or non-numeric totals rank last."""
    try:
        value = float(entry['total'])
    except (TypeError, ValueError):
        return float('-inf')
    return float('-inf') if np.isnan(value) else value


def causal_mediation(data, filename_dot, G):
    """Decompose each node's effect on ``'y'`` into direct and indirect parts.

    For every node of ``G`` other than ``'y'``, a DoWhy ``CausalModel`` is built
    (graph read from ``filename_dot``) and the natural direct effect (NDE) is
    estimated with two-stage regression. When the NDE estimand reports at least
    one mediator variable, the natural indirect effect (NIE) is estimated too.

    Returns
    -------
    list of dict
        One dict per node whose NDE estimate is not ``None``, with keys
        ``primary_concept``, ``nde``, ``mediator`` (first reported mediator, or
        NaN), ``nie``, ``total`` (``nie + nde``, or just ``nde`` without a
        mediator) and ``mediation_proportion`` (``nie / total``; NaN without a
        mediator or when ``total`` is zero). The list is ordered by descending
        ``total``.
    """
    results = []
    for col in G.nodes():
        # 'y' is the outcome. The literal backslash-n name presumably guards
        # against a stray node that DOT parsing can introduce -- TODO confirm.
        if col == 'y' or col == '\\n':
            continue
        tmp = {}
        tmp['primary_concept'] = col
        model = CausalModel(
            data=data,
            treatment=col,
            outcome='y',
            graph=filename_dot,
            missing_nodes_as_confounders=False,
        )

        identified_estimand_nde = model.identify_effect(
            estimand_type="nonparametric-nde",
            proceed_when_unidentifiable=True,
        )
        identified_estimand_nie = model.identify_effect(
            estimand_type="nonparametric-nie",
            proceed_when_unidentifiable=True,
        )
        mediator = identified_estimand_nde.get_mediator_variables()

        tmp['nde'] = _estimate_two_stage(model, identified_estimand_nde).value
        if tmp['nde'] is None:
            continue

        if len(mediator) > 0:
            tmp['mediator'] = mediator[0]
            tmp['nie'] = _estimate_two_stage(model, identified_estimand_nie).value
            tmp['total'] = float(tmp['nie']) + float(tmp['nde'])
            # Direct and indirect effects can cancel exactly; avoid dividing by zero.
            if tmp['total'] != 0:
                tmp['mediation_proportion'] = float(tmp['nie']) / tmp['total']
            else:
                tmp['mediation_proportion'] = np.nan
        else:
            tmp['mediator'] = np.nan
            tmp['total'] = tmp['nde']
            tmp['mediation_proportion'] = np.nan
            tmp['nie'] = np.nan
        results.append(tmp)

    # Return the ranked list (the ranking used to be computed and then dropped).
    return sorted(results, key=_total_sort_key, reverse=True)

In [None]:
# Batch driver: for every concept CSV that has no effects file yet, score all
# column pairs with CDS, build a directed graph from a slice of the scored
# pairs, and write the per-node effect estimates on 'y' to a CSV.
for file_name in glob.glob('class_concepts_images/*.csv'):
    graph_file = ('cds_obj_color_shape/graphs/'
                  + os.path.splitext(os.path.basename(file_name))[0] + '.dot')
    result_file = 'cds_obj_color_shape/effects/' + os.path.basename(file_name)
    print(result_file)
    # os.path.exists, not glob.glob: a file name containing '[', '*' or '?'
    # would be treated as a pattern by glob and the check would misfire.
    if not os.path.exists(result_file):

        print(graph_file)
        df = pd.read_csv(file_name)
        data, _ = get_data(df)
        t = list(pairwise(data))
        # One row per column pair: both column names plus both columns' values.
        tmp = pd.DataFrame(
            [(name_a, name_b, np.array(values_a), np.array(values_b))
             for (name_a, values_a), (name_b, values_b) in (pair.items() for pair in t)],
            columns=['pair1', 'pair2', 'A', 'B'])

        obj = CDS()
        output = obj.predict(tmp[['A', 'B']])
        tmp['causal'] = output
        tmp = tmp[tmp.causal != 0]
        # NOTE(review): head/tail take the first and last 25% of rows in pair
        # order -- `tmp` is not sorted by score here. Confirm this is intended.
        samples = int(len(tmp) * 0.25)
        sliced_df = pd.concat([tmp.head(samples), tmp.tail(samples)])
        G = gen_graph(sliced_df)
        # NOTE(review): an existing .dot file is reused as-is even though G was
        # just rebuilt; the two can disagree if the inputs changed.
        if not os.path.exists(graph_file):
            os.makedirs(os.path.dirname(graph_file), exist_ok=True)
            nx.drawing.nx_pydot.write_dot(G, graph_file)
        res = causal_inference(data, graph_file, G)
        os.makedirs(os.path.dirname(result_file), exist_ok=True)
        res.to_csv(result_file, index=False)
    print(graph_file, result_file)