In [1]:
import dowhy
from dowhy import CausalModel
import itertools
import glob
import os
# from rpy2.robjects import r as R
# %load_ext rpy2.ipython
import sqlite3
from cdt.causality.pairwise import CDS
import numpy as np
import pandas as pd
import graphviz
import networkx as nx
import itertools
from graphviz import Source
from spellchecker import SpellChecker
# Print NumPy floats with 3 decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(0)

Detecting 1 CUDA device(s).


In [2]:
def gen_graph_complete(df):
    """Build the concept -> {color, shape} -> y causal graph for one class.

    For every distinct (primary_concept, colors, shapes) combination in ``df``
    the following directed edges are added, with all node names lower-cased:

    * ``primary_concept_<c> -> y``
    * ``primary_concept_<c> -> color_<k>`` and ``color_<k> -> y``
    * ``primary_concept_<c> -> shape_<s>`` and ``shape_<s> -> y``

    A missing (NaN or empty-string) attribute contributes no node.  Previously
    only missing shapes were skipped, so a missing color or concept produced a
    placeholder node such as ``color_`` that is not named after any real value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``primary_concept``, ``colors`` and ``shapes``.

    Returns
    -------
    networkx.DiGraph
        The assembled graph; empty when ``df`` has no rows.
    """
    outcome = 'y'
    # add_edge is idempotent, so visiting each distinct combination once yields
    # the same graph as visiting every row, in the same first-seen order.
    combos = (
        df[['primary_concept', 'colors', 'shapes']]
        .replace(np.nan, '')
        .drop_duplicates()
    )

    graph = nx.DiGraph()
    for concept, color, shape in combos.itertuples(index=False, name=None):
        concept_node = f'primary_concept_{concept}'.lower() if concept != '' else None
        color_node = f'color_{color}'.lower() if color != '' else None
        shape_node = f'shape_{shape}'.lower() if shape != '' else None

        if concept_node is not None:
            graph.add_edge(concept_node, outcome)
        # Color and shape are both children of the concept and parents of y.
        for attribute_node in (color_node, shape_node):
            if attribute_node is None:
                continue
            if concept_node is not None:
                graph.add_edge(concept_node, attribute_node)
            graph.add_edge(attribute_node, outcome)

    return graph

In [3]:
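# Earlier graph builder without shape nodes. Its call site in the driver loop is
# also commented out in favour of gen_graph_complete; kept for reference only.
# def gen_graph(df):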
#     graph = nx.DiGraph()
#     for idx, row in df.iterrows():
#         obj = 'primary_concept_'+row['primary_concept']
#         color = 'color_'+row['colors']
#         label = 'y'
#         obj = obj.lower()
#         color = color.lower()
#         graph.add_edge(obj,label)
#         graph.add_edge(color,label)
#         graph.add_edge(obj,color)
#     return graph

In [10]:
def get_data(df):
    """One-hot encode the concept/color/shape columns for causal estimation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``primary_concept``, ``colors``, ``shapes`` and ``y``.
        A ``part`` column is dropped when present.

    Returns
    -------
    data : pandas.DataFrame
        ``df`` without ``part``, with the three categorical columns replaced by
        indicator columns prefixed ``primary_concept_``, ``color_`` and
        ``shape_``, and ``y`` converted to a numeric dtype.  Category values are
        lower-cased first so the column names match the lower-cased node names
        produced by ``gen_graph_complete``.  Empty strings and NaN yield no
        indicator column.
    df : pandas.DataFrame
        The input without ``part``; values are otherwise untouched.
    """
    categorical = ['primary_concept', 'colors', 'shapes']

    # errors='ignore': files without a 'part' column are processed the same way.
    df = df.drop(columns=['part'], errors='ignore')

    # Treat empty strings as missing so get_dummies skips them.
    temp = df.replace('', np.nan)
    for col in categorical:
        # map + isinstance (rather than .str.lower()) tolerates NaN and columns
        # that are entirely missing and therefore not string-typed.
        temp[col] = temp[col].map(lambda v: v.lower() if isinstance(v, str) else v)

    data = pd.get_dummies(
        temp,
        prefix=['primary_concept', 'color', 'shape'],
        columns=categorical,
    )
    data['y'] = pd.to_numeric(data['y'])
    return data, df

In [11]:
def causal_inference(data, filename_dot, G):
    """Estimate the effect of every graph node on ``y`` and rank the results.

    Each node of ``G`` other than the outcome is used in turn as the treatment
    of a DoWhy ``CausalModel`` built from ``data`` and the graph stored in
    ``filename_dot``; the effect is estimated with
    ``backdoor.linear_regression`` for treatment value 1 versus control value 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; needs a ``y`` column and one column per treatment node.
    filename_dot : str
        Path of the DOT file passed to ``CausalModel`` as ``graph``.
    G : networkx.DiGraph
        Graph whose nodes are iterated as candidate treatments.

    Returns
    -------
    pandas.DataFrame
        Columns ``treatment`` and ``score`` (the estimate rendered with
        ``str``), ordered from the largest to the smallest estimate.
    """
    def _descending_numeric(record):
        # Scores are stored as strings; comparing them as text would rank
        # '9e-05' above '0.5' and misorder negatives, so compare as numbers.
        try:
            score = float(record['score'])
        except (TypeError, ValueError):
            return (1, 0.0)  # e.g. 'None': keep at the end
        if np.isnan(score):
            return (1, 0.0)
        return (0, -score)

    results = []
    for col in G.nodes():
        # Skip the outcome itself and a node literally named backslash-n
        # (presumably a DOT round-trip artefact -- TODO confirm).
        if col == 'y' or col == '\\n':
            continue
        model = CausalModel(
            data=data,
            treatment=col,
            outcome='y',
            graph=filename_dot,
            missing_nodes_as_confounders=False,
        )
        identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
        estimate = model.estimate_effect(
            identified_estimand,
            method_name="backdoor.linear_regression",
            control_value=0,
            treatment_value=1,
            confidence_intervals=False,
            test_significance=False,
        )
        results.append({'treatment': col, 'score': str(estimate.value)})

    results.sort(key=_descending_numeric)
    # Explicit columns keep the header present even when nothing was estimated.
    return pd.DataFrame(results, columns=['treatment', 'score'])

In [12]:
def causal_mediation(data, filename_dot, G):
    """Decompose each node's effect on ``y`` into direct and indirect parts.

    Every node of ``G`` other than the outcome is used in turn as the
    treatment.  The natural direct effect (NDE) is always estimated; the
    natural indirect effect (NIE) is estimated only when the NDE estimand
    reports at least one mediator.  Both use ``mediation.two_stage_regression``
    with linear-regression stages.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; needs a ``y`` column and one column per treatment node.
    filename_dot : str
        Path of the DOT file passed to ``CausalModel`` as ``graph``.
    G : networkx.DiGraph
        Graph whose nodes are iterated as candidate treatments.

    Returns
    -------
    list of dict
        One record per treatment with a non-``None`` NDE, ordered by ``total``
        descending (NaN totals last).  Keys, always in this order:
        ``primary_concept`` (the treatment node name), ``nde``, ``mediator``
        (first reported mediator, or NaN), ``nie``, ``total`` and
        ``mediation_proportion`` (``nie / total``; NaN when there is no
        mediator, no NIE estimate, or ``total`` is zero).
    """
    linear = dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator
    estimate_kwargs = dict(
        method_name="mediation.two_stage_regression",
        confidence_intervals=False,
        test_significance=False,
        method_params={'first_stage_model': linear, 'second_stage_model': linear},
    )

    def _descending_total(record):
        # NaN or unparseable totals compare unpredictably; push them to the end.
        try:
            total = float(record['total'])
        except (TypeError, ValueError):
            return (1, 0.0)
        if np.isnan(total):
            return (1, 0.0)
        return (0, -total)

    results = []
    for col in G.nodes():
        # Skip the outcome itself and a node literally named backslash-n
        # (presumably a DOT round-trip artefact -- TODO confirm).
        if col == 'y' or col == '\\n':
            continue
        model = CausalModel(
            data=data,
            treatment=col,
            outcome='y',
            graph=filename_dot,
            missing_nodes_as_confounders=False,
        )

        identified_estimand_nde = model.identify_effect(
            estimand_type="nonparametric-nde", proceed_when_unidentifiable=True)
        identified_estimand_nie = model.identify_effect(
            estimand_type="nonparametric-nie", proceed_when_unidentifiable=True)
        mediators = identified_estimand_nde.get_mediator_variables()

        nde = model.estimate_effect(identified_estimand_nde, **estimate_kwargs).value
        if nde is None:
            continue

        nie = np.nan
        if len(mediators) > 0:
            nie_value = model.estimate_effect(identified_estimand_nie, **estimate_kwargs).value
            # A missing NIE estimate is recorded as NaN instead of crashing in float().
            if nie_value is not None:
                nie = nie_value

        if len(mediators) > 0 and nie is not np.nan:
            total = float(nie) + float(nde)
            # Effects that cancel exactly leave the proportion undefined.
            proportion = float(nie) / total if total != 0 else np.nan
        else:
            total = nde
            proportion = np.nan

        results.append({
            'primary_concept': col,
            'nde': nde,
            'mediator': mediators[0] if len(mediators) > 0 else np.nan,
            'nie': nie,
            'total': total,
            'mediation_proportion': proportion,
        })

    # The original computed this ordering and then returned the unsorted list.
    return sorted(results, key=_descending_total)

In [13]:
# Driver: for every per-class concept CSV, build its causal graph, estimate
# total and mediated effects, and write the results under bk_obj_color_shape/.
# A class is skipped when its effects CSV already exists.
GRAPH_DIR = os.path.join('bk_obj_color_shape', 'graphs')
EFFECTS_DIR = os.path.join('bk_obj_color_shape', 'effects')
MEDIATION_DIR = os.path.join('bk_obj_color_shape', 'mediation')
for out_dir in (GRAPH_DIR, EFFECTS_DIR, MEDIATION_DIR):
    # to_csv / write_dot do not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

for file_name in glob.glob('class_concepts_images/*.csv'):
    base_name = os.path.basename(file_name)
    graph_file = os.path.join(GRAPH_DIR, os.path.splitext(base_name)[0] + '.dot')
    result_file = os.path.join(EFFECTS_DIR, base_name)
    print(result_file)
    # os.path.exists, not glob.glob: file names containing [, ] , * or ? would
    # be interpreted as patterns by glob and could be misreported.
    if os.path.exists(result_file):
        continue

    result_file_mediation = os.path.join(MEDIATION_DIR, base_name)
    print(graph_file)
    # df1 / data / G stay at notebook scope; a later cell inspects df1.
    df = pd.read_csv(file_name)
    data, df1 = get_data(df)
    G = gen_graph_complete(df1)

    # An existing DOT file is reused as-is rather than overwritten.
    if not os.path.exists(graph_file):
        nx.drawing.nx_pydot.write_dot(G, graph_file)
    res = causal_inference(data, graph_file, G)
    res_mediation = causal_mediation(data, graph_file, G)
    mr = pd.DataFrame(res_mediation).replace('', np.nan)

    # Write the effects CSV last: its presence is the "already done" marker, so
    # an interruption before this point leaves the class eligible for a re-run.
    mr.to_csv(result_file_mediation, index=False)
    res.to_csv(result_file, index=False)
    print(graph_file, result_file)

bk_obj_color_shape/effects/squeezenet_imageneta_bias_ants.csv
bk_obj_color_shape/graphs/squeezenet_imageneta_bias_ants.dot


KeyboardInterrupt: 

In [9]:
df1.head()

Unnamed: 0,colors,y,id,primary_concept,shapes
0,brown,4.9e-05,1713,hive,
1,white,0.000575,1503,locust,
2,blue,0.000575,1503,locust,
3,darkgoldenrod,0.018543,1763,bee,
4,blue,0.018543,1763,bee,
