In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import dowhy
from scipy import stats
from dowhy import CausalModel
from collections import Counter
import networkx as nx
import fileinput
import numpy as np
import pandas as pd
import random
import math
import networkx as nx
from scipy.special import softmax
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.linear_model import LassoCV
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LassoCV
from econml.inference import BootstrapInference
plt.rcParams['figure.figsize'] = [15, 10]
plt.style.use("default")
from sklearn.linear_model import LinearRegression

## We generate the data and create custom relationship between the label and collected annotation
# In this case we create the following equations- 
*** 
$ bed=p^k(1-p)^k $ <br>

$black = bed * \mathcal{N}(0.5,0.1) +\xi $<br>

$shape = bed * \mathcal{N}(0.5,0.1) + \xi$

$y = \beta_1*bed + \beta_2*shape + \beta_3*color +\xi$
***

In [None]:
plt.style.use("default")
plt.rcParams['figure.figsize'] = [15, 10]
model.view_model()

In [2]:
def generate_data(binarize=False):
    np.random.seed(1223)
    sample_size = 50
    beta1=3
    beta2=1.5
    beta3=5
    # beta4 = 1.3
    alpha1=0.01
    alpha2=0.02
    alpha3=0.005
    #binary values for bed 0 - absent, 1 - present
    bed = np.random.choice([0, 1], size=(sample_size,))
    # binary values for shape 0 - absent, 1 - present
    rect = bed * np.random.choice([0, 1], size=(sample_size,)) + np.random.normal(0.5,0.1,size=sample_size)
    # binary values for shape 0 - absent, 1 - present
    black = bed * np.random.uniform(0, 1, sample_size) + np.random.normal(0.5,0.1,size=sample_size)
    # mean of all pixel values with a high weightage of  error to account for other parameters
    # img_mean = alpha1 * bed + alpha2 * rect + alpha3 * black +  3 * np.random.normal(0.5,0.1,size=sample_size)
    y = beta1 * bed + beta2 * rect + beta3 * black + np.random.normal(0.5,0.1,size=sample_size)
    # y = np.maximum(y, 0)
    # y = [sigmoid(x) for x in y]
    df = pd.DataFrame({'obj_bed':bed,'color_black':black, 'shape_rectangle':rect, 'label_bedroom':y})
    if binarize:
        df['label_bedroom'] = (df['label_bedroom'] < df['label_bedroom'].mean() ).astype(int)
    return df

In [3]:
df = generate_data(binarize=False)

In [4]:
df.describe()

Unnamed: 0,obj_bed,color_black,shape_rectangle,label_bedroom
count,50.0,50.0,50.0,50.0
mean,0.5,0.763972,0.792365,7.008801
std,0.505076,0.341555,0.462353,3.463681
min,0.0,0.378658,0.265815,3.116309
25%,0.0,0.499547,0.470658,3.819654
50%,0.5,0.624463,0.553981,5.540178
75%,1.0,0.988758,1.392142,10.252892
max,1.0,1.758368,1.603324,14.559708


## What happens when you try to conclude from annotated data

In [84]:
def estimate_causal_effect(Xt, y, model=LinearRegression(), treatment_idx=0, regression_coef=False):
    model.fit(Xt, y)
    if regression_coef:
        return model.coef_[treatment_idx]
    else:
        Xt1 = pd.DataFrame.copy(Xt)
        Xt1[Xt.columns[treatment_idx]] = 1 #set all values of treatment variable to 1
        Xt0 = pd.DataFrame.copy(Xt)
        Xt0[Xt.columns[treatment_idx]] = 0 #set all values of treatment variable to 0
        return (model.predict(Xt1) - model.predict(Xt0)).mean()


# The above function will be replaced by Anitej's method. The interventions (i.e replacing treatment variable with 1 or 0 will be done by generating images with or without the concept.
***
For example - 

1) We capture all annotations. Then we identify a single concept as treatment variable (our point of interest - could be that we want to establish the causal impact of the concept bed then we will treat the bed concept as a treatment variable). 

2) Then we keep the confounders same/similar (so color, shape etc and then remove the bed! we generate bunch of images without bed (do(bed) = 0) and with bed (do(bed) = 1) and capture the output of the model. Subtract the model predictions Y|do(bed=1) - Y|do(bed=0)


3) Now we do repeat it for other concepts, since we work on the assumption that everything is a confounder, given the nature of our study we can safely condition on all the variables (no collider bias)

## You see the bed as an annotation and conclude that bed causes the label bedroom

In [127]:
ate_est_naive = estimate_causal_effect(df[['obj_bed']], df['label_bedroom'], treatment_idx=0,
                                           regression_coef=False)
print('original estimation=3, predicted estimation='+str(ate_est_naive))

original estimation=3, predicted estimation=6.3638380600145945


## You see the bed as an annotation and conclude that color causes the label bedroom

In [131]:
ate_est_naive = estimate_causal_effect(df[['color_black']], df['label_bedroom'], treatment_idx=0,
                                           regression_coef=True)
print('original estimation=1.5, predicted estimation='+str(ate_est_naive))

original estimation=1.5, predicted estimation=9.193332749253038


## You see the bed as an annotation and conclude that shape causes the label bedroom

In [135]:
ate_est_naive = estimate_causal_effect(df[['shape_rectangle']], df['label_bedroom'], treatment_idx=0,
                                           regression_coef=True)
print('original estimation=5, predicted estimation='+str(ate_est_naive))

original estimation=5, predicted estimation=5.375885275977536


## Actual contribution of shape(1.5) when considering for all the confounders

In [134]:
ate_est_naive = estimate_causal_effect(df[['color_black','shape_rectangle','obj_bed']], df['label_bedroom'], treatment_idx=1,
                                           regression_coef=True)
print('original estimation=1.5, predicted estimation='+str(ate_est_naive))

original estimation=1.5, predicted estimation=1.524893983779829
