In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
import dowhy

In [3]:
from dowhy.do_why import CausalModel
import dowhy.datasets

In [4]:
# Seed NumPy's global RNG so the synthetic dataset is identical on every
# Restart & Run All. NOTE(review): this assumes dowhy's generator draws from
# np.random's global state -- confirm for the installed dowhy version.
np.random.seed(42)

# Generate a synthetic dataset with a binary treatment, beta=10,
# three common causes, two instruments and 10,000 samples.
data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=3,
    num_instruments=2,
    num_samples=10000,
    treatment_is_binary=True)

In [5]:
# List the entries of the dict returned by linear_dataset.
data.keys()

dict_keys(['df', 'treatment_name', 'outcome_name', 'common_causes_names', 'instrument_names', 'dot_graph', 'gml_graph', 'ate'])

In [6]:
# Look up the name under which the treatment variable is stored.
data['treatment_name']

'v'

In [7]:
# Look up the name under which the outcome variable is stored.
data['outcome_name']

'y'

In [8]:
# Look up the names of the common-cause variables (three were requested).
data['common_causes_names']

['X0', 'X1', 'X2']

In [9]:
# Preview the first rows of the generated DataFrame.
data['df'].head()

Unnamed: 0,Z0,Z1,X0,X1,X2,v,y
0,0.0,0.646023,2.47734,-0.591143,-1.35892,1.0,16.478453
1,0.0,0.289819,0.351553,-2.542156,-0.976012,0.0,-8.44771
2,0.0,0.557476,1.15706,-0.214114,-1.121894,1.0,12.617602
3,0.0,0.726191,-0.064254,1.870559,-0.614645,0.0,5.323792
4,0.0,0.114757,0.461602,-0.031149,-0.474104,0.0,1.195367


In [10]:
# Treatment name that is passed to CausalModel below.
# NOTE(review): this repeats the lookup done in an earlier cell.
data["treatment_name"]

'v'

In [None]:
# Outcome name that is passed to CausalModel below.
# NOTE(review): this repeats the lookup done in an earlier cell.
data["outcome_name"]

In [None]:
# Build the causal model from the generated frame, the treatment/outcome
# names and the GML graph that ship with the synthetic dataset.
model = CausalModel(
    graph=data["gml_graph"],
    data=data["df"],
    outcome=data["outcome_name"],
    treatment=data["treatment_name"],
)

In [None]:
# Identify the target estimand; the result is reused by the estimation
# and refutation cells below.
identified_estimand = model.identify_effect()

In [None]:
# Estimate the effect for the identified estimand using the
# "backdoor.propensity_score_matching" method.
estimate = model.estimate_effect(identified_estimand,
                                 method_name="backdoor.propensity_score_matching")

In [None]:
# Inspect which parameters were recorded on the estimate.
estimate.params.keys()

In [None]:
# Show the 'estimand_type' entry stored with the estimate.
estimate.params['estimand_type']

In [None]:
# Show the 'estimator_class' entry stored with the estimate.
estimate.params['estimator_class']

In [None]:
# Check the robustness of the estimate with dowhy's "random_common_cause"
# refuter (the only refutation method run in this cell).
refute_results = model.refute_estimate(identified_estimand, estimate,
                                       method_name="random_common_cause")
# Display the refutation outcome; it was previously assigned but never shown.
print(refute_results)

In [None]:
# Visualize the causal model.
# NOTE(review): presumably draws the graph passed to CausalModel -- confirm against the dowhy docs.
model.view_model()

## Load Datasets

In [None]:
# List the dataset directory (long format, hidden files, human-readable sizes).
!ls -lah ../../../datasets/doleta

In [16]:
# Directory prefix (relative to this notebook) that is joined with file names when loading data.
dataset_path = '../../../datasets/doleta/'

In [19]:
# Columns of interest, keyed by column name.
# NOTE(review): the values look like spreadsheet column letters for each
# field -- confirm against the workbook.
dict_cols = {
    "EMPLOYER_NAME": "I",
    "EMPLOYER_ADDRESS_1": "J",
    "EMPLOYER_CITY": "L",
    "EMPLOYER_STATE": "M",
    "EMPLOYER_NUM_EMPLOYEES": "R",
    "EMPLOYER_YR_ESTAB": "S",
    "PW_SOC_CODE": "Y",
    "PW_SOC_TITLE": "Z",
    "PW_LEVEL_9089": "AA",
    "PW_AMOUNT_9089": "AB",
    "PW_UNIT_OF_PAY_9089": "AC",
    "WAGE_OFFER_FROM_9089": "AH",
    "WAGE_OFFER_TO_9089": "AI",
    "JOB_INFO_WORK_CITY": "AK",
    "JOB_INFO_WORK_STATE": "AL",
    "JOB_INFO_JOB_TITLE": "AN",
    "JOB_INFO_EDUCATION": "AO",
    "JOB_INFO_MAJOR": "AQ",
    "COUNTRY_OF_CITIZENSHIP": "DD",
    "FOREIGN_WORKER_INFO_EDUCATION": "DG",
    "FOREIGN_WORKER_INFO_MAJOR": "DI",
    "FW_INFO_YR_REL_EDU_COMPLETED": "DJ",
    "EMPLOYER_DECL_INFO_TITLE": "DR",
    "NAICS_US_CODE": "DS",
    "PW_JOB_TITLE_9089": "DU",
}

In [22]:
# Collect the column names (the keys of dict_cols) in insertion order and display them.
selected_cols = [column_name for column_name in dict_cols]
selected_cols

['EMPLOYER_NAME',
 'EMPLOYER_ADDRESS_1',
 'EMPLOYER_CITY',
 'EMPLOYER_STATE',
 'EMPLOYER_NUM_EMPLOYEES',
 'EMPLOYER_YR_ESTAB',
 'PW_SOC_CODE',
 'PW_SOC_TITLE',
 'PW_LEVEL_9089',
 'PW_AMOUNT_9089',
 'PW_UNIT_OF_PAY_9089',
 'WAGE_OFFER_FROM_9089',
 'WAGE_OFFER_TO_9089',
 'JOB_INFO_WORK_CITY',
 'JOB_INFO_WORK_STATE',
 'JOB_INFO_JOB_TITLE',
 'JOB_INFO_EDUCATION',
 'JOB_INFO_MAJOR',
 'COUNTRY_OF_CITIZENSHIP',
 'FOREIGN_WORKER_INFO_EDUCATION',
 'FOREIGN_WORKER_INFO_MAJOR',
 'FW_INFO_YR_REL_EDU_COMPLETED',
 'EMPLOYER_DECL_INFO_TITLE',
 'NAICS_US_CODE',
 'PW_JOB_TITLE_9089']

In [29]:
# Load the FY2019 PERM disclosure workbook into a DataFrame.
# The redundant chained assignment (perm_2019 = perm_2019 = ...) was removed.
perm_2019 = pd.read_excel(dataset_path + 'PERM_Disclosure_Data_FY2019.xlsx')

In [60]:
# Starting row label for the ten records printed in the next cell.
x = 50

In [66]:
# Print selected fields for ten consecutive records, starting at row label x.
# The wage is stored as text with thousands separators, so the commas are
# stripped before converting; the employee count and graduation year are cast to int.
for i in range(x, x + 10):
    print(float(perm_2019.loc[i, 'WAGE_OFFER_FROM_9089'].replace(",", "")))
    print(perm_2019.loc[i, 'JOB_INFO_JOB_TITLE'])
    print(int(perm_2019.loc[i, 'EMPLOYER_NUM_EMPLOYEES']))
    print(perm_2019.loc[i, 'EMPLOYER_NAME'])
    print(perm_2019.loc[i, 'EMPLOYER_CITY'])
    print(perm_2019.loc[i, 'JOB_INFO_WORK_CITY'])
    print(perm_2019.loc[i, 'JOB_INFO_WORK_STATE'])
    print(perm_2019.loc[i, 'PW_JOB_TITLE_9089'])
    print(int(perm_2019.loc[i, 'FW_INFO_YR_REL_EDU_COMPLETED']))
    print(perm_2019.loc[i, 'FOREIGN_WORKER_INFO_EDUCATION'])
    print("---" * 10)

124000.0
ASIC Engineer
6400
NVIDIA CORPORATION
SANTA CLARA
Santa Clara
CALIFORNIA
Electronics Engineers, Except Computer
2016
Master's
------------------------------
91000.0
Business Analyst
135
INTONE NETWORKS INC
ISELIN
Iselin
NEW JERSEY
Computer Systems Analysts
2008
Master's
------------------------------
130000.0
Architect
6400
NVIDIA CORPORATION
SANTA CLARA
Santa Clara
CALIFORNIA
Electronics Engineers, Except Computer
2015
Master's
------------------------------
70325.0
FINANCIAL ANALYST
5
ESSCALA TRADING, LLC
BUDD LAKE
BUDD LAKE
NEW JERSEY
ACCOUNTANTS AND AUDITORS
2006
Bachelor's
------------------------------
237500.0
Manager, Senior Hardware Engineer
6400
NVIDIA CORPORATION
SANTA CLARA
Santa Clara
CALIFORNIA
Architectural and Engineering Managers
2009
Master's
------------------------------
38298.0
Resource Specialist
4000
LODI UNIFIED SCHOOL DISTRICT
LODI
Lodi
CALIFORNIA
Special Education Teachers, Secondary School
1990
Bachelor's
------------------------------
155000.0
Sr. M

In [70]:
# Pull out the wage-offer column on its own.
# NOTE(review): despite the name, this selects a single column (a Series), not a DataFrame.
dataframe = perm_2019['WAGE_OFFER_FROM_9089']

In [71]:
# Display the wage-offer column; the values are comma-formatted text and mix
# very small and very large amounts.
# NOTE(review): consider .head() / .describe() instead of showing the whole column.
dataframe

0         32,053.00
1         77,459.00
2         41,746.00
3         15,579.00
4             15.33
5             12.83
6         47,154.00
7         83,100.00
8         42,245.00
9         42,245.00
10        43,437.00
11       109,100.00
12        69,805.00
13        94,890.00
14            35.00
15            34.00
16        98,556.88
17        72,000.00
18            35.00
19        91,500.00
20        38,460.00
21            19.00
22        85,600.00
23        70,000.00
24        74,900.00
25        89,150.00
26       107,000.00
27        91,478.40
28        75,649.60
29       100,713.60
            ...    
77845    157,706.00
77846     30,217.00
77847    101,150.00
77848    101,005.00
77849         40.00
77850     99,382.00
77851         12.25
77852     28,080.00
77853     21,840.00
77854         12.25
77855     40,227.00
77856     66,500.00
77857     55,000.00
77858     80,000.00
77859     29,433.00
77860     20,218.00
77861     99,000.00
77862     37,170.00
77863     37,648.00
