In [1]:
import os
import time
import re
import numpy as np
import torch
import pandas as pd
import random
from torch.distributions.binomial import Binomial
from torch.distributions.bernoulli import Bernoulli
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from transformers import DataCollatorForLanguageModeling, BertForMaskedLM
from transformers import Trainer, TrainingArguments

from tokens import WordLevelBertTokenizer
from vocab import create_vocab
from data import CausalBertDataset, MLMDataset
from causal_bert import CausalBert

In [2]:
# Pin this process to one physical GPU. With CUDA_DEVICE_ORDER=PCI_BUS_ID the
# index in CUDA_VISIBLE_DEVICES refers to the PCI bus ordering of the devices.
# NOTE(review): these variables are only honoured if set before CUDA is
# initialised in this process — keep this cell ahead of any CUDA work.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": '3',
})

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Asthma codes

J4520   Mild intermittent asthma, uncomplicated

J4521   Mild intermittent asthma with (acute) exacerbation

J4522   Mild intermittent asthma with status asthmaticus

J4530   Mild persistent asthma, uncomplicated

J4531   Mild persistent asthma with (acute) exacerbation

J4532   Mild persistent asthma with status asthmaticus

J4540   Moderate persistent asthma, uncomplicated

J4541   Moderate persistent asthma with (acute) exacerbation

J4542   Moderate persistent asthma with status asthmaticus

J4550   Severe persistent asthma, uncomplicated

J4551   Severe persistent asthma with (acute) exacerbation

J4552   Severe persistent asthma with status asthmaticus

J45901  Unspecified asthma with (acute) exacerbation

J45902  Unspecified asthma with status asthmaticus

J45909  Unspecified asthma, uncomplicated

J45991  Cough variant asthma

J45998  Other asthma

T486X1A Poisoning by antiasthmatics, accidental (unintentional), initial encounter

T486X1D Poisoning by antiasthmatics, accidental (unintentional), subsequent encounter

T486X1S Poisoning by antiasthmatics, accidental (unintentional), sequela

T486X2A Poisoning by antiasthmatics, intentional self-harm, initial encounter

T486X2D Poisoning by antiasthmatics, intentional self-harm, subsequent encounter

T486X2S Poisoning by antiasthmatics, intentional self-harm, sequela

T486X3A Poisoning by antiasthmatics, assault, initial encounter

T486X3D Poisoning by antiasthmatics, assault, subsequent encounter

T486X3S Poisoning by antiasthmatics, assault, sequela

T486X4A Poisoning by antiasthmatics, undetermined, initial encounter

T486X4D Poisoning by antiasthmatics, undetermined, subsequent encounter

T486X4S Poisoning by antiasthmatics, undetermined, sequela

T486X5A Adverse effect of antiasthmatics, initial encounter

T486X5D Adverse effect of antiasthmatics, subsequent encounter

T486X5S Adverse effect of antiasthmatics, sequela

T486X6A Underdosing of antiasthmatics, initial encounter

T486X6D Underdosing of antiasthmatics, subsequent encounter

T486X6S Underdosing of antiasthmatics, sequela

Z825    Family history of asthma and other chronic lower respiratory diseases


In [3]:
# Build the vocabulary with the merged / uni-diagnosis options enabled, then
# wrap it in the project's word-level BERT tokenizer.
# NOTE(review): the exact meaning of `merged` and `uni_diag` is defined in
# vocab.create_vocab — confirm there.
vocab = create_vocab(uni_diag=True, merged=True)
tokenizer = WordLevelBertTokenizer(vocab)

In [4]:
# Load the causal-BERT dataset used for the rest of the notebook.
# NOTE(review): the semantics of `group=[9]`, the length bounds and
# `truncate_method='first'` are defined by data.CausalBertDataset — confirm
# there which cohort and truncation behaviour these select.
dataset = CausalBertDataset(
    tokenizer=tokenizer,
    data_type='merged',
    is_unidiag=True,
    group=[9],
    max_length=512,
    min_length=10,
    truncate_method='first',
    device=device,
)

In [5]:
def _to_numpy(tensor):
    """Return ``tensor`` as a host-side NumPy array with size-1 dimensions removed.

    ``.detach()`` is used rather than the legacy ``.data`` attribute: both drop
    the tensor from the autograd graph, but ``.detach()`` is the supported API
    and keeps autograd's in-place modification checks intact.
    """
    return tensor.detach().cpu().numpy().squeeze()


# For a 0/1 treatment indicator these select, per unit, which of the observed
# response and the pseudo response plays the role of each potential outcome.
# Outcome under treatment: observed response where treatment == 1,
# pseudo response where treatment == 0.
Q1 = _to_numpy(
    dataset.treatment * dataset.response
    + (1 - dataset.treatment) * dataset.pseudo_response
)

# Outcome under control: pseudo response where treatment == 1,
# observed response where treatment == 0.
Q0 = _to_numpy(
    dataset.treatment * dataset.pseudo_response
    + (1 - dataset.treatment) * dataset.response
)

treatment = _to_numpy(dataset.treatment)
prop_score = _to_numpy(dataset.prop_score)

In [6]:
# Stack the four per-unit arrays as labelled rows, then transpose so that each
# array becomes a named column and each unit becomes one row.
data = pd.DataFrame(
    [prop_score, treatment, Q1, Q0],
    index=['propensity_scores', 'treatment', 'potential_outcome_1', 'potential_outcome_0'],
).T

In [7]:
# Show the assembled per-unit table (pandas elides the middle rows of a long frame).
data

Unnamed: 0,propensity_scores,treatment,potential_outcome_1,potential_outcome_0
0,0.2,0.0,1.0,0.0
1,0.2,0.0,1.0,0.0
2,0.2,0.0,1.0,0.0
3,0.2,1.0,1.0,1.0
4,0.2,1.0,1.0,0.0
...,...,...,...,...
191765,0.2,0.0,1.0,0.0
191766,0.2,0.0,1.0,0.0
191767,0.2,0.0,1.0,0.0
191768,0.2,0.0,0.0,1.0


In [8]:
# Largest propensity score present in the sample.
data.loc[:, "propensity_scores"].max()

0.800000011920929

In [10]:
# Mean over all units of (potential_outcome_1 - potential_outcome_0).
# NOTE(review): "SATE" presumably stands for sample average treatment effect —
# confirm; it relies on the pseudo responses standing in for counterfactuals.
SATE = data["potential_outcome_1"].sub(data["potential_outcome_0"]).mean()
SATE

0.05901340147051155