In [1]:
import pandas as pd
import torch
import random
import os
from torch.distributions.binomial import Binomial
from torch.distributions.bernoulli import Bernoulli
import torch.nn as nn



In [58]:
def treat_portion(x):
    x = x.replace(' [SEP] ', '').strip()
    token_list = x.split(' ')
    treat_cnt = 0
    if len(token_list) > 0:
        for token in token_list:
            if 'diag' not in token:
                treat_cnt += 1
    
    score = (treat_cnt + 1e-8)/ (len(token_list) + 1e-8)
    score = max(score, 0.001)
    score = min(score, 0.999)

    return score

DATA_PATH = '/nfs/turbo/lsa-regier/emr-data/'
groups = [str(i) for i in range(10)]
beta_1 = torch.tensor([1.0, 10.0, 100.0])

for group in groups:
    data_path = os.path.join(DATA_PATH, f'group_{group}_merged.csv')
    data = pd.read_csv(data_path, sep=',')
    
    ### calculate propensity score pi(z)
    data = data.assign(base_propensity_score = data['document'].apply(lambda x: treat_portion(x)))
    
    ### simulate treatment based on propensity score
    all_treatments = Binomial(1, torch.tensor(data["base_propensity_score"].values))
    data = data.assign(treatment = all_treatments.sample())
    
    ### simulate response from treatment and propensity score
    treatment_propensity = 0.25 * data['treatment'] + beta_1[0] * (data['base_propensity_score'] - 0.2)
    softmax_fn = torch.nn.Softmax()
    input = torch.tensor(treatment_propensity.values)
    reponse_prob = softmax_fn(input)
    all_response = torch.bernoulli(reponse_prob)
    data = data.assign(response = all_response)
    break



In [59]:
data["response"].sum()

0.0

In [60]:
data["treatment"].sum()

22476.0

In [61]:
data


Unnamed: 0,patid,document,base_propensity_score,treatment,response
0,560499201141940,SIMVASTATIN METOPROLOL_SUCCINATE LISINOPRIL ES...,0.725971,1.0,0.0
1,560499201299620,icd:9_diag:7822 [SEP] icd:9_diag:V700 icd:9_di...,0.001000,0.0,0.0
2,560499202033650,icd:10_diag:M7051 icd:10_diag:M7052,0.001000,0.0,0.0
3,560499202033740,icd:9_diag:V7612 icd:9_diag:V7612 [SEP] icd:9_...,0.076923,0.0,0.0
4,560499202037510,icd:9_diag:5246 icd:9_diag:83901 [SEP] icd:9_d...,0.164063,0.0,0.0
...,...,...,...,...,...
200526,560499899998870,icd:9_diag:8472 CYCLOBENZAPRINE_HCL [SEP] icd:...,0.001000,0.0,0.0
200527,560499899999040,icd:9_diag:41401 icd:9_diag:71536 icd:9_diag:7...,0.063712,0.0,0.0
200528,560499899999220,icd:9_diag:V202 [SEP] icd:9_diag:38870 NEOMY_S...,0.001000,0.0,0.0
200529,560499899999970,AMOX_TR/POTASSIUM_CLAVULANATE GUAIFENESIN/CODE...,0.205128,1.0,0.0


In [24]:
testdata = data[0:9]

In [36]:
treatment_propensity = 0.25 * testdata['treatment'] + beta_1 *(testdata['base_propensity_score'] - 0.2)
softmax_fn = torch.nn.Softmax()
input = torch.tensor(treatment_propensity.values)
reponse_prob = softmax_fn(input)
all_response = torch.bernoulli(reponse_prob)
all_response

  after removing the cwd from sys.path.


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)