In [1]:
import pandas as pd
import numpy as np
import random
import math

In [2]:
# Seed BOTH random number generators used in this notebook.
# NumPy's global generator drives ages, sex, the x-variables and the noise
# terms; the race column is drawn with the standard-library `random` module
# (random.choices), which np.random.seed() does not affect. Without the second
# call the race column differs on every fresh run.
np.random.seed(0)
random.seed(0)

### Generating sensitive variables

Ages are normally distributed around a mean of 50 years. Sex and race demographics are based roughly on US averages from the Census. Sex is treated as binary; race is bucketed into white, black, hispanic, asian, and other.

In [3]:
def generate_base_population(size):
    """Create a synthetic population of `size` people.

    Returns a DataFrame with three columns:
      * 'age'  - draws from Normal(mean=50, sd=7), rounded to whole years
      * 'sex'  - 0 or 1, each drawn uniformly via np.random.choice(2, ...)
      * 'race' - one of the keys of the weight table below, sampled with
                 those weights using the standard-library `random` module

    Ages and sex consume NumPy's global RNG (in that order); race consumes
    the `random` module's global RNG.
    """
    # Sampling weights for the race buckets.
    race_usa = {'white': 0.6, 'hispanic': 0.18, 'black': 0.12, 'asian': .05, 'other': .05}

    age_col = np.rint(np.random.normal(50, 7, size=size))
    sex_col = np.random.choice(2, size)
    race_col = random.choices(list(race_usa), weights=race_usa.values(), k=size)

    return pd.DataFrame({'age': age_col, 'sex': sex_col, 'race': race_col})

### Generating independent variables

These are arbitrary predictive values generated independently of each other and of the sensitive variables.

In [4]:
# Population size, and the base frame holding the sensitive variables.
n = 4000
df = generate_base_population(n)

# x1: binary trait that 50% of people have.
x1 = np.random.choice(2, n, p=[.5, .5])

# x2, x3: normally distributed traits (mean 100, sd 5), rounded to 2 decimals.
x2 = np.round(np.random.normal(100, 5, n), 2)
x3 = np.round(np.random.normal(100, 5, n), 2)

# Each insert goes to position 0, so inserting x3, x2, x1 in that order
# leaves the leading columns as x1, x2, x3.
for col_name, col_values in (("x3", x3), ("x2", x2), ("x1", x1)):
    df.insert(0, col_name, col_values, allow_duplicates=True)
df.head()

Unnamed: 0,x1,x2,x3,age,sex,race
0,1,94.44,105.71,62.0,0,hispanic
1,0,98.38,100.05,53.0,0,white
2,1,98.25,101.62,57.0,0,white
3,1,98.14,101.81,66.0,1,white
4,1,90.49,110.38,63.0,1,white


# Baseline Case

Using variables $x_1$, $x_2$, and $x_3$ plus standard normal noise, generate the outcome $y$ (stored in the `disease` column).

In [5]:
def generate_outcome(row):
    """Return the baseline outcome for a single row.

    `row` must support lookup by the keys 'x1', 'x2' and 'x3'. The outcome is
    (0.5 - x1) + (100 - x2) + (100 - x3) plus one draw of Normal(0, 1) noise
    from NumPy's global RNG.
    """
    noise = np.random.normal(0, 1)
    signal = (.5 - row['x1']) + (100 - row['x2']) + (100 - row['x3'])
    return signal + noise

In [6]:
# One baseline outcome per row, generated in row order so the noise draws
# come off the NumPy RNG in a fixed sequence.
outcomes = [generate_outcome(df.iloc[row_idx]) for row_idx in range(len(df))]
print(np.mean(outcomes))

-0.0939500435756061


In [7]:
# Put the outcome first in the frame. allow_duplicates=True means re-running
# this cell adds a second "disease" column rather than raising.
df.insert(loc=0, column="disease", value=outcomes, allow_duplicates=True)

In [8]:
def one_hot_race(df):
    """Return a copy of `df` with 'race' replaced by one-hot indicator columns.

    One 'race_<value>' column is created for every distinct value present in
    df['race'] (no category is dropped). The indicators are appended after
    the existing columns and the original 'race' column is removed. The
    input frame is not modified.
    """
    race_dummies = pd.get_dummies(df['race'], prefix='race', drop_first=False)
    combined = pd.concat([df, race_dummies], axis=1)
    return combined.drop(columns=['race'])

In [9]:
# Replace the categorical 'race' column with race_* indicator columns.
# This rebinds `df`, so the cell cannot be re-run without regenerating df
# (the 'race' column no longer exists afterwards).
df = df.pipe(one_hot_race)
df.head()

Unnamed: 0,disease,x1,x2,x3,age,sex,race_asian,race_black,race_hispanic,race_other,race_white
0,-0.065792,1,94.44,105.71,62.0,0,0,0,1,0,0
1,3.300917,0,98.38,100.05,53.0,0,0,0,0,0,1
2,-1.200286,1,98.25,101.62,57.0,0,0,0,0,0,1
3,1.188464,1,98.14,101.81,66.0,1,0,0,0,0,1
4,-3.058672,1,90.49,110.38,63.0,1,0,0,0,0,1


In [10]:
# Write the baseline dataset to disk, without the row index.
df.to_csv(path_or_buf='../data/test_synthetic_health_base.csv', index=False)

# Injected Case

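Injecting FID for the subgroup defined by `age + 10*race_hispanic > 60`, i.e. hispanic individuals older than 50 together with anyone older than 60. Within this subgroup the coefficient on $x_2$ is multiplied by 50.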

In [11]:
# Sanity check: fraction of rows that fall in the injected subgroup
# (age + 10 * race_hispanic > 60).
v = [
    (person['age'] + 10*person['race_hispanic']>60)
    for person in (df.iloc[pos] for pos in range(len(df)))
]
print(np.mean(v))

0.13825


In [12]:
def in_g(row):
    """Return True when `row` belongs to the injected subgroup.

    Membership rule: age + 10 * race_hispanic > 60. `row` must support
    lookup by the keys 'age' and 'race_hispanic'. Always returns a plain
    Python bool.
    """
    score = row['age'] + 10 * row['race_hispanic']
    return bool(score > 60)

In [13]:
def generate_outcome_fid(row):
    """Return the outcome for one row, with the subgroup effect injected.

    Same formula as the baseline outcome, except that for rows where
    in_g(row) is true the (100 - x2) term is weighted by 50 instead of 1.
    Exactly one Normal(0, 1) noise draw is taken from NumPy's global RNG
    per call, whichever branch applies.
    """
    x2_weight = 50 if in_g(row) else 1
    noise = np.random.normal(0, 1)
    return (.5 - row['x1']) + x2_weight * (100 - row['x2']) + (100 - row['x3']) + noise

In [14]:
# Start the injected dataset from the same features, minus the baseline outcome.
df_fid = df.copy().drop(columns=["disease"])

In [15]:
# One injected-case outcome per row, generated in row order.
outcomes_fid = [
    generate_outcome_fid(df_fid.iloc[row_idx]) for row_idx in range(len(df_fid))
]
print(np.mean(outcomes_fid))

-2.384920963077148


In [16]:
# Put the injected outcome first in the frame. allow_duplicates=True means
# re-running this cell adds a second "disease" column rather than raising.
df_fid.insert(loc=0, column="disease", value=outcomes_fid, allow_duplicates=True)

In [17]:
# Write the injected-case dataset to disk, without the row index.
df_fid.to_csv(path_or_buf='../data/test_synthetic_health_fid.csv', index=False)