In [1]:
import pandas as pd

# Diabetes Dataset

In [2]:
# Load the diabetes table from 'diabetes.csv' (path is relative to the working directory).
db = pd.read_csv('diabetes.csv')

In [3]:
# Tally how often each distinct full row (all columns combined) occurs in db.
db.value_counts()

Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI   DiabetesPedigreeFunction  Age  Outcome
17           163      72             41             114      40.9  0.817                     47   1          1
15           136      70             32             110      37.1  0.153                     43   1          1
2            87       58             16             52       32.7  0.166                     25   0          1
                      0              23             0        28.9  0.773                     25   0          1
             85       65             0              0        39.6  0.930                     27   0          1
                                                                                                            ..
5            106      82             30             0        39.5  0.286                     38   0          1
             105      72             29             325      36.9  0.159                     28   0          1
      

In [4]:
# Show the column labels of db.
db.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
# Discretise every feature column into quantile bins and store the 1-based bin
# number in place of the raw value. Five bins are requested; duplicates='drop'
# merges repeated bin edges, so a column can end up with fewer than five levels.
# NOTE: db is overwritten in place, so running this cell a second time re-bins
# the already-binned codes rather than the raw measurements.
feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
for name in feature_columns:
    print(name)
    binned = pd.qcut(db[name].values, 5, duplicates='drop')
    # Categorical codes start at 0; shift so the first bin is labelled 1.
    db[name] = binned.codes + 1
    print('************')

Pregnancies
************
Glucose
************
BloodPressure
************
SkinThickness
************
Insulin
************
BMI
************
DiabetesPedigreeFunction
************
Age
************


In [33]:
# Build domain_db: column name -> number of distinct values in that column
# (value_counts() ignores missing values by default). The name and count are
# echoed for each column as it is processed.
domain_db = {}
for col_name in db.columns:
    n_levels = len(db[col_name].value_counts())
    print(col_name)
    print(n_levels)
    domain_db[col_name] = n_levels
    print('************')

Pregnancies
5
************
Glucose
5
************
BloodPressure
5
************
SkinThickness
4
************
Insulin
3
************
BMI
5
************
DiabetesPedigreeFunction
5
************
Age
5
************
Outcome
2
************


In [34]:
# Display the per-column distinct-value counts collected in domain_db.
domain_db

{'Pregnancies': 5,
 'Glucose': 5,
 'BloodPressure': 5,
 'SkinThickness': 4,
 'Insulin': 3,
 'BMI': 5,
 'DiabetesPedigreeFunction': 5,
 'Age': 5,
 'Outcome': 2}

In [7]:
# Number of rows in db.
len(db)

768

In [8]:
# Train/test split: draw 80% of the rows at random for training and keep the
# remaining rows (those whose index was not drawn) for testing.
# random_state pins the draw so a Restart & Run All reproduces the same split.
train_db = db.sample(n=round(0.8 * len(db)), random_state=42)
test_db = db[~db.index.isin(train_db.index)]

In [9]:
# Horizontal (row-wise) partition of the training set between two parties:
# Alice receives a random half of the training rows, Bob receives the rest.
# The sample size is taken from train_db itself; sizing it from the full db
# would hand Alice 50% of *all* rows, i.e. well over half of the training set.
# random_state pins the draw so the partition is reproducible.
Alice_db_H = train_db.sample(n=round(0.5 * len(train_db)), random_state=42)
Bob_db_H = train_db[~train_db.index.isin(Alice_db_H.index)]

In [10]:
import random

# Vertical (column-wise) partition of the training set between two parties:
# Alice gets 5 randomly chosen columns, Bob gets every remaining column.
# 'Outcome' is part of db.columns, so the label lands with whichever party the
# draw assigns it to.
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on. A dedicated seeded generator keeps the
# draw reproducible without touching the global random state.
Alice = random.Random(42).sample(list(db.columns), 5)
# Keep Bob's columns in their original order instead of arbitrary set order.
Bob = [col for col in db.columns if col not in Alice]

Alice_db_V = train_db[Alice]
Bob_db_V = train_db[Bob]

In [11]:
# Write every diabetes split to its own CSV file (the index is written too,
# as it is with to_csv's defaults). Order: full train/test, then the
# horizontal shares, then the vertical shares.
for out_path, frame in (
    ('diabetes_train.csv', train_db),
    ('diabetes_test.csv', test_db),
    ('diabetes_train_alice_h.csv', Alice_db_H),
    ('diabetes_train_bob_h.csv', Bob_db_H),
    ('diabetes_train_alice_v.csv', Alice_db_V),
    ('diabetes_train_bob_v.csv', Bob_db_V),
):
    frame.to_csv(out_path)

# NPHA Dataset

In [12]:
# Load the NPHA table from 'NPHA-doctor-visits.csv' (path is relative to the working directory).
nh = pd.read_csv('NPHA-doctor-visits.csv')

In [13]:
# Show the column labels of nh.
nh.columns

Index(['Number of Doctors Visited', 'Age', 'Phyiscal Health', 'Mental Health',
       'Dental Health', 'Employment', 'Stress Keeps Patient from Sleeping',
       'Medication Keeps Patient from Sleeping',
       'Pain Keeps Patient from Sleeping',
       'Bathroom Needs Keeps Patient from Sleeping',
       'Uknown Keeps Patient from Sleeping', 'Trouble Sleeping',
       'Prescription Sleep Medication', 'Race', 'Gender'],
      dtype='object')

In [14]:
# Preview the first rows of nh.
nh.head()

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,2,4,3,3,3,0,0,0,0,1,2,3,1,2
1,2,2,4,2,3,3,1,0,0,1,0,3,3,1,1
2,3,2,3,2,3,3,0,0,0,0,1,3,3,4,1
3,1,2,3,2,3,3,0,0,0,1,0,3,3,4,2
4,3,2,3,3,3,3,1,0,0,0,0,2,3,1,2


In [31]:
# Build domain_nh: column name -> number of distinct values in that column
# (value_counts() ignores missing values by default). The name and count are
# echoed for each column as it is processed.
domain_nh = {}
for col_name in nh.columns:
    n_levels = len(nh[col_name].value_counts())
    print(col_name)
    print(n_levels)
    domain_nh[col_name] = n_levels
    print('************')

Number of Doctors Visited
3
************
Age
1
************
Phyiscal Health
6
************
Mental Health
6
************
Dental Health
7
************
Employment
4
************
Stress Keeps Patient from Sleeping
2
************
Medication Keeps Patient from Sleeping
2
************
Pain Keeps Patient from Sleeping
2
************
Bathroom Needs Keeps Patient from Sleeping
2
************
Uknown Keeps Patient from Sleeping
2
************
Trouble Sleeping
4
************
Prescription Sleep Medication
4
************
Race
5
************
Gender
2
************


In [32]:
# Display the per-column distinct-value counts collected in domain_nh.
domain_nh

{'Number of Doctors Visited': 3,
 'Age': 1,
 'Phyiscal Health': 6,
 'Mental Health': 6,
 'Dental Health': 7,
 'Employment': 4,
 'Stress Keeps Patient from Sleeping': 2,
 'Medication Keeps Patient from Sleeping': 2,
 'Pain Keeps Patient from Sleeping': 2,
 'Bathroom Needs Keeps Patient from Sleeping': 2,
 'Uknown Keeps Patient from Sleeping': 2,
 'Trouble Sleeping': 4,
 'Prescription Sleep Medication': 4,
 'Race': 5,
 'Gender': 2}

In [16]:
# Train/test split: draw 80% of the rows at random for training and keep the
# remaining rows (those whose index was not drawn) for testing.
# random_state pins the draw so a Restart & Run All reproduces the same split.
train_nh = nh.sample(n=round(0.8 * len(nh)), random_state=42)
test_nh = nh[~nh.index.isin(train_nh.index)]

In [17]:
# Horizontal (row-wise) partition of the training set between two parties:
# Alice receives a random half of the training rows, Bob receives the rest.
# The sample size is taken from train_nh itself; sizing it from the full nh
# would hand Alice 50% of *all* rows, i.e. well over half of the training set.
# random_state pins the draw so the partition is reproducible.
Alice_nh_H = train_nh.sample(n=round(0.5 * len(train_nh)), random_state=42)
Bob_nh_H = train_nh[~train_nh.index.isin(Alice_nh_H.index)]

In [18]:
import random

# Vertical (column-wise) partition of the training set between two parties:
# Alice gets 8 randomly chosen columns, Bob gets every remaining column.
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on. A dedicated seeded generator keeps the
# draw reproducible without touching the global random state.
Alice = random.Random(42).sample(list(nh.columns), 8)
# Keep Bob's columns in their original order instead of arbitrary set order.
Bob = [col for col in nh.columns if col not in Alice]

Alice_nh_V = train_nh[Alice]
Bob_nh_V = train_nh[Bob]

In [19]:
# Write every NPHA split to its own CSV file (the index is written too,
# as it is with to_csv's defaults). Order: full train/test, then the
# horizontal shares, then the vertical shares.
for out_path, frame in (
    ('NPHA_train.csv', train_nh),
    ('NPHA_test.csv', test_nh),
    ('NPHA_train_alice_h.csv', Alice_nh_H),
    ('NPHA_train_bob_h.csv', Bob_nh_H),
    ('NPHA_train_alice_v.csv', Alice_nh_V),
    ('NPHA_train_bob_v.csv', Bob_nh_V),
):
    frame.to_csv(out_path)

# COMPAS Dataset

In [20]:
# Load the COMPAS training table from 'compass_processed_train.csv' (path is relative to the working directory).
cp = pd.read_csv('compass_processed_train.csv')

In [21]:
# Load the COMPAS test table from 'compass_processed_test.csv' (path is relative to the working directory).
cpt = pd.read_csv('compass_processed_test.csv')

In [22]:
# Remove the 'Unnamed: 0' column from both COMPAS frames (presumably an index
# that was saved into the CSVs -- confirm against the files).
# errors='ignore' keeps the cell re-runnable: because cp/cpt are reassigned,
# a second execution would otherwise raise KeyError on the missing column.
cpt = cpt.drop(columns='Unnamed: 0', errors='ignore')
cp = cp.drop(columns='Unnamed: 0', errors='ignore')

In [29]:

# Build domain_cp: column name -> number of distinct values in that column
# (value_counts() ignores missing values by default). The name and count are
# echoed for each column as it is processed.
domain_cp = {}
for col_name in cp.columns:
    n_levels = len(cp[col_name].value_counts())
    print(col_name)
    print(n_levels)
    domain_cp[col_name] = n_levels
    print('************')

jfel
3
************
jmis
3
************
jot
3
************
pri
2
************
gender
2
************
age
3
************
charge
2
************
Race
2
************
two_year_recid
2
************


In [30]:
# Display the per-column distinct-value counts collected in domain_cp.
domain_cp

{'jfel': 3,
 'jmis': 3,
 'jot': 3,
 'pri': 2,
 'gender': 2,
 'age': 3,
 'charge': 2,
 'Race': 2,
 'two_year_recid': 2}

In [24]:
# Horizontal (row-wise) partition of the COMPAS training frame between two
# parties: Alice receives a random half of the rows, Bob receives the rest.
# random_state pins the draw so the partition is reproducible.
Alice_cp_H = cp.sample(n=round(0.5 * len(cp)), random_state=42)
Bob_cp_H = cp[~cp.index.isin(Alice_cp_H.index)]

In [25]:
import random

# Vertical (column-wise) partition of the COMPAS training frame between two
# parties: Alice gets 5 randomly chosen columns, Bob gets every remaining one.
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on. A dedicated seeded generator keeps the
# draw reproducible without touching the global random state.
Alice = random.Random(42).sample(list(cp.columns), 5)
# Keep Bob's columns in their original order instead of arbitrary set order.
Bob = [col for col in cp.columns if col not in Alice]

Alice_cp_V = cp[Alice]
Bob_cp_V = cp[Bob]

In [26]:
# Write every COMPAS split to its own CSV file (the index is written too,
# as it is with to_csv's defaults). Order: full train/test, then the
# horizontal shares, then the vertical shares.
for out_path, frame in (
    ('COMPAS_train.csv', cp),
    ('COMPAS_test.csv', cpt),
    ('COMPAS_train_alice_h.csv', Alice_cp_H),
    ('COMPAS_train_bob_h.csv', Bob_cp_H),
    ('COMPAS_train_alice_v.csv', Alice_cp_V),
    ('COMPAS_train_bob_v.csv', Bob_cp_V),
):
    frame.to_csv(out_path)