In [1]:
!git clone https://github.com/UDST/synthpop.git
!pip install -r synthpop/requirements-dev.txt
!python synthpop/setup.py develop

Cloning into 'synthpop'...
remote: Enumerating objects: 1743, done.[K
remote: Counting objects: 100% (193/193), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 1743 (delta 115), reused 99 (delta 33), pack-reused 1550[K
Receiving objects: 100% (1743/1743), 995.54 KiB | 3.76 MiB/s, done.
Resolving deltas: 100% (1078/1078), done.
Collecting pytest-cov<2.10
  Downloading pytest_cov-2.9.0-py2.py3-none-any.whl (19 kB)
Collecting pycodestyle
  Downloading pycodestyle-2.7.0-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 421 kB/s 
[?25hCollecting numpydoc
  Downloading numpydoc-1.1.0-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 5.9 MB/s 
Collecting sphinx_rtd_theme
  Downloading sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 17.1 MB/s 
[?25hCollecting coverage>=4.4
  Downloading coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl (242 kB)
[K     |███

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from synthpop.synthpop.zone_synthesizer import synthesize_all_zones, load_data
import pandas as pd
import numpy as np
import random
from io import StringIO

In [4]:
###Configuration

householdh_marginal_filename = '/content/drive/MyDrive/data/aastha synthetic/household_marg.csv'
individuals_marginal_filename = '/content/drive/MyDrive/data/aastha synthetic/person_marg.csv'

ihds_individuals_filename = "/content/drive/MyDrive/data/aastha synthetic/36151-0001-Data.tsv"
ihds_household_filename = "/content/drive/MyDrive/data/aastha synthetic/36151-0002-Data.tsv"
 
state_id = 27 ####### select state number and name appropriately

In [5]:
###Helper functions for data cleaning

def try_convert(value, default, *types):
    for t in types:
        try:
            return t(value)
        except (ValueError, TypeError):
            continue
    return default

In [6]:
###Filtering and preprocessing the ihds individuals dataset

ihds_individuals_data = pd.read_csv(ihds_individuals_filename, sep='\t')

filtered_ihds_individuals_data = ihds_individuals_data.loc[ihds_individuals_data.STATEID==state_id]

columns_to_keep_individuals = ['DISTRICT', 'IDHH', 'PERSONID', 'RO3', 'RO6', 'RO5','ED2', 'ID11', 'ID13', 'RO7', 'URBAN2011']
columns_rename_dict_individuals = {'RO3':'gender', 'RO5':'age', 'RO6':'marital_status',
    'ED2':'literacy', 'ED6':'edu_years', 'EDUC7': 'edu_cat',
    'ID11':'religion', 'ID13':'caste', 'URBAN2011':'residence',
    'WS4':'job', 'RO7':'activity_status', 'IDHH':'serialno', 'PERSONID':'mem_id',
    'DIST01':'district', 'MB3':'M_Cataract', 'MB4':'M_TB', 'MB5':'M_High_BP',
    'MB6':'M_Heart_disease', 'MB7':'M_Diabetes', 'MB8':'M_Leprosy',
    'MB9':'M_Cancer', 'MB10':'M_Asthma', 'MB11':'M_Polio',
    'MB12':'M_Paralysis', 'MB13':'M_Epilepsy', 'SM4':'M_Fever', 'SM5':'M_Cough',
    'SM7':'M_Diarrhea'}

filtered_ihds_individuals_data = filtered_ihds_individuals_data[columns_to_keep_individuals]
filtered_ihds_individuals_data = filtered_ihds_individuals_data.rename(columns_rename_dict_individuals, axis='columns')

individuals_data = filtered_ihds_individuals_data

gender_dict = {1:'male', '1':'male', 2:'female', '2':'female'}
individuals_data['gender'] = individuals_data['gender'].map(gender_dict)

individuals_data.loc[individuals_data['marital_status']==' ','marital_status'] = 1
individuals_data['marital_status'] = individuals_data['marital_status'].astype(int)
marital_dict = {0:'married', 1:'married', 2:'unmarried', 3:'widowed', 4: 'separated', 5: 'married'}
individuals_data['marital_status'] = individuals_data['marital_status'].map(marital_dict)

individuals_data.loc[individuals_data['literacy']==' ','literacy'] = 0
# individuals_data['activity_status'] = individuals_data['activity_status'].astype(int)
# individuals_data.loc[(individuals_data['literacy']==' ') & (individuals_data['activity_status'] >=5) & (individuals_data['activity_status']<=10),'literacy']=1
# individuals_data.loc[(individuals_data['literacy']==' ') & (individuals_data['activity_status'] ==12) & (individuals_data['age']!='0to4'),'literacy']=1

individuals_data['literacy'] = individuals_data['literacy'].astype(int)
individuals_data.loc[individuals_data.literacy == 1, 'literacy'] = 'literate'
individuals_data.loc[individuals_data.literacy == 0, 'literacy'] = 'illiterate'

bins= [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,110]
labels = ['0to4', '5to9', '10to14', '15to19','20to24', '25to29','30to34', '35to39', '40to44', '45to49',
         '50to54', '55to59', '60to64', '65to69', '70to74', '75to79', '80p']

individuals_data['age'] = individuals_data['age'].apply(lambda x : try_convert(x, np.float('nan'), int) )   

individuals_data['age'] = pd.cut(individuals_data['age'], bins=bins, labels=labels, right=False)

religion_dict = {1: 'hindu', 2:'muslim', 3:'christian', 4:'sikh', 5:'buddhist', 6:'jain',
                7: 'other', 8:'other', 9:'other'}
individuals_data['religion'] = individuals_data['religion'].map(religion_dict)

individuals_data.loc[individuals_data['caste']==' ', 'caste'] =  random.randint(1,6)
individuals_data.loc[(individuals_data['caste']==' ') & (individuals_data['religion']!='hindu'),'caste'] = 6
individuals_data['caste'] = individuals_data['caste'].astype(int)
caste_dict = {4: 'SC', 5:'ST', 1:'other', 2:'other', 3:'other', 6:'other'}
individuals_data['caste'] = individuals_data['caste'].map(caste_dict)

urbandict = {1:'urban', 0:'rural'}
individuals_data['residence'] = individuals_data['residence'].map(urbandict)

individuals_data['working'] = 'yes'
individuals_data['activity_status'] = individuals_data['activity_status'].apply(lambda x : try_convert(x, np.float('nan'), int) )   
individuals_data.loc[individuals_data.activity_status >= 10, 'working'] = 'no'

individuals_data = individuals_data.drop(['job', 'activity_status', 'edu_years'], axis=1, errors='ignore')

individuals_data.loc[individuals_data['literacy']=='illiterate','edu_cat'] = 'illiterate'
individuals_data['edu_cat'] = individuals_data['edu_cat'].astype(str)
individuals_data.loc[individuals_data['edu_cat']=='0','edu_cat'] = 'illiterate'
edu_dict = {'3': 'below_primary', '5':'primary', '8':'middle', '10':'secondary', '12':'senior_secondary',
           '15':'grad_p', '16':'grad_p'}
individuals_data['edu_cat'].replace(edu_dict, inplace=True)

individuals_data

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,DISTRICT,serialno,mem_id,gender,marital_status,age,literacy,religion,caste,residence,working,edu_cat
145702,2701,2701010111,1,male,married,50to54,literate,,ST,,yes,
145703,2701,2701010111,2,female,married,50to54,illiterate,,ST,,yes,illiterate
145704,2701,2701010111,3,male,unmarried,15to19,literate,,ST,,no,
145705,2701,2701010111,4,male,unmarried,10to14,literate,,ST,,no,
145706,2701,2701010112,1,male,married,30to34,literate,,ST,,yes,
...,...,...,...,...,...,...,...,...,...,...,...,...
161681,2734,2734031401,3,male,unmarried,20to24,literate,hindu,other,urban,no,
161682,2734,2734031501,1,male,married,45to49,literate,hindu,other,urban,yes,
161683,2734,2734031501,2,female,married,40to44,literate,hindu,other,urban,no,
161684,2734,2734031501,3,male,unmarried,15to19,literate,hindu,other,urban,no,


In [7]:
ihds_households_data = pd.read_csv("/content/drive/MyDrive/data/aastha synthetic/36151-0002-Data.tsv", sep='\t')
filtered_ihds_households_data = ihds_households_data.loc[ihds_households_data.STATEID==state_id] 

columns_to_keep_households = ['DIST01', 'IDHH', 'URBAN2011', 'NPERSONS']
columns_rename_dict_households = {'URBAN2011':'residence', 'IDHH':'serialno','DIST01':'district', 'NPERSONS':'hhsize'}

households_data = filtered_ihds_households_data[columns_to_keep_households]
households_data = households_data.rename(columns_rename_dict_households, axis='columns')

urbandict = {1:'urban', 0:'rural'}
households_data['residence'] = households_data['residence'].map(urbandict)

hhsize_dict = {1:'hhsize_1', 2:'hhsize_2', 3:'hhsize_3', 4:'hhsize_4', 5:'hhsize_5',
              6:'hhsize_6', 7:'hhsize_710', 8:'hhsize_710', 9:'hhsize_710',
              10:'hhsize_710', 11:'hhsize_1114', 12:'hhsize_1114', 13:'hhsize_1114',
              14:'hhsize_1114'}

households_data.loc[households_data['hhsize'] >=15, 'hhsize'] = 'hhsize_15p'
households_data['hhsize'] = households_data['hhsize'].replace(hhsize_dict)
households_data

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,district,serialno,residence,hhsize
28928,1,2701010111,rural,hhsize_4
28929,1,2701010112,rural,hhsize_2
28930,1,2701010211,rural,hhsize_710
28931,1,2701010311,rural,hhsize_710
28932,1,2701010411,rural,hhsize_4
...,...,...,...,...
32232,34,2734031202,urban,hhsize_3
32233,34,2734031203,urban,hhsize_4
32234,34,2734031301,urban,hhsize_4
32235,34,2734031401,urban,hhsize_3


In [8]:
empty_file_households = StringIO("1,2,3") #Creating empty files so that load_data function can be used which is built to load samples as well
empty_file_individuals = StringIO("1,2,3") #Creating empty files so that load_data function can be used which is built to load samples as well

household_marginal, individuals_marginal, hh_sample_empty, p_sample_empty, xwalk = load_data(householdh_marginal_filename, individuals_marginal_filename, empty_file_households, empty_file_individuals)

household_marginal = household_marginal[list(household_marginal.columns)].astype(float)
household_marginal = household_marginal[list(household_marginal.columns)].astype(int)

individuals_marginal = individuals_marginal[list(individuals_marginal.columns)].astype(float)
individuals_marginal = individuals_marginal[list(individuals_marginal.columns)].astype(int)

district_dict = pd.Series(individuals_marginal.index, index=individuals_marginal.distid.distid.values).to_dict()
individuals_data['DISTRICT'] = individuals_data['DISTRICT'].replace(district_dict)
households_data['district'] = households_data['district'].replace(district_dict)
households_data['sample_geog'] = 1
individuals_data['sample_geog'] = 1

household_marginal.drop('distid', axis=1, inplace=True)

individuals_marginal = individuals_marginal.drop(['distid','total_pop', 'residence'], axis=1)
individuals_marginal = individuals_marginal.drop(['illiterate_males','illiterate_females', 
                     'literate_males', 'literate_females',
                     'marginal_less3', 'marginal_6', 'non_worker'], axis=1, level=1)
individuals_marginal = individuals_marginal.rename({'main_workers': 'yes', 'non_worker2': 'no'}, axis='columns', level=1)

individuals_marginal[('marital_status','separated')] = (individuals_marginal['marital_status']['separated'] + individuals_marginal['marital_status']['divorced']).values

individuals_marginal[('edu_cat','senior_secondary')] = (individuals_marginal['edu_cat']['senior_secondary'] + individuals_marginal['edu_cat']['dip_cert_nontech'] + individuals_marginal['edu_cat']['dip_cert_tech']).values
individuals_marginal[('edu_cat','illiterate')] = (individuals_marginal['edu_cat']['illiterate'] + individuals_marginal['edu_cat']['lit_wo_edu']).values

individuals_marginal.drop(['divorced','dip_cert_nontech', 'dip_cert_tech', 'lit_wo_edu'], axis=1, level=1, inplace=True)

individuals_marginal = individuals_marginal.drop(['marital_status', 'edu_cat'], axis=1)

district_not_in_survey = ['Karnataka', 'Pune'] ####### remove rows based on data. This step needs to be adjusted when we add many rows to the marginal file.
xwalk_dict = dict(xwalk)
xwalk_dict = {key: xwalk_dict[key] for key in xwalk_dict if key not in district_not_in_survey}
xwalk = list(tuple(xwalk_dict.items()))

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [9]:
individuals_marginal

district,gender,gender,literacy,literacy,working,working,age,age,age,age,age,age,age,age,age,age,age,age,age,age,age,age,age,religion,religion,religion,religion,religion,religion,religion,caste,caste,caste
Unnamed: 0_level_1,male,female,illiterate,literate,yes,no,0to4,5to9,10to14,15to19,20to24,25to29,30to34,35to39,40to44,45to49,50to54,55to59,60to64,65to69,70to74,75to79,80p,hindu,muslim,christian,sikh,buddhist,jain,other,SC,ST,other
Karnataka,30966657,30128640,20447975,40647322,23397181,37698116,5046719,5241509,5736646,5827277,6052558,5771606,4698380,4695451,3826712,3519782,2699270,2142504,2060052,1520119,1061125,524426,625310,51317472,7893065,1142647,28773,95710,440280,177350,10474992,4248987,46371318
Pune,4924105,4505303,2257685,7171723,3751221,5678187,789688,761730,787915,819364,1028019,998018,819858,713370,597668,493269,407518,322148,290287,225194,160645,84744,95596,8090254,673704,134192,27090,340404,127786,35978,1180703,348876,7899829
Mumbai Suburban,5031323,4325639,1781477,7575485,3515922,5841040,648076,701121,767735,834831,999427,985170,837047,764467,649785,560702,452439,355509,287613,189642,131091,77042,115265,6337132,1795788,322476,47288,469568,343639,41071,583302,104560,8669100
Mumbai City,1684608,1400803,576389,2509022,1209334,1876077,189193,208264,230713,265548,334119,325168,272932,250947,217175,192532,158581,125961,103157,70896,52043,33506,54676,1873762,773173,84555,13471,134257,166000,40193,219934,25093,2840384


In [10]:
household_marginal

district,residence,residence,hhsize,hhsize,hhsize,hhsize,hhsize,hhsize,hhsize,hhsize,hhsize
Unnamed: 0_level_1,urban,rural,hhsize_1,hhsize_2,hhsize_3,hhsize_4,hhsize_5,hhsize_6,hhsize_710,hhsize_1114,hhsize_15p
Karnataka,5378520,7922270,579388,1357203,2145122,3510223,2451129,1421726,1559298,206936,69765
Pune,1364433,776913,94274,244455,375801,596082,373767,212028,212910,25886,6143
Mumbai Suburban,2094171,0,96133,214873,365500,551993,383648,216058,235835,24077,6054
Mumbai City,658359,0,37428,68610,106512,160719,117173,68850,84870,10520,3677


In [11]:
xwalk

[('Mumbai Suburban', 1), ('Mumbai City', 1)]

In [12]:
#Dropping columns which lead to error
household_marginal.drop(['num_workers','hhsize'], axis=1, errors='ignore', inplace=True)
households_data.drop(['num_workers','hhsize'], axis=1, errors='ignore', inplace=True)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [13]:
synthetic_households, synthetic_individuals, synthetic_stats = synthesize_all_zones(household_marginal, individuals_marginal, households_data, individuals_data, xwalk)

  adj = constraint / float((column * weights).sum())


Drawing 2094171 households


  adj = constraint / float((column * weights).sum())


Drawing 658359 households


In [14]:
synthetic_households

Unnamed: 0_level_0,district,serialno,residence,sample_geog,cat_id,geog
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7,2707080901,urban,1,0,Mumbai Suburban
2,Pune,2725140901,urban,1,0,Mumbai Suburban
3,34,2734010201,urban,1,0,Mumbai Suburban
4,30,2730081202,urban,1,0,Mumbai Suburban
5,21,2721140601,urban,1,0,Mumbai Suburban
...,...,...,...,...,...,...
2752526,21,2721100101,urban,1,0,Mumbai City
2752527,Mumbai Suburban,2723041109,urban,1,0,Mumbai City
2752528,21,2721070301,urban,1,0,Mumbai City
2752529,21,2721041301,urban,1,0,Mumbai City


In [15]:
synthetic_individuals

Unnamed: 0,DISTRICT,serialno,mem_id,gender,marital_status,age,literacy,religion,caste,residence,working,edu_cat,sample_geog,cat_id,geog,household_id
0,2711,2711010711,1,male,married,70to74,literate,hindu,other,urban,yes,,1,1012,Mumbai Suburban,1396
1,2711,2711010711,1,male,married,70to74,literate,hindu,other,urban,yes,,1,1012,Mumbai Suburban,1809
2,2711,2711010711,1,male,married,70to74,literate,hindu,other,urban,yes,,1,1012,Mumbai Suburban,1881
3,2711,2711010711,1,male,married,70to74,literate,hindu,other,urban,yes,,1,1012,Mumbai Suburban,1928
4,2711,2711010711,1,male,married,70to74,literate,hindu,other,urban,yes,,1,1012,Mumbai Suburban,2106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284482,2730,2730081201,1,female,widowed,45to49,illiterate,muslim,other,urban,yes,illiterate,1,1624,Mumbai City,2750732
11284483,2730,2730081201,1,female,widowed,45to49,illiterate,muslim,other,urban,yes,illiterate,1,1624,Mumbai City,2750810
11284484,2730,2730081201,1,female,widowed,45to49,illiterate,muslim,other,urban,yes,illiterate,1,1624,Mumbai City,2751127
11284485,2730,2730081201,1,female,widowed,45to49,illiterate,muslim,other,urban,yes,illiterate,1,1624,Mumbai City,2751318
