This notebook explores whether the MPI can be calculated at the household level. Using the Cambodia DHS 2005 data, it checks whether all individuals in each household share the same MPI-related value (here, `c_vector_1`).

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Root of the MPI code folder. The default is the original author's local
# folder; set the MPI_CODE_PATH environment variable to run the notebook on
# another machine without editing this cell.
code_path = Path(
    os.environ.get(
        'MPI_CODE_PATH',
        r'C:\Users\tianc\OneDrive\Documents\SIG\DISES\code\MPI',
    )
)
# Data folder: two levels above the code folder, then data/MPI.
datafd_path = code_path.parent.parent / 'data' / 'MPI'

# khm dhs05

### read in individual-level data

#### one dataframe per cluster all in a list

##### harmonized with 10 and 14, wall material excluded (no wall)

In [2]:
# Survey identifier; also the name of the data sub-folder under datafd_path
survey = 'khm_dhs05_cot_nowall'
# Spatial-unit tag embedded in each input file name
spatial_res = 'clust'
# Number of spatial units; input files are numbered 1..n_spatial_unit
n_spatial_unit = 557

In [3]:
# File names start with the first two tokens of the survey name
# ('khm_dhs05' for 'khm_dhs05_cot_nowall'); build the prefix once instead of
# on every iteration.
file_prefix = '_'.join(survey.split('_')[:2])

# One dataframe per spatial unit, read from files numbered 1..n_spatial_unit.
df_nowall_lst = []
for i in range(1, n_spatial_unit + 1):
    df = pd.read_stata(
        datafd_path / survey / f'{file_prefix}_mpi_{spatial_res}{i}.dta'
    )
    # Every row in file i must carry psu == i. Checking row-wise (rather than
    # comparing the array of unique values to i) yields a clear AssertionError
    # when a file is empty or mixes several psu values, instead of an
    # ambiguous-truth-value error.
    assert not df.empty and (df['psu'] == i).all(), (
        f'{spatial_res}{i}: expected only psu == {i}, '
        f'found {df["psu"].unique().tolist()}'
    )
    df_nowall_lst.append(df)

### explore and preprocess data

#### compile dataframes in the lists

##### no wall

In [4]:
# Stack the per-cluster frames into one individual-level table.
# ignore_index=True assigns a fresh 0..n-1 index; without it each cluster's
# rows keep their own 0-based labels and the combined frame ends up with
# duplicate index values.
df_nowall = pd.concat(df_nowall_lst, ignore_index=True)

In [5]:
# (rows, columns) of the combined frame
df_nowall.shape  # 35665 individuals

(35665, 109)

In [6]:
# Preview the first two rows of the combined frame
df_nowall.head(2)

Unnamed: 0,hh_id,ind_id,psu,strata,subsample,weight,area,region,region_01,agec4,...,g01_k_fuel_1,g01_k_asset_1,MPI_1_20,MPI_1_33,MPI_1_50,MPI_1,MPI_1_svy,MPI_1_SE,MPI_1_low95CI,MPI_1_upp95CI
0,10001,1000101,1,245,selected,0.414342,urban,Pursat,Pursat,60+,...,0.0,0.0,0.316162,0.260606,0.163636,0.260606,0.260606,0.088829,0.062683,0.458529
1,10001,1000102,1,245,selected,0.414342,urban,Pursat,Pursat,18-59,...,0.0,0.0,0.316162,0.260606,0.163636,0.260606,0.260606,0.088829,0.062683,0.458529


In [7]:
# List the column names available in the combined frame
df_nowall.columns

Index(['hh_id', 'ind_id', 'psu', 'strata', 'subsample', 'weight', 'area',
       'region', 'region_01', 'agec4',
       ...
       'g01_k_fuel_1', 'g01_k_asset_1', 'MPI_1_20', 'MPI_1_33', 'MPI_1_50',
       'MPI_1', 'MPI_1_svy', 'MPI_1_SE', 'MPI_1_low95CI', 'MPI_1_upp95CI'],
      dtype='object', length=109)

#### filter columns of interest

In [8]:
# Identifier columns plus the one value compared within households
columns_of_interest = ['hh_id', 'ind_id', 'psu', 'c_vector_1']

In [9]:
# Keep all rows, but only the columns listed in columns_of_interest
df_nowall = df_nowall.loc[:, columns_of_interest]

#### summarize missingness and drop individuals with a missing MPI value (`c_vector_1`)

In [10]:
# Count missing values column by column
df_nowall.isna().sum(axis=0)

hh_id           0
ind_id          0
psu             0
c_vector_1    924
dtype: int64

In [11]:
# Drop every row (individual) that has a missing value in any kept column
df_valid = df_nowall.dropna(axis='index', how='any')

In [12]:
# (rows, columns) after dropping rows with missing values
df_valid.shape

(34741, 4)

In [13]:
# Derive the counts from the frames themselves; adding literals can never
# fail, so it would not notice if the data changed.
n_total = len(df_nowall)
n_dropped = int(df_nowall.isna().any(axis=1).sum())
n_valid = len(df_valid)
# Rows kept plus rows dropped must account for every individual.
assert n_valid + n_dropped == n_total, (n_valid, n_dropped, n_total)
# Counts recorded for this survey: 35665 individuals, 924 with a missing
# value, 34741 retained.
assert (n_total, n_dropped, n_valid) == (35665, 924, 34741), (
    n_total, n_dropped, n_valid
)

#### confirm hh_id is unique across the country

In [14]:
# True only if every hh_id is associated with exactly one psu/cluster
df_valid.groupby(by='hh_id')['psu'].nunique().eq(1).all()

True

### confirm all individuals in each household share the same c_vector_1

In [15]:
# True only if each household has a single distinct c_vector_1 value
df_valid.groupby(by='hh_id')['c_vector_1'].nunique().eq(1).all()

True