In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadstat
import os

In [2]:
# Show every column when a wide survey frame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): machine-specific absolute path; the notebook only runs where this folder exists.
os.chdir('C:/Users/511232/Desktop/MICS/microdata')
# List the SPSS data files; match on the extension so names that merely contain
# the letters "sav" (e.g. "saved.txt", "hh.sav.bak") are not listed.
[f for f in os.listdir() if f.lower().endswith('.sav')]

['bh.sav', 'ch.sav', 'fs.sav', 'hh.sav', 'hl.sav', 'wm.sav']

In [14]:
'''data processing prior to generating crosstabs'''

def process_data_wm():
    """Load the women's (wm) and household-listing (hl) files and prepare them for crosstabs.

    The women's records are left-merged with the household listing on
    cluster (HH1), household (HH2) and line number (LN == HL1) so that each
    woman carries her HL3 (relation to household head) value. A
    ``disability_combined`` column is added holding the label of the highest
    code found across the six AF* difficulty columns, and the coded columns
    WAGE, HH6, disability and HL3 are replaced by their value labels taken
    from the SPSS metadata.

    Side effects:
        * changes the working directory, first to the microdata folder and
          finally to the Crosstabs folder (callers export their Excel files
          with relative names, so they are written there);
        * prints one line per translated column, or a warning when no value
          labels were found for it.

    Returns:
        tuple: ``(df, disability_levels, disability_cols, dis_names)`` where
        ``df`` is the merged and labelled DataFrame, ``disability_levels``
        maps difficulty codes to labels, ``disability_cols`` lists the AF*
        column names and ``dis_names`` maps each AF* column to its
        description.
    """
    # read the .sav files together with their metadata (codes are kept, not labels)
    os.chdir('C:/Users/511232/Desktop/MICS/microdata')
    df_wm,meta_wm=pyreadstat.read_sav('wm.sav', apply_value_formats=False)
    df_hl,meta_hl=pyreadstat.read_sav('hl.sav', apply_value_formats=False)

    # {column: {code: label}} dictionaries used to translate coded columns
    col_vals_wm=meta_wm.variable_value_labels
    col_vals_hl=meta_hl.variable_value_labels

    ##############################################################
    # from here on the working directory is the export folder
    os.chdir('C:/Users/511232/Desktop/MICS/Crosstabs')
    disability_levels={1:'No difficulty',
    2:'Some difficulty',
    3:'A lot of difficulty',
    4:'Cannot do at all'}

    disability_cols=['AF6','AF8','AF9','AF10','AF11','AF12']

    dis_names={'AF6': 'Difficulty seeing, even if wearing glasses or contact lenses',
    'AF8': 'Difficulty hearing, even if using a hearing aid',
    'AF9': 'Difficulty walking or climbing steps',
    'AF10': 'Difficulty remembering or concentrating',
    'AF11': 'Difficulty with self-care, such as washing all over or dressing',
    'AF12': 'Difficulty communicating'}

    # merge with the household listing to get the HL3 household head relation
    # (pd.merge returns a new frame, so the raw frames are left untouched)
    df=pd.merge(df_wm,df_hl[['HH1','HH2','HL1','HL3']], how='left',
    left_on=['HH1','HH2','LN'], right_on=['HH1','HH2','HL1'])

    # 'disability_combined' is the highest code among the AF* columns, translated
    # to its label; a maximum that is not a key of disability_levels becomes NaN
    df['disability_combined']=df[disability_cols].max(axis=1).map(disability_levels)

    other_cols=['WAGE','HH6','disability', 'HL3']
    for col in other_cols:
        if col in col_vals_wm:
            df[col]=df[col].map(col_vals_wm[col])
            print(f'{col} codes are translated from meta women')
        # fall back to the household-listing labels; this branch used to repeat
        # the women's-metadata test and was therefore unreachable
        elif col in col_vals_hl:
            df[col]=df[col].map(col_vals_hl[col])
            print(f'{col} codes are translated from meta hhl')
        else:
            print(f'!!! WARNING !!! {col} codes were not translated')

    return(df,disability_levels,disability_cols,dis_names)

In [15]:
# Run the preprocessing once at notebook scope; the printed lines report which
# columns had their codes translated to labels.
df,disability_levels,disability_cols,dis_names=process_data_wm()

WAGE codes are translated from meta women
HH6 codes are translated from meta women
disability codes are translated from meta women


In [101]:
'''Table 1
get crosstab for disability types combined
criteria: take the maximum() over the different types of disabilities'''

def combined_disabilities(age_disaggregated=1):
    """Export the weighted crosstab of the combined disability level (Table 1).

    Cells are sums of ``wmweight``. When ``age_disaggregated`` is truthy the
    rows are Area / Disability / Disability level and the columns are the
    WAGE age groups; otherwise the rows are Disability / Disability level and
    the columns are the areas (HH6). The table is written to an Excel file in
    the current working directory; nothing is returned.
    """
    # fresh, labelled data (also switches the working directory to the export folder)
    df,disability_levels,disability_cols,dis_names=process_data_wm()

    # pick the layout of the table and the name of the workbook
    if age_disaggregated:
        row_fields=['HH6','disability','disability_combined']
        row_titles=['Area','Disability','Disability level']
        col_field,col_title='WAGE','Age'
        workbook='xtab_all_dis_ByAge.xlsx'
    else:
        row_fields=['disability','disability_combined']
        row_titles=['Disability','Disability level']
        col_field,col_title='HH6','Area'
        workbook='xtab_all_dis_ByTotalAge.xlsx'

    # weighted crosstab; dropna=False keeps category combinations with no data
    table=pd.crosstab(
        [df[field] for field in row_fields],
        df[col_field],
        rownames=row_titles,
        colnames=[col_title],
        values=df['wmweight'],
        aggfunc='sum',
        dropna=False,
    )

    # export as excel
    table.to_excel(workbook)

In [102]:
# Export Table 1 twice: first with the age groups (WAGE) as columns, then with
# the areas (HH6) as columns and no age breakdown.
combined_disabilities(age_disaggregated=1)
combined_disabilities(age_disaggregated=0)

In [103]:
'''Table 2
get xtab for all types of disability by creating them individually and 
appending the resulting series after stack()'''

def separate_disabilities():
    """Export one weighted crosstab per disability domain, side by side (Table 2).

    For every AF* column a crosstab of Area / Disability / Level by Age
    (sum of ``wmweight``) is built and stacked into a Series named after the
    domain. The Series are joined column-wise, a row-wise total
    ``All_disabilities`` is added, and the result is reshaped so that domain
    and age become column levels before being written to Excel. Nothing is
    returned.
    """
    # process data
    df,disability_levels,disability_cols,dis_names=process_data_wm()

    def domain_series(col):
        """Stacked, weighted crosstab for a single disability column."""
        # translate the codes without touching the frame itself
        level_labels=df[col].map(disability_levels)
        table=pd.crosstab(
            [df['HH6'],df['disability'],level_labels],
            df['WAGE'],
            rownames=['Area','Disability','Level'],
            colnames=['Age'],
            values=df['wmweight'],
            aggfunc='sum',
        )
        stacked=table.stack()
        stacked.name=dis_names[col]
        return stacked

    # one multiindex series per disability, joined as columns
    per_domain=[domain_series(col) for col in disability_cols]
    combined=pd.concat(per_domain, axis=1)
    combined['All_disabilities']=combined.sum(axis=1)

    # reshape the result: stack() appends the domain name as index level 4;
    # levels 4 (domain) and 3 (Age) are then moved to the columns
    long_form=combined.stack()
    wide=long_form.unstack([4,3])
    wide=wide.sort_index(axis=1, level=0)
    wide.to_excel('separate disabilites.xlsx')

In [104]:
# Build Table 2 (one crosstab per disability domain) and write it to Excel.
separate_disabilities()

In [105]:
'''Table 4
disability domains are counted if (4-cannot at all) or (3-a lot of difficulty)
'''
def num_dis_domain():
    """Export the weighted crosstab of the number of severe disability domains (Table 4).

    A domain counts when its AF* code is 3 (a lot of difficulty) or
    4 (cannot do at all). Rows are Area / Disability, columns are the number
    of such domains, cells are sums of ``wmweight``. The table is written to
    Excel; nothing is returned.
    """
    df,disability_levels,disability_cols,dis_names=process_data_wm()

    def count_severe(row):
        """Number of entries in the row whose code is 3 or 4."""
        return sum(row.isin([3,4]))

    # the AF* columns still hold numeric codes at this point
    df['domain_num']=df[disability_cols].apply(count_severe, axis=1)

    # generate xtab
    table=pd.crosstab(
        [df['HH6'],df['disability']],
        df['domain_num'],
        rownames=['Area','Disability'],
        colnames=['Number of domains'],
        values=df['wmweight'],
        aggfunc='sum',
    )

    table.to_excel('Number_dis_domain.xlsx')

In [106]:
# Build Table 4 (number of domains coded 3 or 4) and write it to Excel.
num_dis_domain()

In [107]:
'''Table 5 marital status'''

def marital_status():
    """Export the weighted crosstab of disability by marital status (Table 5).

    Rows are Area / Marital status / Disability / Disability level, columns
    are the WAGE age groups, cells are sums of ``wmweight``. The table is
    written to Excel; nothing is returned.
    """
    # process data
    df,disability_levels,disability_cols,dis_names=process_data_wm()

    # labels for the MSTATUS codes
    status_labels={
        1.0: 'Currently married/in union',
        2.0: 'Formerly married/in union',
        3.0: 'Never married/in union',
        9.0: 'No response',
    }
    # translate the codes
    df['MSTATUS']=df['MSTATUS'].map(status_labels)

    # crosstab; dropna=False keeps category combinations with no data
    row_fields=['HH6','MSTATUS','disability','disability_combined']
    table=pd.crosstab(
        [df[field] for field in row_fields],
        df['WAGE'],
        rownames=['Area','Marital status','Disability','Disability level'],
        colnames=['Age'],
        values=df['wmweight'],
        aggfunc='sum',
        dropna=False,
    )

    # export as excel
    table.to_excel('MaritalStatus.xlsx')

In [108]:
# Build Table 5 (disability by marital status) and write it to Excel.
marital_status()


In [1]:
'''Table 6'''


'Table 6'

In [11]:
# Inspect the wealth index quintile columns (combined, urban, rural).
# NOTE(review): in the visible cells df_wm is only assigned inside
# process_data_wm(); this cell presumably relies on a notebook-level df_wm
# left over from an earlier session - confirm it survives Restart & Run All.
df_wm[['windex5','windex5u','windex5r']]

Unnamed: 0,windex5,windex5u,windex5r
0,2.0,,1.0
1,4.0,,3.0
2,4.0,,3.0
3,4.0,,3.0
4,5.0,,4.0
...,...,...,...
11459,1.0,1.0,
11460,1.0,1.0,
11461,2.0,2.0,
11462,1.0,1.0,


In [13]:
# Scratch cell: uncomment one name to display that metadata dictionary.
# NOTE(review): in the visible cells these names are only assigned inside
# process_data_wm(); presumably they were also defined at notebook scope in an
# earlier session - confirm.
# col_names_hh
# col_vals_hh

# col_names_hl
col_vals_hl

# col_names_wm
# col_vals_wm

{'HL3': {1.0: 'HEAD',
  2.0: 'SPOUSE / PARTNER',
  3.0: 'SON / DAUGHTER',
  4.0: 'SON-IN-LAW / DAUGHTER-IN-LAW',
  5.0: 'GRANDCHILD',
  6.0: 'PARENT',
  7.0: 'PARENT-IN-LAW',
  8.0: 'BROTHER / SISTER',
  9.0: 'BROTHER-IN-LAW / SISTER-IN-LAW',
  10.0: 'UNCLE / AUNT',
  11.0: 'NIECE / NEPHEW',
  12.0: 'OTHER RELATIVE',
  13.0: 'ADOPTED / FOSTER / STEPCHILD',
  14.0: 'SERVANT (LIVE-IN)',
  96.0: 'OTHER (NOT RELATED)',
  98.0: 'DK'},
 'HL4': {1.0: 'MALE', 2.0: 'FEMALE'},
 'HL4A': {1.0: 'REGISTERED REFUGEE',
  2.0: 'NON REGISTERED REFUGEE',
  3.0: 'NON  REFUGEE'},
 'HL5M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DECEMBER',
  98.0: 'DK',
  99.0: 'NO RESPONSE'},
 'HL5Y': {9998.0: 'DK', 9999.0: 'NO RESPONSE'},
 'HL6': {95.0: '95+', 98.0: 'DK', 99.0: 'NO RESPONSE'},
 'HL8': {0.0: 'NOT ELIGIBLE'},
 'HL10': {0.0: 'NOT ELIGIBLE'},
 'HL11': {1.

In [9]:
# common_cols=set(df_hl.columns).intersection(set(df_wm.columns))
# print(list(zip(list(common_cols),pd.Series(list(common_cols)).map(col_names_hh))))

# merge the household listing (hl) into the women's file (wm) on cluster (HH1),
# household (HH2) and line number (LN == HL1) to add HL3.
# HL1 must be part of the right-hand selection because it is a merge key.
# NOTE(review): df_hl and df_wm are only assigned inside process_data_wm() in
# the visible cells - confirm they exist at notebook scope before running.
right_df=df_hl[['HH1','HH2','HL1','HL3']]
left_df=df_wm

df=pd.merge(left_df,right_df, how='left', 
left_on=['HH1','HH2','LN'], right_on=['HH1','HH2','HL1'])

[('wscorec', 'Camp wealth score'), ('J1', 'Cluster in J1 area'), ('stratum', 'Sample strata'), ('windex5c', 'Camp wealth index quintile'), ('HH4', 'Supervisor number'), ('windex5u', 'Urban wealth index quintile'), ('wscorer', 'Rural wealth score'), ('nat_reg_lvl', 'National Region Level'), ('HH1', 'Cluster number'), ('windex10u', 'Percentile Group of urb1'), ('HH7', 'Governorate'), ('HH2', 'Household number'), ('HH6', 'Area'), ('windex10c', 'Percentile Group of camp1'), ('windex5r', 'Rural wealth index quintile'), ('windex5', 'Wealth index quintile'), ('Refugee', nan), ('windex10r', 'Percentile Group of rur1'), ('wscoreu', 'Urban wealth score'), ('wscore', 'Combined wealth score'), ('REGION', 'Region'), ('windex10', 'Percentile Group of com1'), ('PSU', 'Primary sampling unit')]


In [91]:
# Display the {column: {code: label}} dictionary of the women's file.
# NOTE(review): only assigned inside process_data_wm() in the visible cells - confirm it exists at notebook scope.
col_vals_wm

{'WM6M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DECEMBER'},
 'WM8': {1.0: 'YES, INTERVIEWED ALREADY', 2.0: 'NO, FIRST INTERVIEW'},
 'WM9': {1.0: 'YES', 2.0: 'NO / NOT ASKED'},
 'WM17': {1.0: 'COMPLETED',
  2.0: 'NOT AT HOME',
  3.0: 'REFUSED',
  5.0: 'INCAPACITATED',
  6.0: 'NO ADULT CONSENT FOR RESPONDENT AGE 15-17',
  96.0: 'OTHER'},
 'WM11': {1.0: 'YES, THE ENTIRE INTERVIEW WAS COMPLETED IN PRIVATE',
  2.0: 'NO, OTHERS WERE PRESENT DURING THE ENTIRE INTERVIEW',
  3.0: 'NO, OTHERS WERE PRESENT DURING PART OF THE INTERVIEW'},
 'WMFIN': {1.0: 'REVIEW QUESTIONNAIRE',
  2.0: 'ADD NOTES',
  3.0: 'SAVE QUESTIONNAIRE AND FINISH'},
 'WB3M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DE

In [89]:
# Display the {column: {code: label}} dictionary of the household listing file.
# NOTE(review): only assigned inside process_data_wm() in the visible cells - confirm it exists at notebook scope.
col_vals_hl

{'HL3': {1.0: 'HEAD',
  2.0: 'SPOUSE / PARTNER',
  3.0: 'SON / DAUGHTER',
  4.0: 'SON-IN-LAW / DAUGHTER-IN-LAW',
  5.0: 'GRANDCHILD',
  6.0: 'PARENT',
  7.0: 'PARENT-IN-LAW',
  8.0: 'BROTHER / SISTER',
  9.0: 'BROTHER-IN-LAW / SISTER-IN-LAW',
  10.0: 'UNCLE / AUNT',
  11.0: 'NIECE / NEPHEW',
  12.0: 'OTHER RELATIVE',
  13.0: 'ADOPTED / FOSTER / STEPCHILD',
  14.0: 'SERVANT (LIVE-IN)',
  96.0: 'OTHER (NOT RELATED)',
  98.0: 'DK'},
 'HL4': {1.0: 'MALE', 2.0: 'FEMALE'},
 'HL4A': {1.0: 'REGISTERED REFUGEE',
  2.0: 'NON REGISTERED REFUGEE',
  3.0: 'NON  REFUGEE'},
 'HL5M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DECEMBER',
  98.0: 'DK',
  99.0: 'NO RESPONSE'},
 'HL5Y': {9998.0: 'DK', 9999.0: 'NO RESPONSE'},
 'HL6': {95.0: '95+', 98.0: 'DK', 99.0: 'NO RESPONSE'},
 'HL8': {0.0: 'NOT ELIGIBLE'},
 'HL10': {0.0: 'NOT ELIGIBLE'},
 'HL11': {1.

In [65]:
# Display the {column: description} dictionary of the women's file.
# NOTE(review): only assigned inside process_data_wm() in the visible cells - confirm it exists at notebook scope.
col_names_wm

{'HH1': 'Cluster number',
 'HH2': 'Household number',
 'LN': 'Line number',
 'WM1': 'Cluster number',
 'WM2': 'Household number',
 'WM3': "Woman's line number",
 'WMINT': 'Interviewer number',
 'WM4': 'Supervisor number',
 'WM5': 'Interviewer number',
 'WM6D': 'Day of interview',
 'WM6M': 'Month of interview',
 'WM6Y': 'Year of interview',
 'WM8': 'Respondent to another questionnaire',
 'WM9': 'Consent',
 'WM17': "Result of woman's interview",
 'WM7H': 'Start of interview - Hour',
 'WM7M': 'Start of interview - Minutes',
 'WM10H': 'End of interview - Hour',
 'WM10M': 'End of interview - Minutes',
 'WM11': 'Interview completed in private',
 'WMHINT': 'Household interviewer',
 'WMFIN': 'Finish',
 'WB3M': 'Month of birth of woman',
 'WB3Y': 'Year of birth of woman',
 'WB4': 'Age of woman',
 'WB5': 'Ever attended school',
 'WB6A': 'Highest level of school attended',
 'WB6B': 'Highest grade attended at that level',
 'WB7': 'Ever completed that grade/year',
 'WB9': 'Attended school during cu

In [114]:
# Display the {column: {code: label}} dictionary of the women's file (repeat of an earlier inspection cell).
# NOTE(review): only assigned inside process_data_wm() in the visible cells - confirm it exists at notebook scope.
col_vals_wm

{'WM6M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DECEMBER'},
 'WM8': {1.0: 'YES, INTERVIEWED ALREADY', 2.0: 'NO, FIRST INTERVIEW'},
 'WM9': {1.0: 'YES', 2.0: 'NO / NOT ASKED'},
 'WM17': {1.0: 'COMPLETED',
  2.0: 'NOT AT HOME',
  3.0: 'REFUSED',
  5.0: 'INCAPACITATED',
  6.0: 'NO ADULT CONSENT FOR RESPONDENT AGE 15-17',
  96.0: 'OTHER'},
 'WM11': {1.0: 'YES, THE ENTIRE INTERVIEW WAS COMPLETED IN PRIVATE',
  2.0: 'NO, OTHERS WERE PRESENT DURING THE ENTIRE INTERVIEW',
  3.0: 'NO, OTHERS WERE PRESENT DURING PART OF THE INTERVIEW'},
 'WMFIN': {1.0: 'REVIEW QUESTIONNAIRE',
  2.0: 'ADD NOTES',
  3.0: 'SAVE QUESTIONNAIRE AND FINISH'},
 'WB3M': {1.0: 'JANUARY',
  2.0: 'FEBRUARY',
  3.0: 'MARCH',
  4.0: 'APRIL',
  5.0: 'MAY',
  6.0: 'JUNE',
  7.0: 'JULY',
  8.0: 'AUGUST',
  9.0: 'SEPTEMBER',
  10.0: 'OCTOBER',
  11.0: 'NOVEMBER',
  12.0: 'DE

In [115]:
# Display the {column: description} dictionary of the women's file (repeat of an earlier inspection cell).
# NOTE(review): only assigned inside process_data_wm() in the visible cells - confirm it exists at notebook scope.
col_names_wm

{'HH1': 'Cluster number',
 'HH2': 'Household number',
 'LN': 'Line number',
 'WM1': 'Cluster number',
 'WM2': 'Household number',
 'WM3': "Woman's line number",
 'WMINT': 'Interviewer number',
 'WM4': 'Supervisor number',
 'WM5': 'Interviewer number',
 'WM6D': 'Day of interview',
 'WM6M': 'Month of interview',
 'WM6Y': 'Year of interview',
 'WM8': 'Respondent to another questionnaire',
 'WM9': 'Consent',
 'WM17': "Result of woman's interview",
 'WM7H': 'Start of interview - Hour',
 'WM7M': 'Start of interview - Minutes',
 'WM10H': 'End of interview - Hour',
 'WM10M': 'End of interview - Minutes',
 'WM11': 'Interview completed in private',
 'WMHINT': 'Household interviewer',
 'WMFIN': 'Finish',
 'WB3M': 'Month of birth of woman',
 'WB3Y': 'Year of birth of woman',
 'WB4': 'Age of woman',
 'WB5': 'Ever attended school',
 'WB6A': 'Highest level of school attended',
 'WB6B': 'Highest grade attended at that level',
 'WB7': 'Ever completed that grade/year',
 'WB9': 'Attended school during cu