In [2]:
import pandas as pd
import numpy as np

In [3]:
# specify file paths
path_to_bubble_size_data = "/Users/sophieayling/Library/CloudStorage/GoogleDrive-sophie2ayling@gmail.com/My Drive/PhD/08_Fieldwork/08_Data/R9_data/excel/to_inform_census_bubbles.xlsx"
path_to_census_data = "/Users/sophieayling/Library/CloudStorage/OneDrive-UniversityCollegeLondon/GitHub/Disease-Modelling-SSA/data/preprocessed/census/5_perc/ipums_5p_2012_preprocessed.dta"
save_path = "/Users/sophieayling/Library/CloudStorage/GoogleDrive-sophie2ayling@gmail.com/My Drive/PhD/08_Fieldwork/08_Data/R9_data/model_input/5p_census_w_bubbles.dta"

In [5]:
# load in the bubble size data
bubble_size = pd.read_excel(path_to_bubble_size_data, sheet_name='comb_transposed',
                            index_col='work_bubble')

# convert percentages into probabilities
bubble_size = bubble_size / 100
#replace all NaN with zeros
bubble_size.fillna(0, inplace=True)
# load in the census data
census = pd.read_stata(path_to_census_data)
bubble_size.info

<bound method DataFrame.info of                          0      1      2      3      4      5      6     \
work_bubble                                                               
student                 0.007  0.000  0.000  0.001  0.000  0.001  0.002   
ag_estates              0.325  0.014  0.036  0.047  0.039  0.065  0.063   
manu_mining_trades      0.202  0.031  0.076  0.095  0.021  0.102  0.088   
police_army             0.090  0.000  0.104  0.179  0.015  0.060  0.104   
education               0.068  0.000  0.014  0.023  0.032  0.046  0.009   
healthcare_social_work  0.051  0.026  0.090  0.064  0.026  0.077  0.026   
service_retail          0.149  0.010  0.042  0.063  0.029  0.097  0.037   
informal_petty trade    0.374  0.008  0.041  0.053  0.027  0.085  0.025   
subsistence_ag          0.540  0.060  0.090  0.069  0.026  0.064  0.036   
unemployed_not_ag       0.637  0.013  0.037  0.049  0.040  0.073  0.041   
other                   0.117  0.000  0.018  0.081  0.036  0.081  0.

In [15]:
# create a new column for workplace bubble ids, fill the column with 'none' as default
census['work_bubble_id'] = ["none"] * len(census)
# get a list of occupations that appear in the census
occs = census.occ4.unique()
# get a list of locations that appear in the census
locations = census.geo1_zw2012.unique()

In [16]:
# loop over the locations and occupations that appear in the census
for loc in locations:
    for occ in occs:
        # filter the census file to get the people in location 'loc' that have occupation 'occ', store this as a new
        # dataframe
        people_in_loc_with_occ = census.loc[(census['occ4'] == occ) & (census['geo1_zw2012'] == loc)]
        # get the initial number of people who meet the above criteria
        number_of_people_not_in_bubble = len(people_in_loc_with_occ)
        # create a number to assign to the bubble numbers (note this is reset when we look at different
        # location/occupation combos
        group_id_generator = 1
        # use a while loop to create the workplace bubbles, this loop keeps going until the randomly generated
        # bubble size is greater than the remaining number of people without an assigned bubble
        while number_of_people_not_in_bubble > 0:
            bubble_size_for_this = np.random.choice(bubble_size.columns, p=np.divide(bubble_size.loc[occ].values,
                                                    sum(bubble_size.loc[occ].values)))

            # bubble sizes are self-reported so a bubble size of 0 means they have no co-workers, and have a bubble size
            # of one (just themselves) adjust the generated bubble size below
            bubble_size_for_this += 1
            # if the generated bubble size is greater than the remaining people exit the loop and move on
            if bubble_size_for_this > len(people_in_loc_with_occ):
                break
            # sample from the people in a location based on the bubble size created above
            people_in_bubble = people_in_loc_with_occ.sample(n=bubble_size_for_this)
            # create the workplace bubble for these people
            census.loc[people_in_bubble.index, 'work_bubble_id'] = loc + "_" + occ + "_" + str(group_id_generator)
            # update the group_id number so that the next group will have a different workplace id
            group_id_generator += 1
            # calculate the number of people who don't have a workplace bubble again
            number_of_people_not_in_bubble = number_of_people_not_in_bubble - len(people_in_bubble)
            # remove the people we have just assigned a workplace bubble to from the pool of people we are going to
            # assign bubbles to
            people_in_loc_with_occ = census.loc[(census['occ4'] == occ) & (census['geo1_zw2012'] == loc) &
                                                (census['work_bubble_id'] == 'none')]
        # randomly assign the remaining people into existing bubbles
        straddlers = census.loc[(census['occ4'] == occ) & (census['geo1_zw2012'] == loc) &
                                (census['work_bubble_id'] == 'none')]
        other_group_ids = census.loc[(census['occ4'] == occ) & (census['geo1_zw2012'] == loc) &
                                     (census['work_bubble_id'] != 'none'), 'work_bubble_id'].unique()
        for straddler in straddlers.index:
            try:
                census.loc[straddler, 'work_bubble_id'] = np.random.choice(other_group_ids)
            except ValueError:
                census.loc[straddlers.index, 'work_bubble_id'] = loc + "_" + occ + "_" + str(group_id_generator)
                print("No existing bubbles to place these people into, reached this part of the code as the bubble size"
                      " initially selected for " + occ + " in " + loc + " was too large. Creating a new bubble for all "
                      "these people")
                break

No existing bubbles to place these people into, reached this part of the code as the bubble size initially selected for education in Matabeleland North was too large. Creating a new bubble for all these people
No existing bubbles to place these people into, reached this part of the code as the bubble size initially selected for education in Matabeleland South was too large. Creating a new bubble for all these people
No existing bubbles to place these people into, reached this part of the code as the bubble size initially selected for religious in Masvingo was too large. Creating a new bubble for all these people
No existing bubbles to place these people into, reached this part of the code as the bubble size initially selected for religious in Harare was too large. Creating a new bubble for all these people


In [None]:
# similarly generate contacts per person according to a distribution 


In [18]:
census.to_stata(save_path)