In [14]:
import pandas as pd
import numpy as np
from itertools import product

In [15]:
df=pd.read_csv('pop_availability_test.csv')
#process Year
df['Year'] = df['Year'].str[1:].astype(int)
#create datapoint_available column
df['datapoint_available']=df['Value'].notna().astype(int)

#replace 'Not applicable' with Total  to apply groupby later on, and get rid of NAs
columns_to_replace = ['Age group', 'Area', 'Nationality']

# Replace "Not applicable" with "total" only in those columns
df[columns_to_replace] = df[columns_to_replace].replace('Not applicable', 'Total')
df['Sex'] = df['Sex'].replace('Not applicable', 'Both sexes')

df = df.dropna(subset=['Value'])

df.head()

Unnamed: 0,Indicator,Country,Age group,Area,Sex,Nationality,Year,Value,datapoint_available
0,Occupied housing units by average household si...,Iraq,Total,Total,Both sexes,Nationals,2021,6.4,1
1,Occupied housing units by average household si...,Iraq,Total,Total,Both sexes,Non-nationals,2021,5.6,1
2,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Nationals,2005,8.6,1
3,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Non-nationals,2005,5.6,1
4,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Nationals,2011,5.7,1


In [16]:
indicator_criteria={'Population Size':{'aggregate':'Sex','keep':['Female','Male']}, 
          'Mean age at first marriage':{'aggregate':'Sex','keep':['Female','Male']}, 
          'Occupied housing units by average household size (%)':{'aggregate':'Area','keep':['Urban','Rural','Total']}
}

In [17]:
def availability(group,cnt):
    
    # Initialize availability
    group['availability'] = 0
    #count the non na-s
    count = int(group['Value'].notna().sum())
    #availability= 1 for total response, 0 for partial response
    if count==cnt:
        availability=1
    else:
        availability=0

    group['availability']=availability

    return group

In [18]:
'''loop though indicators. only keep the relevant columns, for example if its on sex disaggregated we dont need the nationality
orelse keeping it the condition in the above function def availability(group,cnt) wont work since we will end up with count>cnt'''

indicators=list(df['Indicator'].unique())

df_list=[]

for ind in indicators:
    #filter on the indicator
    df_sub=df[df['Indicator']==ind].copy()
    criteria=indicator_criteria[ind]

    #get the column and value to filter on
    col=criteria['aggregate']
    to_keep=criteria['keep']
    cnt=len(criteria['keep'])
    

    df_filtered=df_sub[df_sub[col].isin(to_keep)].copy()
    #keep the relevant columns only
    df_filtered1=df_filtered[['Indicator','Country', 'Year',col,'Value']]
    #groupby by col to remove any repetitions coming from other disaggregations
    df_filtered2 = df_filtered1.groupby(['Indicator','Country', 'Year',col], as_index=False).agg({'Value': 'first'})
    #group by sex and calculate total/partial availability, the group_keys=False not to include group labels as part of the index
    df_grouped = df_filtered2.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
    #append to the df_list
    df_list.append(df_grouped)


final_df = pd.concat(df_list, ignore_index=True)
final_df.to_excel('grouped.xlsx',index=False)


  df_grouped = df_filtered2.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
  df_grouped = df_filtered2.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
  df_grouped = df_filtered2.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)


In [19]:
final_df['Indicator'].unique()

array(['Occupied housing units by average household size (%)',
       'Mean age at first marriage', 'Population Size'], dtype=object)

#### get the response rates

In [20]:
df_summary =final_df.groupby(['Indicator', 'Country', 'Year']).agg({'availability': 'max'}).reset_index()
df_summary.to_excel('availability_summary.xlsx', index=False)

In [21]:

def calculate_availability_percentage(group):
    # Count availability values (0s and 1s)
    counts = group['availability'].value_counts()

    # Calculate percentages
    percentages = (counts / 25) * 100

    # Map percentages back to each row
    group['availability_percentage'] = group['availability'].map(percentages)

    return group


availability_rr = df_summary.groupby(['Indicator', 'Country'], group_keys=False).apply(calculate_availability_percentage)
availability_rr.to_excel('availability_RR.xlsx', index=False)

availability_rr.head()

  availability_rr = df_summary.groupby(['Indicator', 'Country'], group_keys=False).apply(calculate_availability_percentage)


Unnamed: 0,Indicator,Country,Year,availability,availability_percentage
0,Mean age at first marriage,Bahrain,2010,1,20.0
1,Mean age at first marriage,Bahrain,2013,1,20.0
2,Mean age at first marriage,Bahrain,2016,1,20.0
3,Mean age at first marriage,Bahrain,2017,1,20.0
4,Mean age at first marriage,Bahrain,2019,1,20.0


#### create a table having all the years 2000 to 2025

In [22]:
#Create full list of years
years = list(range(2000, 2026))

#Get unique indicators and countries
indicators = availability_rr['Indicator'].unique()
countries = availability_rr['Country'].unique()

#Create the cartesian product of all (Indicator, Country, Year)
full_index = pd.DataFrame(list(product(indicators, countries, years)), columns=['Indicator', 'Country', 'Year'])

#Merge with the original data
availability_final = pd.merge(full_index, availability_rr, on=['Indicator', 'Country', 'Year'], how='left')

availability_final.to_excel('availability_final.xlsx', index=False)
