In [1]:
import pandas as pd
import numpy as np
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw extract.
df = pd.read_csv('pop_availability_test.csv')

# Year labels carry a one-character prefix; drop it and store the year as an integer.
df['Year'] = df['Year'].str.slice(1).astype(int)

# 1 when the row carries a value, 0 when the value is missing.
df['datapoint_available'] = df['Value'].notna().astype(int)

# 'Not applicable' is relabelled so that the later groupby steps see a regular category:
# 'Total' for the columns listed below, 'Both sexes' for Sex.
columns_to_replace = ['Age group', 'Area', 'Nationality']
df[columns_to_replace] = df[columns_to_replace].replace({'Not applicable': 'Total'})
df['Sex'] = df['Sex'].replace({'Not applicable': 'Both sexes'})

# Keep only the rows that actually have a value.
df = df[df['Value'].notna()]

df.head()

Unnamed: 0,Indicator,Country,Age group,Area,Sex,Nationality,Year,Value,datapoint_available
0,Occupied housing units by average household si...,Iraq,Total,Total,Both sexes,Nationals,2021,6.4,1
1,Occupied housing units by average household si...,Iraq,Total,Total,Both sexes,Non-nationals,2021,5.6,1
2,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Nationals,2005,8.6,1
3,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Non-nationals,2005,5.6,1
4,Occupied housing units by average household si...,Kuwait,Total,Total,Both sexes,Nationals,2011,5.7,1


In [3]:
# Per-indicator rules read by `availability`:
#   age_grouped - 'yes' / 'no' flag selecting the branch taken in `availability`
#   aggregate   - column whose categories are checked
#   keep        - categories of that column that must all be present for full availability
indicator_criteria = {
    'Population Size': dict(
        age_grouped='no',
        aggregate='Sex',
        keep=['Female', 'Male'],
    ),
    'Mean age at first marriage': dict(
        age_grouped='no',
        aggregate='Sex',
        keep=['Female', 'Male'],
    ),
    'Occupied housing units by average household size (%)': dict(
        age_grouped='no',
        aggregate='Area',
        keep=['Urban', 'Rural', 'Total'],
    ),
}

In [4]:
def availability(group, criteria_by_indicator=None):
    '''Flag one (Indicator, Country, Year) group as fully (1) or partially (0) available.

    The indicator's criteria name a column (``aggregate``) and the categories of
    that column that must be reported (``keep``). The group is fully available
    when every category in ``keep`` has at least one row with a non-null Value.
    Distinct categories are counted rather than raw rows, so rows that repeat a
    category across another dimension (e.g. one row per Nationality, or one row
    per age group) are not counted twice.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group. Must contain 'Indicator', 'Country', 'Year',
        'Value' and the column named by the indicator's ``aggregate`` entry.
        The frame is not modified.
    criteria_by_indicator : dict, optional
        Mapping indicator name -> {'age_grouped', 'aggregate', 'keep'}.
        Defaults to the notebook-level ``indicator_criteria``.

    Returns
    -------
    pandas.DataFrame
        age_grouped == 'no': the rows of ``group`` whose category is in ``keep``,
        with an added 'availability' column.
        age_grouped == 'yes': one row per (Indicator, Country, Year, category)
        holding the first non-null Value, plus the 'availability' column.

    Raises
    ------
    KeyError
        If the group's indicator has no entry in the criteria mapping.
    ValueError
        If ``age_grouped`` is neither 'yes' nor 'no'.
    '''
    if criteria_by_indicator is None:
        criteria_by_indicator = indicator_criteria

    # Nothing to classify; hand back an empty frame that still has the flag column.
    if group.empty:
        return group.assign(availability=0)

    # All rows of a group share the same indicator, so the first row is enough.
    ind = group['Indicator'].iloc[0]
    if ind not in criteria_by_indicator:
        raise KeyError(f'No availability criteria defined for indicator {ind!r}')

    criteria = criteria_by_indicator[ind]
    col = criteria['aggregate']
    to_keep = criteria['keep']
    age_grouped = criteria['age_grouped']
    if age_grouped not in ('yes', 'no'):
        raise ValueError(f"age_grouped must be 'yes' or 'no', got {age_grouped!r}")

    # .copy() so that adding the flag column never touches the caller's data.
    df_filtered = group[group[col].isin(to_keep)].copy()

    if age_grouped == 'yes':
        # Collapse the age groups: one row per category, keeping the first
        # non-null Value ('first' skips missing values).
        df_filtered = (
            df_filtered[['Indicator', 'Country', 'Year', col, 'Value']]
            .groupby(['Indicator', 'Country', 'Year', col], as_index=False)
            .agg({'Value': 'first'})
        )

    # Number of required categories that have at least one reported value.
    reported = df_filtered.loc[df_filtered['Value'].notna(), col].nunique()

    # availability = 1 for total response, 0 for partial response
    df_filtered['availability'] = int(reported == len(set(to_keep)))

    return df_filtered

In [5]:
   
# Score every (Indicator, Country, Year) group with `availability`.
# The columns are selected explicitly after the groupby so the grouping columns
# are still handed to `availability` (it reads group['Indicator']) without
# relying on the deprecated implicit behaviour of DataFrameGroupBy.apply.
df_grouped = (
    df.groupby(['Indicator', 'Country', 'Year'], group_keys=False)[df.columns.tolist()]
    .apply(availability)
)

df_grouped.to_excel('grouped.xlsx', index=False)

  df_grouped = df.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability)


In [6]:
# Preview the first rows of the per-group availability result.
df_grouped.head()

Unnamed: 0,Indicator,Country,Age group,Area,Sex,Nationality,Year,Value,datapoint_available,availability
315,Mean age at first marriage,Bahrain,Total,Total,Male,Total,2010,27,1,1
316,Mean age at first marriage,Bahrain,Total,Total,Female,Total,2010,23,1,1
317,Mean age at first marriage,Bahrain,Total,Total,Male,Total,2013,26,1,1
318,Mean age at first marriage,Bahrain,Total,Total,Female,Total,2013,23,1,1
320,Mean age at first marriage,Bahrain,Total,Total,Male,Total,2016,27,1,1


#### Get the response rates

In [7]:
# One row per (Indicator, Country, Year), carrying the highest availability flag
# found among that group's rows.
df_summary = df_grouped.groupby(
    ['Indicator', 'Country', 'Year'], as_index=False
).agg({'availability': 'max'})

df_summary.to_excel('availability_summary.xlsx', index=False)

In [9]:

def calculate_availability_percentage(group, total_years=25):
    '''Attach to every row the percentage of years sharing that row's availability flag.

    For each distinct value of 'availability' in ``group`` the number of rows
    with that value is divided by ``total_years`` and multiplied by 100; the
    result is written to a new 'availability_percentage' column on the rows
    carrying that value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain an 'availability' column.
        The frame is not modified.
    total_years : int or float, default 25
        Denominator of the percentage. The default keeps the value that was
        previously hard-coded; presumably the number of years in the reporting
        window - confirm against the data specification.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the 'availability_percentage' column added.

    Raises
    ------
    ValueError
        If ``total_years`` is not positive.
    '''
    if total_years <= 0:
        raise ValueError('total_years must be a positive number')

    # Number of rows per availability value (0s and 1s)
    counts = group['availability'].value_counts()

    # Share of the expected years, as a percentage
    percentages = (counts / total_years) * 100

    # assign() returns a new frame, so the frame passed in stays untouched.
    return group.assign(
        availability_percentage=group['availability'].map(percentages)
    )



# Response rate per (Indicator, Country).
# The columns are selected explicitly after the groupby so that 'Indicator' and
# 'Country' stay in each group (and therefore in the result) without relying on
# the deprecated implicit behaviour of DataFrameGroupBy.apply.
availability_rr = (
    df_summary.groupby(['Indicator', 'Country'], group_keys=False)[df_summary.columns.tolist()]
    .apply(calculate_availability_percentage)
)
availability_rr.to_excel('availability_RR.xlsx', index=False)


availability_rr.head()

  availability_rr = df_summary.groupby(['Indicator', 'Country'], group_keys=False).apply(calculate_availability_percentage)


Unnamed: 0,Indicator,Country,Year,availability,availability_percentage
0,Mean age at first marriage,Bahrain,2010,1,20.0
1,Mean age at first marriage,Bahrain,2013,1,20.0
2,Mean age at first marriage,Bahrain,2016,1,20.0
3,Mean age at first marriage,Bahrain,2017,1,20.0
4,Mean age at first marriage,Bahrain,2019,1,20.0


In [None]:
# NOTE(review): disabled code - appears to be an earlier draft of `availability`.
# This variant receives the expected count as an explicit `cnt` argument and
# counts non-null Values on the group as given, without any per-indicator filtering.
# def availability(group,cnt):
    
#     # Initialize availability
#     group['availability'] = 0
#     #count the non na-s
#     count = int(group['Value'].notna().sum())
#     #availability= 1 for total response, 0 for partial response
#     if count==cnt:
#         availability=1
#     else:
#         availability=0

#     group['availability']=availability

#     return group

In [None]:
# NOTE(review): disabled code - appears to be an earlier, loop-based draft of the
# availability pipeline. Do not re-enable as-is:
#   * it reads criteria['count'], a key that the indicator_criteria entries in
#     this notebook do not define;
#   * the age_grouped=='yes' branch computes df_grouped but never appends it to df_list.
# indicators=list(df['Indicator'].unique())

# df_list=[]

# for ind in indicators:
#     #filter on the indicator
#     df_sub=df[df['Indicator']==ind].copy()
#     criteria=indicator_criteria[ind]

#     #get the column and value to filter on
#     col=criteria['aggregate']
#     to_keep=criteria['keep']
#     cnt=criteria['count']
#     age_grouped=criteria['age_grouped']
    
#     if age_grouped=='no':

#         df_filtered=df_sub[df_sub[col].isin(to_keep)].copy()
#         #group by sex and calculate total/partial availability, the group_keys=False not to include group labels as part of the index
#         df_grouped = df_filtered.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
#         #append to the df_list
#         df_list.append(df_grouped)
    
#     if age_grouped=='yes':
#         df_filtered=df_sub[df_sub[col].isin(to_keep)].copy()
#         #aggregate on agegroup
#         df_grouped_age = df_filtered.groupby(['Indicator','Country', 'Year', col], as_index=False).agg({'Value': 'max'})
#         df_grouped = df_grouped_age.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)


# final_df = pd.concat(df_list, ignore_index=True)
# final_df.to_excel('grouped.xlsx',index=False)


  df_grouped = df_filtered.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
  df_grouped = df_filtered.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
  df_grouped = df_filtered.groupby(['Indicator','Country', 'Year'], group_keys=False).apply(availability,cnt)
