In [32]:
import pandas as pd
import numpy as np
import os

# Set options to display all rows and columns without truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

MAIN_DATA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx'
CRITERIA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/criterias.xlsx'

In [33]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Creates a dictionary mapping indicator names to their availability criteria.
    """
    if key_language.lower() == 'english':
        key_col = 'Indicator_En'
    elif key_language.lower() == 'arabic':
        key_col = 'Indicator_Ar'
    else:
        raise ValueError("key_language must be 'english' or 'arabic'")
    criteria_df.dropna(subset=[key_col], inplace=True)
    return pd.Series(criteria_df.criteria.values, index=criteria_df[key_col]).to_dict()
    """
    Calculates availability for each group, returning a collapsed Series (one result per group).
    This function checks if data points for an indicator are present consistently across defined time windows.
    
    Example of the process for a single indicator group:
    1. indicator_name = group.name[...]
       This line just gets the name of the indicator we are working on.
       indicator_name = "Literacy rate"

    2. criteria = criteria_dict.get(indicator_name, 1)
       This looks up the "Literacy rate" in our criteria dictionary and finds its requirement.
       criteria = 2 (meaning we need at least 2 data points per 5-year window)

    3. binned_years = pd.cut(...)
       This is the categorization step. It takes our list of years and puts each one into a 5-year "bucket".
       2011 -> [2010, 2015), 2012 -> [2010, 2015)
       2016 -> [2015, 2020), 2018 -> [2015, 2020)
       2021 -> [2020, 2025), 2022 -> [2020, 2025), 2023 -> [2020, 2025)

    4. window_counts = binned_years.value_counts()
       This step counts how many data points landed in each bucket.
       [2010, 2015): 2
       [2015, 2020): 2
       [2020, 2025): 3

    5. windows_with_sufficient_data = window_counts[window_counts >= criteria]
       This is a filter. It keeps only buckets where the count meets our criteria (>= 2).
       [2010, 2015): Kept (because 2 >= 2)
       [2015, 2020): Kept (because 2 >= 2)
       [2020, 2025): Kept (because 3 >= 2)

    6. sufficient_windows_set = set(windows_with_sufficient_data.index)
       This creates a clean, unique list of the windows that passed the filter.
       sufficient_windows_set = { [2010, 2015), [2015, 2020), [2020, 2025) }

    7. return 1 if len(...) == len(...) else 0
       The final check compares the set of windows with sufficient data against the set of ALL possible windows in our universal time range.
       If they match perfectly, it means the indicator is fully available (returns 1), otherwise it's not (returns 0).
    """


In [40]:
#LOAD AND CLEAN DATA
try:
    main_df = pd.read_excel(MAIN_DATA_FILE)
    criteria_df = pd.read_excel(CRITERIA_FILE)
    print("Files read successfully.")
except FileNotFoundError as e:
    print(f"Error reading files: {e}. Make sure the paths are correct.")
    raise e

main_df.rename(columns={'Theme': 'الفصل'}, inplace=True)

# Keep only rows with a non-null value before doing anything else
main_df = main_df[main_df['العدد'].notna()].copy()

# --- NEW BINNING LOGIC ---
def assign_bin_hardcoded(year):
    """Manually assigns a year to a specific, hardcoded bin."""
    if 2010 <= year < 2015:
        return '[2010-2015)'
    elif 2015 <= year < 2020:
        return '[2015-2020)'
    elif 2020 <= year <= 2025:  # NOTE: Includes 2025 as requested
        return '[2020-2025]'
    else:
        return np.nan


print("Creating year bins with hardcoded ranges...")
main_df['year_bins'] = main_df['السنة'].apply(assign_bin_hardcoded)
print("Data cleaning and binning complete.")
    
#get the criteria dictionary
criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")

Files read successfully.
Creating year bins with hardcoded ranges...
Data cleaning and binning complete.
Criteria dictionary created with 85 entries.


In [42]:
def calculate_availability(group, criteria_dict):
    """Calculates availability for a single indicator/country group."""
    required_bins = {'[2010-2015)', '[2015-2020)', '[2020-2025]'}
    if 'year_bins' not in group.columns or group.empty:
        return 0
    indicator_name = group['المؤشر'].iloc[0]
    criteria = criteria_dict.get(indicator_name, 1)
    bins_in_data = set(group['year_bins'].dropna().unique())
    bins_are_complete = (bins_in_data == required_bins)
    all_counts_are_sufficient = False
    if bins_are_complete:
        counts_per_bin = group['year_bins'].value_counts()
        all_counts_are_sufficient = (counts_per_bin >= criteria).all()
    if bins_are_complete and all_counts_are_sufficient:
        return 1
    else:
        return 0

In [44]:
#  Calculate ALL Availability Scores
#Prepare Filtered DataFrames for Disaggregations
df_nat_filtered = main_df[main_df['المواطنة'].isin(['مواطنون', 'غير مواطنين'])]
df_area_filtered = main_df[main_df['المنطقة'].isin(['حضر', 'ريف'])]
#####################################################################################

#Group-Level Availability Scores
overall_availability_scores = main_df.groupby(['المؤشر', 'الدولة']).apply(
    calculate_availability, criteria_dict=criteria_dict_ar
).reset_index(name='التوفر الكلي')

nationality_availability_scores = df_nat_filtered.groupby(['المؤشر', 'الدولة']).apply(
    calculate_availability, criteria_dict=criteria_dict_ar
).reset_index(name='التوفر حسب المواطنية')

area_availability_scores = df_area_filtered.groupby(['المؤشر', 'الدولة']).apply(
    calculate_availability, criteria_dict=criteria_dict_ar
    ).reset_index(name='التوفر حسب المنطقة')

#########################################################################################

#Datapoint-Level Availability Scores
datapoint_scores_general = main_df.groupby(['المؤشر', 'الدولة', 'السنة']).size().reset_index(name='توفر نقطة البيانات (كلي)')
datapoint_scores_general['توفر نقطة البيانات (كلي)'] = 1

datapoint_scores_nat = df_nat_filtered.groupby(['المؤشر', 'الدولة', 'السنة']).size().reset_index(name='توفر نقطة البيانات (المواطنة)')
datapoint_scores_nat['توفر نقطة البيانات (المواطنة)'] = 1


datapoint_scores_area = df_area_filtered.groupby(['المؤشر', 'الدولة', 'السنة']).size().reset_index(name='توفر نقطة البيانات (المنطقة)')
datapoint_scores_area['توفر نقطة البيانات (المنطقة)'] = 1

  overall_availability_scores = main_df.groupby(['المؤشر', 'الدولة']).apply(
  nationality_availability_scores = df_nat_filtered.groupby(['المؤشر', 'الدولة']).apply(
  area_availability_scores = df_area_filtered.groupby(['المؤشر', 'الدولة']).apply(


In [None]:
# Create a Proper Masterfile Grid (Corrected)
print("Creating masterfile grid and merging all scores...")

# 1. Get the unique combinations of Theme, Indicator, and Country that ACTUALLY EXIST in your data.
valid_combinations = main_df[['الفصل', 'المؤشر', 'الدولة']].drop_duplicates()
print(f"Found {len(valid_combinations)} valid combinations of Theme, Indicator, and Country.")

# 2. Get the full list of years.
all_years = sorted(main_df['السنة'].unique())

# 3. Create the masterfile grid by performing a 'cross' merge between the valid combinations and all years.
# This is the correct way to build the grid without creating invalid rows.
masterfile_df = pd.merge(valid_combinations, pd.DataFrame({'السنة': all_years}), how='cross')

print(f"Created a clean masterfile grid with {len(masterfile_df)} rows.")

Creating masterfile grid and merging all scores...


In [46]:
# Merge Everything into the Masterfile

# Merge original data values first.
masterfile_df = pd.merge(masterfile_df, main_df, on=['الفصل', 'المؤشر', 'الدولة', 'السنة'], how='left')
# Merge the group-level scores.
masterfile_df = pd.merge(masterfile_df, overall_availability_scores, on=['المؤشر', 'الدولة'], how='left')
masterfile_df = pd.merge(masterfile_df, nationality_availability_scores, on=['المؤشر', 'الدولة'], how='left')
masterfile_df = pd.merge(masterfile_df, area_availability_scores, on=['المؤشر', 'الدولة'], how='left')

# Merge the datapoint-level scores.
masterfile_df = pd.merge(masterfile_df, datapoint_scores_general, on=['المؤشر', 'الدولة', 'السنة'], how='left')
masterfile_df = pd.merge(masterfile_df, datapoint_scores_nat, on=['المؤشر', 'الدولة', 'السنة'], how='left')
masterfile_df = pd.merge(masterfile_df, datapoint_scores_area, on=['المؤشر', 'الدولة', 'السنة'], how='left')

# 1. Define all the score columns you want to clean.
score_cols = [
    'التوفر الكلي',
    'التوفر حسب المواطنية',
    'التوفر حسب المنطقة',
    'توفر نقطة البيانات (كلي)',
    'توفر نقطة البيانات (المواطنة)',
    'توفر نقطة البيانات (المنطقة)'
]

# 2. Loop through the list and fill NaNs with 0 for each column found.
for col in score_cols:
    # Check if the column exists in the DataFrame to avoid errors
    if col in masterfile_df.columns:
        # .fillna(0, inplace=True) replaces all empty values in the column with 0
        masterfile_df[col].fillna(0, inplace=True)
        # Convert the column to integer type for clean data
        masterfile_df[col] = masterfile_df[col].astype(int)


#save the masterfile
cols_to_keep=['الفصل', 'المؤشر', 'الدولة', 'السنة',  'المواطنة',
       'العدد', 'المنطقة',  'التوفر الكلي',
       'التوفر حسب المواطنية', 'التوفر حسب المنطقة',
       'توفر نقطة البيانات (كلي)', 'توفر نقطة البيانات (المواطنة)',
       'توفر نقطة البيانات (المنطقة)']
grouping_keys = [
    'الفصل', 'المؤشر', 'الدولة', 'السنة',
    'المواطنة', 'المنطقة'
]

masterfile_disaggregated = masterfile_df[cols_to_keep]
masterfile_disaggregated.drop_duplicates(subset=grouping_keys, inplace=True)

masterfile_disaggregated.to_excel('masterfile_disaggregated.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'masterfile_disaggregated.xlsx'")

#save the masterfile
cols_to_keep=['الفصل', 'المؤشر', 'الدولة', 'السنة',
       'العدد', 'التوفر الكلي',
       'التوفر حسب المواطنية', 'التوفر حسب المنطقة',
       'توفر نقطة البيانات (كلي)', 'توفر نقطة البيانات (المواطنة)',
       'توفر نقطة البيانات (المنطقة)']
grouping_keys = [
    'الفصل', 'المؤشر', 'الدولة', 'السنة'
]

masterfile = masterfile_df[cols_to_keep]
masterfile.drop_duplicates(subset=grouping_keys, inplace=True)

masterfile.to_excel('masterfile.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'masterfile.xlsx'")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  masterfile_df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  masterfile_df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

Successfully saved 'masterfile_disaggregated.xlsx'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  masterfile.drop_duplicates(subset=grouping_keys, inplace=True)


Successfully saved 'masterfile.xlsx'


In [47]:
masterfile_df.columns

Index(['الفصل', 'المؤشر', 'الدولة', 'السنة', 'المواطنة', 'العدد', 'المصدر',
       'المنطقة', 'الجنس', 'الفئة العمرية', 'الحالة الزوجية',
       'التصنيف الدولي لاسباب الوفاة', 'سبب الوفاة',
       'أسباب البقاء خارج القوى العاملة', 'وضع العمالة',
       'أقسام النشاط الإقتصادي', 'القطاع المؤسسي', 'أقسام المهن الرئيسية',
       'نوع مكان الإقامة', 'نوع حيازة الوحدات السكنية', 'مصدر مياه الشرب',
       'أنواع نظام التخلص من مياه الصرف الصحي', 'مصدر الإضاءة',
       'المرحلة التعليمية', 'الفئة', 'نوع الخدمات/المنتجات', 'القطاع',
       'year_bins', 'التوفر الكلي', 'التوفر حسب المواطنية',
       'التوفر حسب المنطقة', 'توفر نقطة البيانات (كلي)',
       'توفر نقطة البيانات (المواطنة)', 'توفر نقطة البيانات (المنطقة)'],
      dtype='object')

In [48]:
#GENERATE AGGREGATED REPORTS

print("\n--- Generating and Saving All Aggregated Reports ---")
availability_cols = ['التوفر الكلي', 'التوفر حسب المواطنية', 'التوفر حسب المنطقة']

# --- File 1: main_availability.xlsx ---
# This file shows the final 0/1 availability for each indicator/country pair.
main_availability_agg_df = masterfile_df.groupby(['المؤشر', 'الفصل', 'الدولة']).agg({
    'التوفر الكلي': 'max',
    'التوفر حسب المواطنية': 'max',
    'التوفر حسب المنطقة': 'max'
}).reset_index()
main_availability_agg_df.to_excel('main_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'main_availability.xlsx'")


# --- File 2: main_availability_percentage.xlsx ---
# This file shows the percentage of available indicators per theme and country.
long_availability_df = main_availability_agg_df.melt(
    id_vars=['المؤشر', 'الفصل', 'الدولة'],
    value_vars=availability_cols,
    var_name='نوع التوفر',
    value_name='متوفر'
)
total_indicators = main_df['المؤشر'].nunique()
availability_sums = long_availability_df.groupby(['الفصل', 'الدولة', 'نوع التوفر'])['متوفر'].apply(
    lambda x: (x.sum() / total_indicators) * 100 if total_indicators > 0 else 0
).reset_index(name='نسبة التوفر')
availability_sums = availability_sums[availability_sums['نوع التوفر'] == 'التوفر كلي']
availability_sums.to_excel('main_availability_percentage.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'main_availability_percentage.xlsx'")


# --- File 3: theme_country_availability.xlsx ---
# This file shows the number and percentage of available indicators for each theme/country.
indicators_per_theme = main_df.groupby('الفصل')['المؤشر'].nunique().reset_index(name='total_indicators_in_theme')
theme_country_sums = main_availability_agg_df.groupby(['الفصل', 'الدولة'])[availability_cols].sum().reset_index()
theme_country_agg_df = pd.merge(theme_country_sums, indicators_per_theme, on='الفصل', how='left')
for col in availability_cols:
    theme_country_agg_df[f'{col}_نسبة'] = (theme_country_agg_df[col] / theme_country_agg_df['total_indicators_in_theme']) * 100
theme_country_agg_df.to_excel('theme_country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'theme_country_availability.xlsx'")


# --- File 4: indicator_country_availability.xlsx ---
# This file shows the number and percentage of countries that have data for each indicator.
indicator_sums = main_availability_agg_df.groupby('المؤشر')[availability_cols].sum().reset_index()
total_countries = main_df['الدولة'].nunique()
if total_countries > 0:
    for col in availability_cols:
        indicator_sums[f'{col}_نسبة'] = (indicator_sums[col] / total_countries) * 100
indicator_sums.to_excel('indicator_country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'indicator_country_availability.xlsx'")


# --- File 5: country_availability.xlsx ---
# This file shows the number and percentage of available indicators for each country.
country_sums = main_availability_agg_df.groupby('الدولة')[availability_cols].sum().reset_index()
if total_indicators > 0:
    for col in availability_cols:
        country_sums[f'{col}_نسبة'] = (country_sums[col] / total_indicators) * 100
country_sums['التوفر السابق'] = ''
country_sums.to_excel('country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'country_availability.xlsx'")


--- Generating and Saving All Aggregated Reports ---
Successfully saved 'main_availability.xlsx'
Successfully saved 'main_availability_percentage.xlsx'
Successfully saved 'theme_country_availability.xlsx'
Successfully saved 'indicator_country_availability.xlsx'
Successfully saved 'country_availability.xlsx'


In [51]:
main_df[main_df['المؤشر']=='نسبة العمالة إلى السكان']['الفصل'].unique()

array(['العمالة'], dtype=object)

In [52]:
masterfile[masterfile['المؤشر']=='نسبة العمالة إلى السكان']['الفصل'].unique()

array(['الصحة', 'السكان', 'العمالة', 'السكن', 'التعليم', 'الفقر'],
      dtype=object)