In [7]:
import pandas as pd
import numpy as np
import os

# Set options to display all rows and columns without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Folder that holds the two input workbooks. The default is the folder the
# notebook was originally written against; set the DSS_DATA_DIR environment
# variable to run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'DSS_DATA_DIR',
    'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes',
).rstrip('/\\')

MAIN_DATA_FILE = f'{DATA_DIR}/arabic_questionnaires.xlsx'
CRITERIA_FILE = f'{DATA_DIR}/criterias.xlsx'

In [2]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Creates a dictionary mapping indicator names to their availability criteria.

    Args:
        criteria_df (pd.DataFrame): Table with a 'criteria' column and the
            indicator-name column for the chosen language ('Indicator_En' or
            'Indicator_Ar'). The frame is not modified.
        key_language (str): 'english' or 'arabic' (case-insensitive); selects
            which indicator-name column supplies the dictionary keys.

    Returns:
        dict: indicator name -> value of the 'criteria' column. Rows whose
        indicator name is missing are skipped.

    Raises:
        ValueError: If key_language is neither 'english' nor 'arabic'.
    """
    key_columns = {'english': 'Indicator_En', 'arabic': 'Indicator_Ar'}
    key_col = key_columns.get(key_language.lower())
    if key_col is None:
        raise ValueError("key_language must be 'english' or 'arabic'")

    # Filter into a new frame instead of dropping rows in place: the caller's
    # table must stay intact so it can be reused (for example to build the
    # dictionary for the other language afterwards).
    named_rows = criteria_df.dropna(subset=[key_col])
    return pd.Series(named_rows['criteria'].values, index=named_rows[key_col]).to_dict()


In [3]:
#LOAD AND CLEAN DATA ---

try:
    main_df = pd.read_excel(MAIN_DATA_FILE)
    criteria_df = pd.read_excel(CRITERIA_FILE)
    print("Files read successfully.")
except FileNotFoundError as e:
    print(f"Error reading files: {e}. Make sure the paths are correct.")
    # Bare raise re-raises the active exception unchanged.
    raise

print("Cleaning source data...")
# Drop every column whose label starts with 'Unnamed'. Labels are cast to str
# first so that a non-string header (e.g. a number) cannot break the mask.
# The copy detaches the result from the frame returned by read_excel.
main_df = main_df.loc[:, ~main_df.columns.astype(str).str.startswith('Unnamed')].copy()
if 'Theme' in main_df.columns:
    main_df = main_df.rename(columns={'Theme': 'الفصل'})
# Trim surrounding whitespace on text cells only. Using the .str accessor here
# would replace every non-string value of a mixed object column with NaN, and
# the notna filter below would then silently discard those rows.
for col in main_df.select_dtypes(include=['object']).columns:
    main_df[col] = main_df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# Keep only rows with a non-null value before doing anything else
main_df = main_df[main_df['العدد'].notna()].copy()

# --- NEW BINNING LOGIC ---
def assign_bin_hardcoded(year):
    """
    Map a year to the label of the fixed time window that contains it.

    Windows: '[2010-2015)', '[2015-2020)' and '[2020-2025]' (the last one is
    closed on both ends, so 2025 is included). Any value that does not compare
    as lying within 2010..2025 - including NaN - yields np.nan.
    """
    # Guard clause: everything outside the covered span has no bin.
    if not (2010 <= year <= 2025):
        return np.nan
    if year < 2015:
        return '[2010-2015)'
    if year < 2020:
        return '[2015-2020)'
    return '[2020-2025]'


print("Creating year bins with hardcoded ranges...")
# Label every row with the fixed time window its year falls into.
year_labels = main_df['السنة'].apply(assign_bin_hardcoded)
main_df['year_bins'] = year_labels
print("Data cleaning and binning complete.")

# Build the indicator -> criteria lookup keyed by the Arabic indicator names.
criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
entry_count = len(criteria_dict_ar)
print(f"Criteria dictionary created with {entry_count} entries.")

Files read successfully.
Cleaning source data...
Creating year bins with hardcoded ranges...
Data cleaning and binning complete.
Criteria dictionary created with 85 entries.


In [None]:
#CREATE MASTERFILE
print("\n--- Creating Detailed Masterfile ---")

# 1. Get all unique values for each of the key categorical columns from the main DataFrame.
unique_themes = main_df['الفصل'].unique()
unique_indicators = main_df['المؤشر'].unique()
unique_countries = main_df['الدولة'].unique()

print(f"Found {len(unique_themes)} unique themes, {len(unique_indicators)} indicators, and {len(unique_countries)} countries.")

# 2. Create the "blank file": every (theme, indicator) pair that occurs in the
# data, repeated for every country.
# A plain three-way product would also list each indicator under themes it
# never appears with. Scores are merged on (indicator, country) only, so those
# extra rows would carry the indicator's score into unrelated themes and
# distort every per-theme total.
theme_indicator_pairs = main_df[['الفصل', 'المؤشر']].drop_duplicates()
country_frame = pd.DataFrame({'الدولة': unique_countries})
# Cross join through a constant helper key (works on every pandas version).
masterfile = (
    theme_indicator_pairs.assign(_cross_key=1)
    .merge(country_frame.assign(_cross_key=1), on='_cross_key')
    .drop(columns='_cross_key')
    .reset_index(drop=True)
)


In [9]:
#function to calculate availability for each group
def calculate_availability(group, criteria_dict):
    """
    Calculates availability for a single indicator/country group based on two conditions.

    Availability is 1 if and only if both conditions are met:
    1. The group contains data for ALL three of the required hardcoded time bins.
    2. The number of rows in EACH of those bins is >= the criteria from the dictionary.

    Rows whose 'year_bins' value is missing (a year outside every bin) are
    ignored: they neither help nor disqualify the group.

    Note: counts are row counts. A caller that wants "distinct years per bin"
    must pass a group that holds one row per year.

    Args:
        group (pd.DataFrame): A DataFrame slice for a single indicator/country.
            Must contain the columns 'المؤشر' and 'year_bins'.
        criteria_dict (dict): A dictionary mapping indicator names to their criteria value (int).
            Indicators missing from the dictionary use a criteria of 1.

    Returns:
        int: 1 if available, 0 if not available (also 0 for an empty group).
    """
    # Define the complete set of bins that must be present in the data.
    required_bins = {'[2010-2015)', '[2015-2020)', '[2020-2025]'}

    # An empty slice has no indicator name to look up and no data at all.
    if len(group) == 0:
        return 0

    # Get necessary values from the input group
    indicator_name = group['المؤشر'].iloc[0]
    criteria = criteria_dict.get(indicator_name, 1)

    # Condition 1: the bins present must be exactly the required ones.
    # Missing labels are dropped first; otherwise a single out-of-range year
    # adds NaN to the set and the comparison can never succeed.
    bins_in_data = set(group['year_bins'].dropna().unique())
    if bins_in_data != required_bins:
        return 0

    # Condition 2: every bin must hold at least `criteria` rows.
    # value_counts() skips missing labels by default.
    counts_per_bin = group['year_bins'].value_counts()
    return 1 if (counts_per_bin >= criteria).all() else 0



#### Calculate availability for each group

In [None]:
# Calculate availability scores and merge them into the masterfile.
#
# Per indicator/country: overall, nationality and area availability (0/1 from
# calculate_availability). Per indicator/country/year: datapoint availability
# flags. Everything is merged into `masterfile`, which is also bound to
# `masterfile_df` because the report cell reads that name.

GROUP_KEYS = ['المؤشر', 'الدولة']
POINT_KEYS = GROUP_KEYS + ['السنة']

# The overall score uses the spelling the report cell and the bin-count cell
# read ('التوفر كلي'); the earlier spelling 'التوفر الكلي' matched neither.
OVERALL_COL = 'التوفر كلي'
LEGACY_OVERALL_COL = 'التوفر الكلي'
NATIONALITY_COL = 'التوفر حسب المواطنية'
AREA_COL = 'التوفر حسب المنطقة'
POINT_OVERALL_COL = 'توفر نقطة البيانات (كلي)'
POINT_NATIONALITY_COL = 'توفر نقطة البيانات (المواطنة)'
POINT_AREA_COL = 'توفر نقطة البيانات (المنطقة)'


def _score_groups(frame, score_col):
    """Return one row per indicator/country with its calculate_availability score.

    Groups are iterated explicitly so each slice still contains the grouping
    columns that calculate_availability reads, and an empty input yields an
    empty, correctly labelled frame.
    """
    rows = [
        (indicator, country, calculate_availability(group, criteria_dict=criteria_dict_ar))
        for (indicator, country), group in frame.groupby(GROUP_KEYS)
    ]
    return pd.DataFrame(rows, columns=GROUP_KEYS + [score_col])


def _rows_in_categories(frame, column, categories):
    """Return the rows whose `column` is one of `categories` (no rows if the column is absent)."""
    if column not in frame.columns:
        return frame.iloc[0:0]
    return frame[frame[column].isin(categories)]


def _yearly_max(filtered):
    """Collapse `filtered` to one row per indicator/country/year.

    Keeps the maximum value and the year's bin label; the label is required by
    calculate_availability and is constant within a year.
    """
    return filtered.groupby(POINT_KEYS).agg(
        max_value=('العدد', 'max'),
        year_bins=('year_bins', 'first'),
    ).reset_index()


# Drop score columns left over from a previous run of this cell so that
# re-running it does not produce duplicated _x/_y columns.
_score_columns = [
    OVERALL_COL, LEGACY_OVERALL_COL, NATIONALITY_COL, AREA_COL,
    POINT_OVERALL_COL, POINT_NATIONALITY_COL, POINT_AREA_COL,
]
masterfile = masterfile.drop(columns=[c for c in _score_columns if c in masterfile.columns])

# The datapoint scores are keyed by year. If the blank masterfile has no year
# column yet, give every row one copy per year observed in the data (cross
# join through a constant helper key); merging on 'السنة' would otherwise fail.
if 'السنة' not in masterfile.columns:
    observed_years = pd.DataFrame({'السنة': main_df['السنة'].dropna().unique()})
    masterfile = (
        masterfile.assign(_cross_key=1)
        .merge(observed_years.assign(_cross_key=1), on='_cross_key')
        .drop(columns='_cross_key')
    )

# Overall group-level availability.
# NOTE(review): main_df can hold several rows per year, so the per-bin counts
# here are row counts, whereas the nationality/area scores below count one row
# per year - confirm which definition of "data point" is intended.
overall_availability_scores = _score_groups(main_df, OVERALL_COL)
masterfile = pd.merge(masterfile, overall_availability_scores, on=GROUP_KEYS, how='left')

#General Datapoint Availability
# One row per indicator/country/year; the value is non-null by construction of
# main_df, so the datapoint is available.
df_general = main_df[POINT_KEYS + ['العدد', 'year_bins']].drop_duplicates(subset=POINT_KEYS).copy()
df_general[POINT_OVERALL_COL] = 1
datapoint_scores_general = df_general[POINT_KEYS + [POINT_OVERALL_COL]]
masterfile = pd.merge(masterfile, datapoint_scores_general, on=POINT_KEYS, how='left')

####################################################################################
# Nationality Availability
df_nat_filtered = _rows_in_categories(main_df, 'المواطنة', ['مواطنون', 'غير مواطنين'])
df_nat_agg = _yearly_max(df_nat_filtered)

nationality_availability_scores = _score_groups(df_nat_agg, NATIONALITY_COL)
masterfile = pd.merge(masterfile, nationality_availability_scores, on=GROUP_KEYS, how='left')

#Nationality Datapoint Availability
# 1 if a max value was found for the year, 0 otherwise
df_nat_agg[POINT_NATIONALITY_COL] = np.where(df_nat_agg['max_value'].notna(), 1, 0)
datapoint_scores_nat = df_nat_agg[POINT_KEYS + [POINT_NATIONALITY_COL]]
masterfile = pd.merge(masterfile, datapoint_scores_nat, on=POINT_KEYS, how='left')

######################################################################################
# Area Availability
df_area_filtered = _rows_in_categories(main_df, 'المنطقة', ['حضر', 'ريف'])
df_area_agg = _yearly_max(df_area_filtered)

area_availability_scores = _score_groups(df_area_agg, AREA_COL)
masterfile = pd.merge(masterfile, area_availability_scores, on=GROUP_KEYS, how='left')

#Area Datapoint Availability
df_area_agg[POINT_AREA_COL] = np.where(df_area_agg['max_value'].notna(), 1, 0)
datapoint_scores_area = df_area_agg[POINT_KEYS + [POINT_AREA_COL]]
masterfile = pd.merge(masterfile, datapoint_scores_area, on=POINT_KEYS, how='left')

######################################################################################

# Single source of truth: both names refer to the fully merged frame, so no
# column is lost between `masterfile` and `masterfile_df`.
masterfile_df = masterfile

masterfile.head(30)


In [58]:
# %% --- CELL 5: CALCULATE AVAILABILITY SCORES (CORRECTED LOGIC) ---

print("\n--- Calculating Availability Scores ---")

# Rule applied in this cell: a score is 1 only when the indicator/country pair
# has data in all 3 of the defined time bins; otherwise it is 0.

# --- General Availability ---
# Number of distinct bins covered by each indicator/country pair.
general_bin_counts = main_df.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
final_general_scores = general_bin_counts.eq(3).astype(int)

# --- Nationality Availability ---
# Same rule, restricted to rows tagged with one of the two nationality categories.
nationality_mask = main_df['المواطنة'].isin(['مواطنون', 'غير مواطنين'])
df_nationality = main_df[nationality_mask]
nat_bin_counts = df_nationality.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
final_nationality_scores = nat_bin_counts.eq(3).astype(int)

# --- Area Availability ---
# Same rule for the two area categories; without an area column there is
# nothing to score, so an empty integer Series stands in.
if 'المنطقة' not in main_df.columns:
    final_area_scores = pd.Series(dtype=int)
else:
    area_mask = main_df['المنطقة'].isin(['حضر', 'ريف'])
    df_area = main_df[area_mask]
    area_bin_counts = df_area.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
    final_area_scores = area_bin_counts.eq(3).astype(int)

# --- Combine the results ---
# The three Series are aligned on their index; pairs missing from one of them
# come out as NaN and are filled with 0.
score_table = pd.DataFrame({
    'التوفر كلي': final_general_scores,
    'التوفر حسب المواطنية': final_nationality_scores,
    'التوفر حسب المنطقة': final_area_scores
})
indicator_country_scores = score_table.reset_index().fillna(0)

print("Availability scores calculated successfully.")
# You can now display `indicator_country_scores.head()` in a new cell to inspect the final result.


--- Calculating Availability Scores ---
Availability scores calculated successfully.


In [None]:
#GENERATE AGGREGATED REPORTS

print("\n--- Generating and Saving All Aggregated Reports ---")
availability_cols = ['التوفر كلي', 'التوفر حسب المواطنية', 'التوفر حسب المنطقة']


def _write_report(frame, filename):
    """Write `frame` to `filename` with openpyxl (no index column) and log the save."""
    frame.to_excel(filename, index=False, engine='openpyxl')
    print(f"Successfully saved '{filename}'")


def _add_percentage_columns(frame, denominator):
    """Add one '<col>_نسبة' column per availability column: col / denominator * 100."""
    for col in availability_cols:
        frame[f'{col}_نسبة'] = (frame[col] / denominator) * 100


# --- File 1: main_availability.xlsx ---
# Maximum of each availability column per indicator/theme/country.
main_availability_agg_df = (
    masterfile_df
    .groupby(['المؤشر', 'الفصل', 'الدولة'])
    .agg({col: 'max' for col in availability_cols})
    .reset_index()
)
_write_report(main_availability_agg_df, 'main_availability.xlsx')


# --- File 2: main_availability_percentage.xlsx ---
# Per theme and country: sum of the overall availability flags as a percentage
# of the number of distinct indicators in main_df.
long_availability_df = main_availability_agg_df.melt(
    id_vars=['المؤشر', 'الفصل', 'الدولة'],
    value_vars=availability_cols,
    var_name='نوع التوفر',
    value_name='متوفر'
)
total_indicators = main_df['المؤشر'].nunique()


def _share_of_all_indicators(flags):
    """Sum of `flags` as a percentage of total_indicators (0 when there are none)."""
    if total_indicators > 0:
        return (flags.sum() / total_indicators) * 100
    return 0


availability_sums = (
    long_availability_df
    .groupby(['الفصل', 'الدولة', 'نوع التوفر'])['متوفر']
    .apply(_share_of_all_indicators)
    .reset_index(name='نسبة التوفر')
)
# Only the overall availability type is kept in this report.
availability_sums = availability_sums[availability_sums['نوع التوفر'] == 'التوفر كلي']
_write_report(availability_sums, 'main_availability_percentage.xlsx')


# --- File 3: theme_country_availability.xlsx ---
# Per theme and country: sums of the availability columns, plus each sum as a
# percentage of the number of distinct indicators in that theme.
indicators_per_theme = (
    main_df.groupby('الفصل')['المؤشر']
    .nunique()
    .reset_index(name='total_indicators_in_theme')
)
theme_country_sums = (
    main_availability_agg_df
    .groupby(['الفصل', 'الدولة'])[availability_cols]
    .sum()
    .reset_index()
)
theme_country_agg_df = theme_country_sums.merge(indicators_per_theme, on='الفصل', how='left')
_add_percentage_columns(theme_country_agg_df, theme_country_agg_df['total_indicators_in_theme'])
_write_report(theme_country_agg_df, 'theme_country_availability.xlsx')


# --- File 4: indicator_country_availability.xlsx ---
# Per indicator: sums of the availability columns, plus each sum as a
# percentage of the number of distinct countries in main_df.
indicator_sums = main_availability_agg_df.groupby('المؤشر')[availability_cols].sum().reset_index()
total_countries = main_df['الدولة'].nunique()
if total_countries > 0:
    _add_percentage_columns(indicator_sums, total_countries)
_write_report(indicator_sums, 'indicator_country_availability.xlsx')


# --- File 5: country_availability.xlsx ---
# Per country: sums of the availability columns, plus each sum as a percentage
# of the number of distinct indicators in main_df.
country_sums = main_availability_agg_df.groupby('الدولة')[availability_cols].sum().reset_index()
if total_indicators > 0:
    _add_percentage_columns(country_sums, total_indicators)
# Empty placeholder column written to the report as-is.
country_sums['التوفر السابق'] = ''
_write_report(country_sums, 'country_availability.xlsx')


# --- File 6 & 7: Heatmap Files ---
# Disaggregated Heatmap: keyed by year, nationality and area as well.
heatmap_df_disaggregated = (
    masterfile_df
    .groupby(['المؤشر', 'الدولة', 'السنة', 'المواطنة', 'المنطقة', 'الفصل'])
    .agg({'التوفر كلي': 'max'})
    .reset_index()
)
_write_report(heatmap_df_disaggregated, 'heatmap_disaggregated.xlsx')

# Total (Aggregated) Heatmap: keyed by indicator, country, year and theme only.
heatmap_df_total = (
    masterfile_df
    .groupby(['المؤشر', 'الدولة', 'السنة', 'الفصل'])
    .agg({'التوفر كلي': 'max'})
    .reset_index()
)
_write_report(heatmap_df_total, 'heatmap_total.xlsx')

print("\nAnalysis complete.")


--- Generating and Saving All Aggregated Reports ---
Successfully saved 'main_availability.xlsx'
Successfully saved 'main_availability_percentage.xlsx'
Successfully saved 'theme_country_availability.xlsx'
Successfully saved 'indicator_country_availability.xlsx'
Successfully saved 'country_availability.xlsx'
Successfully saved 'heatmap_disaggregated.xlsx'
Successfully saved 'heatmap_total.xlsx'

Analysis complete.
