In [1]:
import pandas as pd
import numpy as np
import os

MAIN_DATA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx'
CRITERIA_FILE = 'C:/Users/511232/Desktop/criterias.xlsx'

In [2]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Creates a dictionary mapping indicator names to their availability criteria.

    Args:
        criteria_df (pd.DataFrame): DataFrame containing indicator names and criteria.
        key_language (str): 'english' or 'arabic'. Determines which indicator name to use as the key.

    Returns:
        dict: A dictionary mapping indicator names to their integer criteria.
    """
    if key_language.lower() == 'english':
        key_col = 'Indicator_En'
    elif key_language.lower() == 'arabic':
        key_col = 'Indicator_Ar'
    else:
        raise ValueError("key_language must be 'english' or 'arabic'")

    # Drop rows where the key column is NaN to avoid issues
    criteria_df.dropna(subset=[key_col], inplace=True)
    
    return pd.Series(criteria_df.criteria.values, index=criteria_df[key_col]).to_dict()


def calculate_availability(df, group_cols, criteria_dict, global_max_year, year_col='السنة', indicator_col='المؤشر', window_size=5):
    """
    Calculates availability for each group, returning a collapsed Series (one result per group).
    This function checks if data points for an indicator are present consistently across defined time windows.
    
    Example of the process for a single indicator group:
    1. indicator_name = group.name[...]
       This line just gets the name of the indicator we are working on.
       indicator_name = "Literacy rate"

    2. criteria = criteria_dict.get(indicator_name, 1)
       This looks up the "Literacy rate" in our criteria dictionary and finds its requirement.
       criteria = 2 (meaning we need at least 2 data points per 5-year window)

    3. binned_years = pd.cut(...)
       This is the categorization step. It takes our list of years and puts each one into a 5-year "bucket".
       2011 -> [2010, 2015), 2012 -> [2010, 2015)
       2016 -> [2015, 2020), 2018 -> [2015, 2020)
       2021 -> [2020, 2025), 2022 -> [2020, 2025), 2023 -> [2020, 2025)

    4. window_counts = binned_years.value_counts()
       This step counts how many data points landed in each bucket.
       [2010, 2015): 2
       [2015, 2020): 2
       [2020, 2025): 3

    5. windows_with_sufficient_data = window_counts[window_counts >= criteria]
       This is a filter. It keeps only buckets where the count meets our criteria (>= 2).
       [2010, 2015): Kept (because 2 >= 2)
       [2015, 2020): Kept (because 2 >= 2)
       [2020, 2025): Kept (because 3 >= 2)

    6. sufficient_windows_set = set(windows_with_sufficient_data.index)
       This creates a clean, unique list of the windows that passed the filter.
       sufficient_windows_set = { [2010, 2015), [2015, 2020), [2020, 2025) }

    7. return 1 if len(...) == len(...) else 0
       The final check compares the set of windows with sufficient data against the set of ALL possible windows in our universal time range.
       If they match perfectly, it means the indicator is fully available (returns 1), otherwise it's not (returns 0).
    """
    if df.empty:
        return pd.Series(dtype=int)

    # Determine the overall year range and create standard bins using the global max year.
    min_year = 2010
    bins = range(min_year, global_max_year + window_size + 1, window_size)
    
    # Create a set of all possible windows (bins) that could exist based on the global range.
    all_possible_windows = set(pd.cut(pd.Series(range(min_year, global_max_year + 1)), bins=bins, right=False).dropna().unique())
    
    results = {}
    grouped = df.groupby(group_cols)
    indicator_col_index = group_cols.index(indicator_col)

    for name, group in grouped:
        # 'name' is a tuple of the group keys, e.g., ('Indicator A', 'Country X')
        indicator_name = name[indicator_col_index]
        criteria = criteria_dict.get(indicator_name, 1)

        binned_years = pd.cut(group[year_col], bins=bins, right=False)
        window_counts = binned_years.value_counts()
        
        windows_with_sufficient_data = window_counts[window_counts >= criteria]
        sufficient_windows_set = set(windows_with_sufficient_data.index)
        
        results[name] = 1 if len(sufficient_windows_set) == len(all_possible_windows) else 0

    return pd.Series(results)

In [None]:
def main():
    """
    Main function to run the entire analysis pipeline.
    """
    # 1. Read and Clean Data
    try:
        main_df = pd.read_excel(MAIN_DATA_FILE)
        criteria_df = pd.read_excel(CRITERIA_FILE)
        print("Files read successfully.")
    except FileNotFoundError as e:
        print(f"Error reading files: {e}. Make sure they are in the correct directory.")
        return

    print("Cleaning source data...")
    main_df = main_df.loc[:, ~main_df.columns.str.startswith('Unnamed')]
    if 'Theme' in main_df.columns:
        main_df.rename(columns={'Theme': 'الفصل'}, inplace=True)
    for col in main_df.select_dtypes(include=['object']).columns:
        main_df[col] = main_df[col].str.strip()
    print("Data cleaning complete.")

    # 2. Prepare for Calculations
    criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
    print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")
    data_max_year = main_df['السنة'].max()
    print(f"Data maximum year is {data_max_year}. This will be used for availability calculation.")

    # 3. Calculate Availability Scores
    print("\n--- Step 1: Calculating Availability Scores ---")
    general_availability = calculate_availability(
        main_df[main_df['العدد'].notna()], ['المؤشر', 'الدولة'], criteria_dict_ar, data_max_year)
    nationality_availability = calculate_availability(
        main_df[main_df['العدد'].notna() & main_df['المواطنة'].notna() & ~main_df['المواطنة'].isin(['Not applicable', 'غير مطابق', 'Total'])],
        ['المؤشر', 'الدولة'], criteria_dict_ar, data_max_year)
    if 'المنطقة' in main_df.columns:
        area_availability = calculate_availability(
            main_df[main_df['العدد'].notna() & main_df['المنطقة'].notna() & ~main_df['المنطقة'].isin(['Not applicable', 'غير مطابق', 'Total'])],
            ['المؤشر', 'الدولة'], criteria_dict_ar, data_max_year)
    else:
        area_availability = pd.Series(dtype=int)

    indicator_country_scores = pd.DataFrame({
        'توفر كلي': general_availability,
        'توفر حسب المواطنية': nationality_availability,
        'توفر حسب المنطقة': area_availability
    }).reset_index()
    indicator_country_scores.rename(columns={'level_0': 'المؤشر', 'level_1': 'الدولة'}, inplace=True)
    indicator_country_scores.fillna(0, inplace=True)
    
    # --- Step 4: Create Masterfile (New Simplified Approach) ---
    print("\n--- Step 2: Creating Detailed Masterfile ---")
    """ 
    This section builds a complete grid for every combination of categories and years.
    This is essential for visualizations that need a value for every cell.
    """
    # Step A: Prepare a single data source with original data and calculated scores.
    # This includes the critical step of filling NaN in categorical columns to ensure merges work.
    source_data_with_scores = pd.merge(main_df, indicator_country_scores, on=['المؤشر', 'الدولة'], how='left')
    
    categorical_cols = ['الفصل', 'المؤشر', 'الدولة', 'المواطنة', 'المنطقة']
    existing_categorical_cols = [col for col in categorical_cols if col in source_data_with_scores.columns]
    
    # Fill NaNs with a placeholder string to make them mergeable
    for col in existing_categorical_cols:
        if source_data_with_scores[col].dtype == 'object':
            source_data_with_scores[col].fillna('N/A', inplace=True)

    # Step B: Create the "blank" master grid with all combinations and all years.
    unique_combinations = source_data_with_scores[existing_categorical_cols].drop_duplicates()
    print(f"Found {len(unique_combinations)} unique categorical combinations for the grid.")

    grid_max_year = max(data_max_year, 2025)
    all_years = pd.DataFrame({'السنة': range(2010, grid_max_year + 1)})
    
    complete_grid = unique_combinations.merge(all_years, how='cross')
    print(f"Created a complete grid with {len(complete_grid)} rows.")

    # Step C: Merge the prepared data onto the blank grid.
    merge_keys = existing_categorical_cols + ['السنة']
    masterfile_df = pd.merge(complete_grid, source_data_with_scores, on=merge_keys, how='left')
    
    availability_cols = ['توفر كلي', 'توفر حسب المواطنية', 'توفر حسب المنطقة']
    availability_cols_exist = [col for col in availability_cols if col in masterfile_df.columns]
    
    if availability_cols_exist:
        masterfile_df[availability_cols_exist] = masterfile_df.groupby(['المؤشر', 'الدولة'])[availability_cols_exist].ffill().bfill()
        masterfile_df[availability_cols_exist] = masterfile_df[availability_cols_exist].fillna(0).astype(int)

    final_master_cols = ['الفصل', 'المؤشر', 'الدولة', 'السنة', 'العدد', 'المواطنة', 'المنطقة'] + availability_cols
    cols_to_keep = [col for col in final_master_cols if col in masterfile_df.columns]
    masterfile_df = masterfile_df[cols_to_keep]

    masterfile_df.to_excel('masterfile_detailed_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'masterfile_detailed_availability.xlsx'")

    # --- Step 5: Generate and Save All Aggregated Reports ---
    print("\n--- Step 3: Generating and Saving All Aggregated Reports ---")

    # File 1: main_availability.xlsx
    main_availability_agg_df = masterfile_df.groupby(['المؤشر', 'الفصل', 'الدولة']).agg({
        'توفر كلي': 'max',
        'توفر حسب المواطنية': 'max',
        'توفر حسب المنطقة': 'max'
    }).reset_index()
    main_availability_agg_df.to_excel('main_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'main_availability.xlsx'")

    # Create a long-format DataFrame from the aggregated availability data
    long_availability_df = main_availability_agg_df.melt(
        id_vars=['المؤشر', 'الفصل', 'الدولة'],
        value_vars=['توفر كلي', 'توفر حسب المواطنية', 'توفر حسب المنطقة'],
        var_name='نوع التوفر',
        value_name='متوفر'
    )
    
    # Create a long-format DataFrame from the aggregated availability data
    long_availability_df = main_availability_agg_df.melt(
        id_vars=['المؤشر', 'الفصل', 'الدولة'],
        value_vars=['توفر كلي', 'توفر حسب المواطنية', 'توفر حسب المنطقة'],
        var_name='نوع التوفر',
        value_name='متوفر'
    )
    
    # Get the total number of unique indicators in the entire dataset to use as the denominator
    total_indicators = main_df['المؤشر'].nunique()

    # Group by theme, country, and availability type, then use a lambda function with apply
    # to calculate the percentage of available indicators over the total count in a single step.
    availability_sums = long_availability_df.groupby(['الفصل', 'الدولة', 'نوع التوفر'])['متوفر'].apply(
        lambda x: (x.sum() / total_indicators) * 100 if total_indicators > 0 else 0
    ).reset_index(name='نسبة التوفر')

    # Filter to keep only the 'توفر كلي' (Total Availability) type as requested
    availability_sums = availability_sums[availability_sums['نوع التوفر'] == 'توفر كلي']

    # Save the new aggregated file
    availability_sums.to_excel('main_availability_percentage.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'main_availability_percentage.xlsx'")


    # File 2: theme_country_availability.xlsx
    indicators_per_theme = main_df.groupby('الفصل')['المؤشر'].nunique().reset_index().rename(columns={'المؤشر': 'total_indicators_in_theme'})
    theme_country_sums = main_availability_agg_df.groupby(['الفصل', 'الدولة'])[availability_cols].sum().reset_index()
    theme_country_agg_df = pd.merge(theme_country_sums, indicators_per_theme, on='الفصل', how='left')
    for col in availability_cols:
        theme_country_agg_df[col + '_نسبة'] = (theme_country_agg_df[col] / theme_country_agg_df['total_indicators_in_theme']) * 100
    theme_country_agg_df.to_excel('theme_country_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'theme_country_availability.xlsx'")

    # File 3: indicator_country_availability.xlsx
    indicator_country_agg_df = main_availability_agg_df.groupby(['المؤشر', 'الدولة'])[availability_cols].max().reset_index()
    # Create a separate dataframe for the percentage calculation to be saved.
    # Sum the availabilities for each indicator to count how many countries have it.
    indicator_sums = indicator_country_agg_df.groupby('المؤشر')[availability_cols].sum().reset_index()
    
    # Get the total number of unique countries in the dataset
    total_countries = main_df['الدولة'].nunique()
    
    # Calculate the percentage over the total number of countries
    if total_countries > 0:
        for col in availability_cols:
            indicator_sums[col + '_نسبة'] = (indicator_sums[col] / total_countries) * 100
            
    indicator_sums.to_excel('indicator_country_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'indicator_country_availability.xlsx'")
    
    # File 4: country_availability.xlsx
    total_indicators = main_df['المؤشر'].nunique()
    country_sums = indicator_country_agg_df.groupby('الدولة')[availability_cols].sum().reset_index()
    if total_indicators > 0:
        for col in availability_cols:
            country_sums[col + '_نسبة'] = (country_sums[col] / total_indicators) * 100
    country_sums.to_excel('country_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'country_availability.xlsx'")

    # File 5: heatmap_disaggregated.xlsx
    heatmap_agg_dict = {
        'توفر كلي': 'max',
        'توفر حسب المواطنية': 'max',
        'توفر حسب المنطقة': 'max',
        'الفصل': 'first'
    }
    heatmap_group_cols = ['المؤشر', 'الدولة', 'السنة', 'المواطنة', 'المنطقة']
    existing_heatmap_cols = [col for col in heatmap_group_cols if col in masterfile_df.columns]
    heatmap_df = masterfile_df.groupby(existing_heatmap_cols).agg(heatmap_agg_dict).reset_index()
    heatmap_df.to_excel('heatmap_disaggregated.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'heatmap.xlsx'")

    # File 6: heatmap_total.xlsx
    heatmap_agg_dict = {
        'توفر كلي': 'max',
        'توفر حسب المواطنية': 'max',
        'توفر حسب المنطقة': 'max',
        'الفصل': 'first'
    }
    heatmap_group_cols = ['المؤشر', 'الدولة', 'السنة']
    existing_heatmap_cols = [col for col in heatmap_group_cols if col in masterfile_df.columns]
    heatmap_df = masterfile_df.groupby(existing_heatmap_cols).agg(heatmap_agg_dict).reset_index()
    heatmap_df.to_excel('heatmap_total.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'heatmap_total.xlsx'")
    
    # File 7: themes.xlsx
    print("\n--- Generating Theme-level Country Coverage Report ---")
    
    # Use the main_availability_agg_df which has 0/1 for availability per indicator/theme/country
    # First, determine if a country has ANY available indicator for a given theme by taking the max
    theme_country_any_avail = main_availability_agg_df.groupby(['الفصل', 'الدولة'])[availability_cols].max().reset_index()
    
    # Now, sum these 0/1 flags. The sum represents the count of countries that have data for each theme.
    theme_sums = theme_country_any_avail.groupby('الفصل')[availability_cols].sum().reset_index()
    
    # Get the total number of unique countries in the entire dataset to use as the denominator
    total_countries = main_df['الدولة'].nunique()
    
    # Calculate the percentage over the total number of countries
    if total_countries > 0:
        for col in availability_cols:
            theme_sums[col + '_نسبة'] = (theme_sums[col] / total_countries) * 100
            
    theme_sums.to_excel('themes.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'themes.xlsx'")
    # --- END OF CHANGE ---

    print("\nAnalysis complete.")


if __name__ == '__main__':
    main()

Files read successfully.
Cleaning source data...
Data cleaning complete.
Criteria dictionary created with 8 entries.
Data maximum year is 2024. This will be used for availability calculation.

--- Step 1: Calculating Availability Scores ---

--- Step 2: Creating Detailed Masterfile ---
Found 21 unique categorical combinations for the grid.
Created a complete grid with 336 rows.
Successfully saved 'masterfile_detailed_availability.xlsx'

--- Step 3: Generating and Saving All Aggregated Reports ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  source_data_with_scores[col].fillna('N/A', inplace=True)


Successfully saved 'main_availability.xlsx'
Successfully saved 'main_availability_percentage.xlsx'
Successfully saved 'theme_country_availability.xlsx'
Successfully saved 'indicator_country_availability.xlsx'
Successfully saved 'country_availability.xlsx'
Successfully saved 'heatmap.xlsx'
Successfully saved 'heatmap_total.xlsx'

--- Generating Theme-level Country Coverage Report ---
Successfully saved 'themes.xlsx'

Analysis complete.
