In [31]:
import pandas as pd
import numpy as np
import os

# Locations of the two input workbooks read by main().
# Each path can be overridden with an environment variable of the same name so
# the notebook can run on another machine without editing the code; when the
# variable is unset, the original hard-coded path is used.
MAIN_DATA_FILE = os.environ.get(
    'MAIN_DATA_FILE',
    'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx',
)
CRITERIA_FILE = os.environ.get(
    'CRITERIA_FILE',
    'C:/Users/511232/Desktop/criterias.xlsx',
)

In [32]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Build a lookup from indicator name to its availability criteria.

    Args:
        criteria_df (pd.DataFrame): Must contain a 'criteria' column plus the
            indicator-name column selected by `key_language`
            ('Indicator_En' or 'Indicator_Ar'). The frame is NOT modified.
        key_language (str): 'english' or 'arabic' (case-insensitive). Selects
            which indicator-name column supplies the dictionary keys.

    Returns:
        dict: Maps each non-missing indicator name to the value found in the
        'criteria' column of the same row. If a name appears more than once,
        the last row wins. Values are returned as stored (no integer cast).

    Raises:
        ValueError: If `key_language` is neither 'english' nor 'arabic'.
    """
    key_columns = {'english': 'Indicator_En', 'arabic': 'Indicator_Ar'}

    language = key_language.lower()
    if language not in key_columns:
        raise ValueError("key_language must be 'english' or 'arabic'")
    key_col = key_columns[language]

    # Rows without an indicator name cannot be looked up, so leave them out.
    # dropna() returns a new frame; the caller's DataFrame stays intact, which
    # keeps repeated calls (e.g. once per language) independent of each other.
    named_rows = criteria_df.dropna(subset=[key_col])

    return pd.Series(named_rows['criteria'].values, index=named_rows[key_col]).to_dict()


def calculate_availability(df, group_cols, criteria_dict, global_max_year, year_col='السنة', indicator_col='المؤشر', window_size=5, min_year=2010):
    """
    Score every group of `df` as available (1) or not available (0).

    The years from `min_year` onwards are split into consecutive, left-closed
    windows of `window_size` years. A group is available only when EVERY
    window contains at least `criteria` rows, where `criteria` is looked up in
    `criteria_dict` by the group's indicator name (default 1 when absent).

    Args:
        df (pd.DataFrame): Rows to evaluate; one row counts as one data point.
        group_cols (list): Columns to group by. Must contain `indicator_col`.
        criteria_dict (dict): Indicator name -> minimum rows required per window.
        global_max_year (int | float): Last year the windows must cover. Whole
            floats (e.g. 2024.0, as produced by a year column containing NaN)
            are accepted and converted to int.
        year_col (str): Name of the year column.
        indicator_col (str): Name of the indicator column.
        window_size (int): Width of each window in years.
        min_year (int): First year of the first window.

    Returns:
        pd.Series: One 0/1 value per group, keyed by the group key(s).
        An empty integer Series when `df` is empty.

    Raises:
        ValueError: If `global_max_year` is missing (NaN/None), or if
            `indicator_col` is not in `group_cols`.

    Worked example (criteria = 2, window_size = 5, global_max_year = 2024):
        years of one group : 2011, 2012, 2016, 2018, 2021, 2022, 2023
        windows            : [2010, 2015), [2015, 2020), [2020, 2025)
        rows per window    : 2,            2,            3
        every window has >= 2 rows, so the group scores 1.
        Had any window held fewer than 2 rows, the group would score 0.
    """
    if df.empty:
        return pd.Series(dtype=int)

    if pd.isna(global_max_year):
        raise ValueError("global_max_year must be a valid year, got a missing value")
    # range() only accepts integers; a float max year would raise TypeError.
    global_max_year = int(global_max_year)

    # Window edges. The stop value is pushed one window past the max year so
    # that the window containing global_max_year is always closed by an edge.
    bins = list(range(min_year, global_max_year + window_size + 1, window_size))

    # Every window that contains at least one year of the global range.
    all_years = pd.Series(range(min_year, global_max_year + 1))
    all_possible_windows = set(pd.cut(all_years, bins=bins, right=False).dropna().unique())

    results = {}
    indicator_col_index = group_cols.index(indicator_col)

    for name, group in df.groupby(group_cols):
        # Group keys are normally a tuple; older pandas yields a bare scalar
        # for a single grouping column, which must not be indexed directly.
        key_parts = name if isinstance(name, tuple) else (name,)
        indicator_name = key_parts[indicator_col_index]
        # NOTE(review): a NaN criteria value makes every comparison below
        # False, so the group scores 0 -- confirm that this is intended.
        criteria = criteria_dict.get(indicator_name, 1)

        # Count rows per window; years outside the bins become NaN and are
        # ignored by value_counts().
        window_counts = pd.cut(group[year_col], bins=bins, right=False).value_counts()
        sufficient_windows = set(window_counts[window_counts >= criteria].index)

        results[name] = 1 if len(sufficient_windows) == len(all_possible_windows) else 0

    return pd.Series(results)

In [None]:
def main():
    """
    Run the entire availability analysis pipeline.

    Reads MAIN_DATA_FILE and CRITERIA_FILE, computes 0/1 availability scores
    per (indicator, country) with calculate_availability(), and writes these
    workbooks to the current working directory:

        main_availability.xlsx                 source rows + availability scores
        masterfile_detailed_availability.xlsx  full category x year grid
        indicator_country_availability.xlsx    one row per indicator/country
        country_availability.xlsx              per-country percentages
        heatmap_total.xlsx                     total availability per year
        heatmap_nationality.xlsx               only if the nationality column exists
        heatmap_area.xlsx                      only if the area column exists

    Returns:
        None. Progress is reported with print(). If an input file is missing,
        or the year column holds no valid value, a message is printed and the
        function returns early without writing anything.
    """
    # 1. Read and Clean Data
    try:
        main_df = pd.read_excel(MAIN_DATA_FILE)
        criteria_df = pd.read_excel(CRITERIA_FILE)
        print("Files read successfully.")
    except FileNotFoundError as e:
        print(f"Error reading files: {e}. Make sure they are in the correct directory.")
        return

    print("Cleaning source data...")
    # .copy() gives an independent frame so the column assignments below do
    # not write into a slice of the frame returned by read_excel.
    main_df = main_df.loc[:, ~main_df.columns.str.startswith('Unnamed')].copy()
    for col in main_df.select_dtypes(include=['object']).columns:
        # Strip real strings only. Series.str.strip() would replace every
        # non-string cell (e.g. numbers in a mixed column) with NaN.
        main_df[col] = main_df[col].map(lambda value: value.strip() if isinstance(value, str) else value)
    print("Data cleaning complete.")

    # 2. Prepare for Calculations
    criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
    print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")
    
    # Use the actual max year from the data for the availability calculation
    data_max_year = main_df['السنة'].max()
    if pd.isna(data_max_year):
        print("Error: no valid year found in column 'السنة'. Cannot calculate availability.")
        return
    # A year column containing NaN is float, so its max is e.g. 2024.0;
    # range() further down needs a plain integer.
    data_max_year = int(data_max_year)
    print(f"Data maximum year is {data_max_year}. This will be used for availability calculation.")

    # 3. Calculate Collapsed Availability Scores
    print("\n--- Step 1: Calculating Availability Scores ---")

    score_group_cols = ['المؤشر', 'الدولة']
    has_value = main_df['العدد'].notna()
    # Labels that mark a row as NOT broken down by nationality/area.
    non_breakdown_labels = ['Not applicable', 'غير مطابق', 'Total']
    
    general_availability = calculate_availability(
        main_df[has_value], score_group_cols, criteria_dict_ar, data_max_year
    )
    
    # The nationality column is treated as optional, like the area column.
    if 'المواطنة' in main_df.columns:
        nationality_availability = calculate_availability(
            main_df[has_value & main_df['المواطنة'].notna() & ~main_df['المواطنة'].isin(non_breakdown_labels)],
            score_group_cols, criteria_dict_ar, data_max_year
        )
    else:
        nationality_availability = pd.Series(dtype=int)

    if 'المنطقة' in main_df.columns:
        area_availability = calculate_availability(
            main_df[has_value & main_df['المنطقة'].notna() & ~main_df['المنطقة'].isin(non_breakdown_labels)],
            score_group_cols, criteria_dict_ar, data_max_year
        )
    else:
        area_availability = pd.Series(dtype=int)

    # 4. Create a clean DataFrame of Indicator-Country scores
    indicator_country_scores = pd.DataFrame({
        'Availability total': general_availability,
        'Availability by nationality': nationality_availability,
        'Availability by area': area_availability
    }).reset_index()
    # The score Series carry an unnamed two-level index, which reset_index()
    # exposes as 'level_0' / 'level_1'.
    indicator_country_scores.rename(columns={'level_0': 'المؤشر', 'level_1': 'الدولة'}, inplace=True)
    # A group missing from one breakdown has no score there; count it as 0.
    indicator_country_scores.fillna(0, inplace=True)
    
    # 5. Create main_availability.xlsx (Original data + availability scores)
    print("\n--- Step 2: Creating main_availability.xlsx ---")
    
    main_availability_df = pd.merge(main_df, indicator_country_scores, on=['المؤشر', 'الدولة'], how='left')
    
    # Define and select final columns for output
    final_cols = ['المؤشر', 'الدولة', 'السنة', 'العدد', 'المواطنة', 'المنطقة', 
                  'Availability total', 'Availability by nationality', 'Availability by area']
    # Ensure optional columns exist before selecting
    cols_to_keep = [col for col in final_cols if col in main_availability_df.columns]
    main_availability_df = main_availability_df[cols_to_keep]
    
    main_availability_df.to_excel('main_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'main_availability.xlsx'")

    # 6. Create masterfile_detailed_availability.xlsx (Heatmap-ready file)
    print("\n--- Step 3: Creating masterfile_detailed_availability.xlsx ---")

    # This section builds a complete grid with one row for every category
    # combination and every year, so that a heatmap has a value for each cell.
    #
    #   1. Collect every unique combination of indicator, country and (when
    #      present) area and nationality found in the source data.
    #   2. Build a frame holding the full range of years.
    #   3. Cross-join the two: one row per combination per year.
    #   4. Left-merge the scored source rows onto the grid. Years without
    #      data keep NaN in the value and score columns.
    #   5. Fill the missing scores within each indicator/country group.
    
    # Define the columns that make up a unique entity in your data
    categorical_cols = ['المؤشر', 'الدولة']
    if 'المنطقة' in main_df.columns: categorical_cols.append('المنطقة')
    if 'المواطنة' in main_df.columns: categorical_cols.append('المواطنة')

    # Get all unique combinations of these categories from the main data
    # Drop rows where any of these key identifiers are missing
    unique_combinations = main_df[categorical_cols].drop_duplicates().dropna()
    print(f"Found {len(unique_combinations)} unique categorical combinations for the grid.")

    # Determine the max year for the grid. Use the data's max year or 2025, whichever is greater.
    grid_max_year = max(data_max_year, 2025)
    print(f"Grid will be built up to year {grid_max_year}.")
    
    # Create a DataFrame with a full range of years for the grid
    min_year = 2010
    all_years = pd.DataFrame({'السنة': range(min_year, grid_max_year + 1)})

    # Create the complete grid via a cross join
    complete_grid = unique_combinations.merge(all_years, how='cross')
    print(f"Created a complete grid with {len(complete_grid)} rows.")
    
    # Merge the actual data (from main_availability_df) onto the complete grid
    # The merge keys are all the categorical columns plus the year
    merge_keys = categorical_cols + ['السنة']
    masterfile_df = pd.merge(complete_grid, main_availability_df, on=merge_keys, how='left')
    print(f"Master file shape after merging data: {masterfile_df.shape}")

    # Forward-fill and back-fill the availability scores to populate the empty year rows
    availability_cols = ['Availability total', 'Availability by nationality', 'Availability by area']
    availability_cols_exist = [col for col in availability_cols if col in masterfile_df.columns]
    
    if availability_cols_exist:
        # Both fills are done per group. Chaining .bfill() after a grouped
        # .ffill() would back-fill on the ungrouped frame and copy scores from
        # the next indicator/country group into the previous one.
        masterfile_df[availability_cols_exist] = (
            masterfile_df.groupby(score_group_cols)[availability_cols_exist].ffill()
        )
        masterfile_df[availability_cols_exist] = (
            masterfile_df.groupby(score_group_cols)[availability_cols_exist].bfill()
        )
        # Groups with no score at all are treated as not available.
        masterfile_df[availability_cols_exist] = masterfile_df[availability_cols_exist].fillna(0).astype(int)

    masterfile_df = masterfile_df[cols_to_keep] # Use the same column selection as before
    masterfile_df.to_excel('masterfile_detailed_availability.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'masterfile_detailed_availability.xlsx'")

    # 7. Perform and save aggregations
    print("\n--- Step 4: Generating Aggregated Reports ---")
    indicator_country_availability_df = indicator_country_scores.copy() # Use the clean scores df
    
    output_indicator_country = 'indicator_country_availability.xlsx'
    indicator_country_availability_df.to_excel(output_indicator_country, index=False, engine='openpyxl')
    print(f"Saved indicator-country level availability to '{output_indicator_country}'")

    agg_cols = [col for col in availability_cols if col in indicator_country_availability_df.columns]
    country_agg = indicator_country_availability_df.groupby('الدولة')[agg_cols].sum()
    # Percentages are relative to ALL indicators in the source data, not only
    # those that a given country reports.
    total_indicators = main_df['المؤشر'].nunique()
    if total_indicators > 0:
        country_availability_pct = (country_agg / total_indicators) * 100
    else:
        country_availability_pct = country_agg
    country_availability_pct.rename(columns=lambda c: c + '_pct', inplace=True)
    country_availability_pct.reset_index().to_excel('country_availability.xlsx', index=False, engine='openpyxl')
    print("Saved country level availability percentages to 'country_availability.xlsx'")

    print("\n--- Regional Availability Summary ---")
    if not indicator_country_availability_df.empty:
        regional_sums = indicator_country_availability_df[agg_cols].sum()
        # Share of scored indicator/country pairs that are available.
        regional_pct = (regional_sums / len(indicator_country_availability_df)) * 100
        print("Percentage of available indicators for the whole region:")
        print(regional_pct)

    # 8. Generate heatmap-specific aggregated files from the masterfile
    print("\n--- Step 5: Generating Heatmap-Specific Aggregated Files ---")
    
    # File 1: Total availability per indicator, country, and year
    heatmap_total_df = masterfile_df.groupby(['المؤشر', 'الدولة', 'السنة'])['Availability total'].max().reset_index()
    heatmap_total_df.to_excel('heatmap_total.xlsx', index=False, engine='openpyxl')
    print("Successfully saved 'heatmap_total.xlsx'")

    # File 2: Nationality availability
    if 'المواطنة' in masterfile_df.columns and 'Availability by nationality' in masterfile_df.columns:
        heatmap_nationality_df = masterfile_df.groupby(['المؤشر', 'الدولة', 'المواطنة', 'السنة'])['Availability by nationality'].max().reset_index()
        heatmap_nationality_df.to_excel('heatmap_nationality.xlsx', index=False, engine='openpyxl')
        print("Successfully saved 'heatmap_nationality.xlsx'")
    else:
        print("Skipping 'heatmap_nationality.xlsx' due to missing columns.")

    # File 3: Area availability
    if 'المنطقة' in masterfile_df.columns and 'Availability by area' in masterfile_df.columns:
        heatmap_area_df = masterfile_df.groupby(['المؤشر', 'الدولة', 'المنطقة', 'السنة'])['Availability by area'].max().reset_index()
        heatmap_area_df.to_excel('heatmap_area.xlsx', index=False, engine='openpyxl')
        print("Successfully saved 'heatmap_area.xlsx'")
    else:
        print("Skipping 'heatmap_area.xlsx' due to missing columns.")
    
    print("\nAnalysis complete.")


# Entry point: run the whole pipeline when this code is executed directly
# (as a script, or by running this notebook cell).
if __name__ == '__main__':
    main()



Files read successfully.
Cleaning source data...
Data cleaning complete.
Criteria dictionary created with 8 entries.
Data maximum year is 2024. This will be used for availability calculation.

--- Step 1: Calculating Availability Scores ---

--- Step 2: Creating main_availability.xlsx ---
Successfully saved 'main_availability.xlsx'

--- Step 3: Creating masterfile_detailed_availability.xlsx ---
Found 6 unique categorical combinations for the grid.
Grid will be built up to year 2025.
Created a complete grid with 96 rows.
Master file shape after merging data: (106, 9)
Successfully saved 'masterfile_detailed_availability.xlsx'

--- Step 4: Generating Aggregated Reports ---
Saved indicator-country level availability to 'indicator_country_availability.xlsx'
Saved country level availability percentages to 'country_availability.xlsx'

--- Regional Availability Summary ---
Percentage of available indicators for the whole region:
Availability total             100.0
Availability by nationality 