In [13]:
import pandas as pd
import numpy as np
import os

# Input workbooks read by main() via pd.read_excel:
#   MAIN_DATA_FILE - the questionnaire data (one row per reported value).
#   CRITERIA_FILE  - per-indicator availability criteria.
# NOTE(review): both are absolute, machine-specific Windows paths, so the notebook only
# runs on a machine where these exact locations exist. Consider a configurable data
# directory with relative paths instead.
MAIN_DATA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx'
CRITERIA_FILE = 'C:/Users/511232/Desktop/criterias.xlsx'

In [14]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Build a lookup from indicator name to its availability criteria.

    The input frame is never modified: rows without an indicator name are filtered
    into a new frame rather than dropped in place, so the caller's DataFrame keeps
    all of its rows.

    Args:
        criteria_df (pd.DataFrame): Must contain a 'criteria' column plus the name
            column selected by `key_language` ('Indicator_En' or 'Indicator_Ar').
        key_language (str): 'english' or 'arabic' (case-insensitive). Selects which
            indicator-name column supplies the dictionary keys.

    Returns:
        dict: Indicator name -> criteria value. If a name occurs more than once,
        the last occurrence wins.

    Raises:
        ValueError: If `key_language` is neither 'english' nor 'arabic'.
    """
    key_columns = {'english': 'Indicator_En', 'arabic': 'Indicator_Ar'}
    language = key_language.lower()
    if language not in key_columns:
        raise ValueError("key_language must be 'english' or 'arabic'")
    key_col = key_columns[language]

    # Rows with a missing name cannot be looked up, so leave them out. This creates a
    # new frame; the original used inplace=True, which altered the caller's data.
    named_rows = criteria_df.dropna(subset=[key_col])

    return pd.Series(named_rows['criteria'].values, index=named_rows[key_col]).to_dict()


def _empty_availability(n_levels):
    """Return an empty int Series whose index has `n_levels` unnamed levels.

    With a multi-level index, `reset_index()` on the empty result still produces
    'level_0', 'level_1', ... columns, just as it does for a non-empty result keyed
    by tuples.
    """
    if n_levels < 1:
        return pd.Series(dtype=int)
    empty_index = pd.MultiIndex.from_arrays([[] for _ in range(n_levels)])
    return pd.Series(dtype=int, index=empty_index)


def calculate_availability(df, group_cols, criteria_dict, global_max_year, year_col='السنة', indicator_col='المؤشر', window_size=5, min_year=2010):
    """
    Score each group as fully available (1) or not (0) across fixed-width year windows.

    The years from `min_year` up to `global_max_year` are cut into left-closed windows
    of `window_size` years, e.g. [2010, 2015), [2015, 2020), [2020, 2025). A group
    scores 1 only if every one of those windows contains at least `criteria` rows,
    where `criteria` is looked up in `criteria_dict` by the group's indicator name
    (1 is used when the name is not in the dictionary).

    Worked example for one group with criteria = 2 and global_max_year = 2024:
        years  : 2011, 2012, 2016, 2018, 2021, 2022, 2023
        windows: [2010, 2015) -> 2 rows, [2015, 2020) -> 2 rows, [2020, 2025) -> 3 rows
        every window has >= 2 rows, so the group scores 1.
    If any window had fewer than 2 rows the group would score 0.

    NOTE(review): the count is per ROW, not per distinct year. If the input contains
    several rows for the same year (for example one per nationality or area), a single
    year can satisfy a criteria of 2 on its own. Confirm whether distinct years were
    intended.

    Args:
        df (pd.DataFrame): Rows to evaluate; must contain `group_cols` and `year_col`.
        group_cols (list): Columns that define a group; must include `indicator_col`.
        criteria_dict (dict): Indicator name -> minimum rows required per window.
        global_max_year (int): Last year of the evaluation range. Integer-valued
            floats and numpy integers are accepted and converted to int.
        year_col (str): Name of the year column.
        indicator_col (str): Name of the indicator column inside `group_cols`.
        window_size (int): Width of each window in years.
        min_year (int): First year of the evaluation range (start of the first window).

    Returns:
        pd.Series: One 0/1 value per group, keyed by the group key. For empty input
        the Series is empty and its index has one unnamed level per group column.

    Raises:
        ValueError: If `global_max_year` cannot be converted to an integer, if it is
            earlier than `min_year`, or if `indicator_col` is not in `group_cols`.
    """
    if df.empty:
        return _empty_availability(len(group_cols))

    # `.max()` of a year column that contains NaN is a float; range() needs an int.
    try:
        max_year = int(global_max_year)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"global_max_year must be a valid year, got {global_max_year!r}") from exc
    if max_year < min_year:
        raise ValueError(f"global_max_year ({max_year}) is earlier than min_year ({min_year})")

    # Window edges; the extra `window_size + 1` guarantees the last window covers max_year.
    bin_edges = list(range(min_year, max_year + window_size + 1, window_size))
    # Every year in [min_year, max_year] falls in one of these windows, so the number
    # of windows a group must satisfy is simply the number of intervals.
    required_windows = len(bin_edges) - 1

    indicator_pos = group_cols.index(indicator_col)
    results = {}

    for name, group in df.groupby(group_cols):
        # Older pandas yields a scalar key when grouping by a single column; indexing
        # a scalar string would pick a character, so normalise to a tuple for lookup.
        key = name if isinstance(name, tuple) else (name,)
        criteria = criteria_dict.get(key[indicator_pos], 1)

        # Years outside the bin range become NaN and are ignored by value_counts;
        # windows with no rows still appear with a count of 0.
        window_counts = pd.cut(group[year_col], bins=bin_edges, right=False).value_counts()
        satisfied_windows = int((window_counts >= criteria).sum())

        results[name] = 1 if satisfied_windows == required_windows else 0

    if not results:
        # All group keys were missing, so groupby produced no groups.
        return _empty_availability(len(group_cols))

    return pd.Series(results)

In [15]:
# Key columns (indicator, country) that identify one availability result.
_KEY_COLS = ['المؤشر', 'الدولة']
# Output columns holding the three 0/1 availability scores.
_AVAILABILITY_COLS = ['general_availability', 'nationality_availability', 'area_availability']
# Values of a breakdown column that are excluded when computing breakdown availability.
_EXCLUDED_BREAKDOWN_LABELS = ['Not applicable', 'غير مطابق', 'Total']


def _strip_text_columns(df):
    """Return a copy of `df` with leading/trailing whitespace removed from string cells.

    Only actual `str` values are touched. `Series.str.strip()` would replace every
    non-string cell of an object column (numbers, dates) with NaN, losing data.
    """
    cleaned = df.copy()
    for col in cleaned.select_dtypes(include=['object']).columns:
        cleaned[col] = cleaned[col].map(lambda value: value.strip() if isinstance(value, str) else value)
    return cleaned


def _breakdown_availability(main_df, breakdown_col, criteria_dict, global_max_year):
    """Availability per (indicator, country), using only rows that have a value and a real
    entry in `breakdown_col` (not missing and not one of the excluded labels)."""
    has_value = main_df['العدد'].notna()
    has_breakdown = main_df[breakdown_col].notna() & ~main_df[breakdown_col].isin(_EXCLUDED_BREAKDOWN_LABELS)
    subset = main_df[has_value & has_breakdown].copy()
    return calculate_availability(
        subset, group_cols=list(_KEY_COLS), criteria_dict=criteria_dict, global_max_year=global_max_year
    )


def _availability_to_frame(availability, value_name):
    """Flatten an availability Series into a frame with the two key columns and `value_name`.

    An empty Series becomes an empty frame that still has all three columns, so it can
    be left-merged like any other result.
    """
    if availability.empty:
        return pd.DataFrame({
            _KEY_COLS[0]: pd.Series(dtype=object),
            _KEY_COLS[1]: pd.Series(dtype=object),
            value_name: pd.Series(dtype=float),
        })
    frame = availability.reset_index(name=value_name)
    # Name columns by position so this works whether or not the index levels are named.
    frame.columns = _KEY_COLS + [value_name]
    return frame


def main():
    """
    Run the whole availability pipeline.

    Steps:
        1. Read the main data workbook and the criteria workbook.
        2. Clean both (drop 'Unnamed' columns from the main data, strip text cells).
        3. Build the Arabic-keyed criteria dictionary and find the latest year.
        4. Compute general, nationality and area availability per (indicator, country).
        5. Combine the three scores into one indicator-country table.
        6. Build a complete (categories x years) grid and write it to
           'masterfile_detailed_availability.xlsx'.
        7. Write 'indicator_country_availability.xlsx' and 'country_availability.xlsx',
           then print the regional percentages.

    Returns:
        None. Prints progress; returns early (after printing an error) if an input
        file is missing or the year column holds no usable value.
    """
    # 1. Read in the main excel file and the criteria file
    try:
        main_df = pd.read_excel(MAIN_DATA_FILE)
        criteria_df = pd.read_excel(CRITERIA_FILE)
        print("Files read successfully.")
    except FileNotFoundError as e:
        print(f"Error reading files: {e}. Make sure they are in the correct directory.")
        return

    # 2. Clean the raw data immediately after loading
    print("Cleaning source data...")

    # Remove any extra 'Unnamed' columns that may have been created by Excel.
    # str() keeps this safe when a header is not a string (e.g. a number).
    keep_column = [not str(col).startswith('Unnamed') for col in main_df.columns]
    main_df = _strip_text_columns(main_df.loc[:, keep_column])

    # Indicator names from the criteria file are matched against the stripped names in
    # the main data, so they need the same cleaning or lookups silently fall back to 1.
    criteria_df = _strip_text_columns(criteria_df)
    print("Data cleaning complete.")

    # 3. Create the criteria dictionary (using Arabic names to match the main file)
    criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
    print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")

    # Determine the global maximum year once. With missing years pandas returns a float
    # (or NaN when nothing is usable), which range() cannot accept, so normalise to int.
    raw_max_year = main_df['السنة'].max()
    if pd.isna(raw_max_year):
        print("Error: no valid year found in column 'السنة'. Cannot calculate availability.")
        return
    global_max_year = int(raw_max_year)
    print(f"Global maximum year found in data: {global_max_year}")

    # 4. Calculate the three availability columns
    print("\n--- Step 1: Calculating Availability ---")

    # General availability: every row that has a value.
    print("Calculating general availability...")
    general_df = main_df[main_df['العدد'].notna()].copy()
    general_availability = calculate_availability(
        general_df, group_cols=list(_KEY_COLS), criteria_dict=criteria_dict_ar, global_max_year=global_max_year
    )

    # Nationality availability (guarded like the area column so a missing column
    # produces a warning instead of a KeyError).
    print("Calculating nationality availability...")
    if 'المواطنة' in main_df.columns:
        nationality_availability = _breakdown_availability(main_df, 'المواطنة', criteria_dict_ar, global_max_year)
    else:
        print("Warning: 'المواطنة' (Nationality) column not found. Nationality availability will be empty.")
        nationality_availability = pd.Series(dtype=int)

    # Area availability
    print("Calculating area availability...")
    if 'المنطقة' in main_df.columns:
        area_availability = _breakdown_availability(main_df, 'المنطقة', criteria_dict_ar, global_max_year)
    else:
        print("Warning: 'المنطقة' (Area) column not found. Area availability will be empty.")
        area_availability = pd.Series(dtype=int)

    # 5. Create the indicator-country availability table
    indicator_country_availability_df = main_df[_KEY_COLS].drop_duplicates().reset_index(drop=True)
    availability_results = zip(
        _AVAILABILITY_COLS, [general_availability, nationality_availability, area_availability]
    )
    for value_name, availability in availability_results:
        indicator_country_availability_df = pd.merge(
            indicator_country_availability_df,
            _availability_to_frame(availability, value_name),
            on=_KEY_COLS,
            how='left',
        )

    # Pairs without a result count as "not available". Only the score columns are
    # filled, so a missing indicator or country name is not rewritten to 0.
    for col in _AVAILABILITY_COLS:
        indicator_country_availability_df[col] = indicator_country_availability_df[col].fillna(0).astype(int)

    # 6. Create the non-collapsed master file with a complete grid of years for the heatmap
    print("\n--- Step 2: Creating Detailed Master File for Heatmap (Non-Collapsed View) ---")

    # This section builds a row for every combination of category values and years, so
    # that a heatmap has a cell for each one:
    #   1. Collect the unique values of each categorical column, e.g.
    #      [indicators], [countries], [areas], [nationalities].
    #   2. Append the full year range, 2010 .. global_max_year.
    #   3. pd.MultiIndex.from_product builds the Cartesian product; `names` gives the
    #      resulting columns their headers.
    #   4. The original rows and the availability scores are left-merged onto the grid,
    #      leaving NaN in 'العدد' wherever no value was reported.
    source_df = main_df.copy()
    for breakdown_col in ('المواطنة', 'المنطقة'):
        if breakdown_col in source_df.columns:
            source_df = source_df[~source_df[breakdown_col].isin(['غير مطابق'])]

    if source_df.empty:
        # Happens when every row is 'غير مطابق' in at least one breakdown column;
        # the grid below is then empty as well.
        print("Warning: all rows were removed by the 'غير مطابق' filters; the detailed master file will be empty.")

    categorical_cols = list(_KEY_COLS)
    if 'المنطقة' in source_df.columns:
        categorical_cols.append('المنطقة')
    if 'المواطنة' in source_df.columns:
        categorical_cols.append('المواطنة')

    unique_values_iterables = [source_df[col].unique() for col in categorical_cols]
    # Start year of the grid; same value calculate_availability uses by default.
    min_year = 2010
    unique_values_iterables.append(range(min_year, global_max_year + 1))

    multi_index = pd.MultiIndex.from_product(unique_values_iterables, names=categorical_cols + ['السنة'])
    complete_grid = pd.DataFrame(index=multi_index).reset_index()
    print(f"Created a complete grid with {len(complete_grid)} rows (combinations x years).")

    # Merge the actual data onto the complete grid.
    masterfile_df = pd.merge(complete_grid, source_df, on=categorical_cols + ['السنة'], how='left')

    # Merge the pre-calculated availability scores.
    masterfile_df = pd.merge(masterfile_df, indicator_country_availability_df, on=_KEY_COLS, how='left')

    for col in _AVAILABILITY_COLS:
        if col in masterfile_df.columns:
            masterfile_df[col] = masterfile_df[col].fillna(0).astype(int)

    output_masterfile = 'masterfile_detailed_availability.xlsx'
    masterfile_df.to_excel(output_masterfile, index=False, engine='openpyxl')
    print(f"Successfully saved detailed master file with complete year grid to '{output_masterfile}'")

    # 7. Perform and save various aggregations from the cleaned data
    print("\n--- Step 3: Generating Aggregated Reports ---")

    # Aggregation by Indicator and Country - simply save the dataframe we already built.
    output_indicator_country = 'indicator_country_availability.xlsx'
    indicator_country_availability_df.to_excel(output_indicator_country, index=False, engine='openpyxl')
    print(f"Saved indicator-country level availability to '{output_indicator_country}'")

    # - Aggregation by Country (Percentage)
    # Denominator is the number of distinct indicators in the whole data set, not the
    # number of indicators listed for that particular country.
    country_agg = indicator_country_availability_df.groupby('الدولة')[_AVAILABILITY_COLS].sum()
    total_indicators = main_df['المؤشر'].nunique()
    if total_indicators > 0:
        country_availability_pct = (country_agg / total_indicators) * 100
    else:
        country_availability_pct = country_agg
    country_availability_pct = country_availability_pct.rename(columns=lambda c: c + '_pct')
    output_country = 'country_availability.xlsx'
    country_availability_pct.reset_index().to_excel(output_country, index=False, engine='openpyxl')
    print(f"Saved country level availability percentages to '{output_country}'")

    # - Aggregation for the whole region (Percentage)
    print("\n--- Regional Availability Summary ---")
    total_indicator_country_pairs = len(indicator_country_availability_df)
    if total_indicator_country_pairs > 0:
        regional_sums = indicator_country_availability_df[_AVAILABILITY_COLS].sum()
        regional_pct = (regional_sums / total_indicator_country_pairs) * 100
        print("Percentage of available indicators for the whole region:")
        print(regional_pct)
    else:
        print("No indicator-country pairs to calculate regional availability.")

    print("\nAnalysis complete.")


# Entry point: run the full pipeline when this code is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    main()



Files read successfully.
Cleaning source data...
Data cleaning complete.
Criteria dictionary created with 8 entries.
Global maximum year found in data: 2024

--- Step 1: Calculating Availability ---
Calculating general availability...
Calculating nationality availability...
Calculating area availability...

--- Step 2: Creating Detailed Master File for Heatmap (Non-Collapsed View) ---
Created a complete grid with 0 rows (combinations x years).
Successfully saved detailed master file with complete year grid to 'masterfile_detailed_availability.xlsx'

--- Step 3: Generating Aggregated Reports ---
Saved indicator-country level availability to 'indicator_country_availability.xlsx'
Saved country level availability percentages to 'country_availability.xlsx'

--- Regional Availability Summary ---
Percentage of available indicators for the whole region:
general_availability        100.0
nationality_availability      0.0
area_availability            50.0
dtype: float64

Analysis complete.
