In [28]:
import pandas as pd
import numpy as np
import os

In [29]:
MAIN_DATA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx'
CRITERIA_FILE = 'C:/Users/511232/Desktop/criterias.xlsx'

In [30]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Creates a dictionary mapping indicator names to their availability criteria.

    Args:
        criteria_df (pd.DataFrame): DataFrame containing indicator names and criteria.
        key_language (str): 'english' or 'arabic'. Determines which indicator name to use as the key.

    Returns:
        dict: A dictionary mapping indicator names to their integer criteria.
    """
    if key_language.lower() == 'english':
        key_col = 'Indicator_En'
    elif key_language.lower() == 'arabic':
        key_col = 'Indicator_Ar'
    else:
        raise ValueError("key_language must be 'english' or 'arabic'")

    # Drop rows where the key column is NaN to avoid issues
    criteria_df.dropna(subset=[key_col], inplace=True)
    
    return pd.Series(criteria_df.criteria.values, index=criteria_df[key_col]).to_dict()

In [31]:
def calculate_availability(df, group_cols, criteria_dict, global_max_year, year_col='السنة', indicator_col='المؤشر', window_size=5):
    """
    1. indicator_name = group.name[...]
    This line just gets the name of the indicator we are working on.
    indicator_name = "Literacy rate"

    2. criteria = criteria_dict.get(indicator_name, 1)
    This looks up the "Literacy rate" in our criteria dictionary and finds its requirement.
    criteria = 2

    3. binned_years = pd.cut(...)
    This is the categorization step. It takes our list of years and puts each one into a 5-year "bucket".

    2011 -> [2010, 2015)
    2012 -> [2010, 2015)
    2016 -> [2015, 2020)
    2018 -> [2015, 2020)
    2021 -> [2020, 2025)
    2022 -> [2020, 2025)
    2023 -> [2020, 2025)

    4. window_counts = binned_years.value_counts()
    This step counts how many data points landed in each bucket.

    [2010, 2015): 2
    [2015, 2020): 2
    [2020, 2025): 3

    5. windows_with_sufficient_data = window_counts[window_counts >= criteria]
    This is a filter. It looks at our counts and keeps only the buckets where the count is greater than or equal to our criteria (which is 2).

    [2010, 2015): Kept (because 2 >= 2)
    [2015, 2020): Kept (because 2 >= 2)
    [2020, 2025): Kept (because 3 >= 2)

    6. sufficient_windows_set = set(windows_with_sufficient_data.index)
    This creates a clean, unique list of the windows that passed the filter.
    sufficient_windows_set = { [2010, 2015), [2015, 2020), [2020, 2025) }

    7. return 1 if len(...) == len(...) else 0
    """
    if df.empty:
        return pd.Series(dtype=int)

    # Determine the overall year range and create standard bins using the global max year.
    # This ensures all calculations use the same universal time period.
    min_year = 2010
    bins = range(min_year, global_max_year + window_size + 1, window_size)
    
    # Create a set of all possible windows (bins) that could exist based on the global range.
    all_possible_windows = set(pd.cut(pd.Series(range(min_year, global_max_year + 1)), bins=bins, right=False).dropna().unique())
  

    def check_group(group):
        '''group_cols.index(indicator_col):
        This finds the position (the index) of our indicator column within the list of grouping columns.

        group_cols is ['المؤشر', 'الدولة'].
        The position of 'المؤشر' in this list is 0.

        group.name:
        When pandas groups data, the .name attribute of each group is a tuple containing the values for that specific group.
        Since our group is for "حجم السكان حسب المواطنة" and "تونس", the group.name will be: ('حجم السكان حسب المواطنة', 'تونس').

        Putting it together: group.name[0]:

        The code now effectively becomes ('حجم السكان حسب المواطنة', 'تونس')[0].
        This retrieves the item at position 0 from the tuple.
        Result: The variable indicator_name is now set to the string 'حجم السكان حسب المواطنة'.'''
        
        # Find the name of the current indicator being processed.
        indicator_name = group.name[group_cols.index(indicator_col)]
        # Get the criteria for the specific indicator, default to 1 if not found
        criteria = criteria_dict.get(indicator_name, 1)

        binned_years = pd.cut(group[year_col], bins=bins, right=False)
        window_counts = binned_years.value_counts()
        
        # Find windows that meet or exceed the criteria
        windows_with_sufficient_data = window_counts[window_counts >= criteria]
        sufficient_windows_set = set(windows_with_sufficient_data.index)
        
        # If all possible windows are present in the set of sufficient windows, it's available
        return 1 if len(sufficient_windows_set) == len(all_possible_windows) else 0

    return df.groupby(group_cols).apply(check_group)

In [None]:
def main():
    
    """
    Main function to run the entire analysis pipeline.
    """
    # 1. Read in the main excel file and the criteria file
    try:
        # Using the path variables defined at the top of the script
        main_df = pd.read_excel(MAIN_DATA_FILE)
        criteria_df = pd.read_excel(CRITERIA_FILE)
        print("Files read successfully.")
    except FileNotFoundError as e:
        print(f"Error reading files: {e}. Make sure they are in the correct directory.")
        return

    # 2. Create the criteria dictionary (using Arabic names to match the main file)
    criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
    print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")
    
    # Determine the global maximum year from the original dataframe once.
    global_max_year = main_df['السنة'].max()
    print(f"Global maximum year found in data: {global_max_year}")

    # 3. Calculate the three availability columns, ensuring 'العدد' (Value) is not null.
    
    # General availability - now considers only rows with a valid value
    print("Calculating general availability...")
    general_df = main_df[main_df['العدد'].notna()].copy()
    general_availability = calculate_availability(
        general_df,
        group_cols=['المؤشر', 'الدولة'],
        criteria_dict=criteria_dict_ar,
        global_max_year=global_max_year
    )
    
    # Nationality availability - now also checks for a valid value
    print("Calculating nationality availability...")
    # Filter for valid nationality data and a non-null value before calculating
    nationality_df = main_df[
        main_df['العدد'].notna() &
        main_df['المواطنة'].notna() & 
        ~main_df['المواطنة'].isin(['Not applicable', 'غير مطابق', 'Total'])
    ].copy()
    nationality_availability = calculate_availability(
        nationality_df,
        group_cols=['المؤشر', 'الدولة'],
        criteria_dict=criteria_dict_ar,
        global_max_year=global_max_year
    )

    # Area availability - now also checks for a valid value
    print("Calculating area availability...")
    # Assuming the area column is named 'المنطقة'. If it exists:
    if 'المنطقة' in main_df.columns:
        area_df_filtered = main_df[
            main_df['العدد'].notna() &
            main_df['المنطقة'].notna() & 
            ~main_df['المنطقة'].isin(['Not applicable', 'غير مطابق', 'Total'])
        ].copy()
        area_availability = calculate_availability(
            area_df_filtered,
            group_cols=['المؤشر', 'الدولة'],
            criteria_dict=criteria_dict_ar,
            global_max_year=global_max_year
        )
    else:
        print("Warning: 'المنطقة' (Area) column not found. Area availability will be empty.")
        # Create an empty series to avoid errors later
        area_availability = pd.Series(dtype=int)
    

    # 4. Create the resulting table by starting with a master list of all indicators.

    # First, create a master list of all unique Indicator/Country pairs from the original data file.
    # This ensures no indicator is ever dropped from the final output.
    master_indicators = main_df[['المؤشر', 'الدولة']].drop_duplicates().reset_index(drop=True)
    print(f"Found {len(master_indicators)} unique indicator-country pairs to process.")

    # Convert the calculated series to DataFrames for merging.
    df_general = general_availability.reset_index(name='general_availability')
    df_nationality = nationality_availability.reset_index(name='nationality_availability')
    
    # Merge the results onto the master list. This guarantees all indicators are kept.
    final_df = pd.merge(master_indicators, df_general, on=['المؤشر', 'الدولة'], how='left')
    final_df = pd.merge(final_df, df_nationality, on=['المؤشر', 'الدولة'], how='left')
    
    # This block now safely handles the merge for area availability.
    if not area_availability.empty:
        df_area = area_availability.reset_index(name='area_availability')
        final_df = pd.merge(final_df, df_area, on=['المؤشر', 'الدولة'], how='left')
    else:
        # If there's no area data, just create the column with a placeholder.
        final_df['area_availability'] = np.nan 

    # Fill any NaN values that resulted from the merge with 0 (since they are not available)
    final_df.fillna(0, inplace=True)
    # Convert availability columns to integers
    for col in ['general_availability', 'nationality_availability', 'area_availability']:
        if col in final_df.columns:
            final_df[col] = final_df[col].astype(int)

    print("Final DataFrame created:")
    try:
        print(final_df.head())
    except UnicodeEncodeError:
        print("\nNOTE: Could not display DataFrame head in the console due to character encoding issues.")
        print("This is a common issue with non-English characters on Windows terminals.")
        print("The data has been processed correctly and will be saved to the Excel file.")


    # 5. Save the result as excel
    output_filename = 'availability_results.xlsx'
    final_df.to_excel(output_filename, index=False, engine='openpyxl')
    print(f"\nResults saved to {output_filename}")


if __name__ == '__main__':
    main()


Files read successfully.
Criteria dictionary created with 8 entries.
Global maximum year found in data: 2024
Calculating general availability...


TypeError: calculate_availability() missing 1 required positional argument: 'global_max_year'