In [48]:
import pandas as pd
import numpy as np
import os

MAIN_DATA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/arabic_questionnaires.xlsx'
CRITERIA_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/codes/criterias.xlsx'

In [55]:
def create_criteria_dict(criteria_df, key_language='arabic'):
    """
    Creates a dictionary mapping indicator names to their availability criteria.
    """
    if key_language.lower() == 'english':
        key_col = 'Indicator_En'
    elif key_language.lower() == 'arabic':
        key_col = 'Indicator_Ar'
    else:
        raise ValueError("key_language must be 'english' or 'arabic'")
    criteria_df.dropna(subset=[key_col], inplace=True)
    return pd.Series(criteria_df.criteria.values, index=criteria_df[key_col]).to_dict()
    """
    Calculates availability for each group, returning a collapsed Series (one result per group).
    This function checks if data points for an indicator are present consistently across defined time windows.
    
    Example of the process for a single indicator group:
    1. indicator_name = group.name[...]
       This line just gets the name of the indicator we are working on.
       indicator_name = "Literacy rate"

    2. criteria = criteria_dict.get(indicator_name, 1)
       This looks up the "Literacy rate" in our criteria dictionary and finds its requirement.
       criteria = 2 (meaning we need at least 2 data points per 5-year window)

    3. binned_years = pd.cut(...)
       This is the categorization step. It takes our list of years and puts each one into a 5-year "bucket".
       2011 -> [2010, 2015), 2012 -> [2010, 2015)
       2016 -> [2015, 2020), 2018 -> [2015, 2020)
       2021 -> [2020, 2025), 2022 -> [2020, 2025), 2023 -> [2020, 2025)

    4. window_counts = binned_years.value_counts()
       This step counts how many data points landed in each bucket.
       [2010, 2015): 2
       [2015, 2020): 2
       [2020, 2025): 3

    5. windows_with_sufficient_data = window_counts[window_counts >= criteria]
       This is a filter. It keeps only buckets where the count meets our criteria (>= 2).
       [2010, 2015): Kept (because 2 >= 2)
       [2015, 2020): Kept (because 2 >= 2)
       [2020, 2025): Kept (because 3 >= 2)

    6. sufficient_windows_set = set(windows_with_sufficient_data.index)
       This creates a clean, unique list of the windows that passed the filter.
       sufficient_windows_set = { [2010, 2015), [2015, 2020), [2020, 2025) }

    7. return 1 if len(...) == len(...) else 0
       The final check compares the set of windows with sufficient data against the set of ALL possible windows in our universal time range.
       If they match perfectly, it means the indicator is fully available (returns 1), otherwise it's not (returns 0).
    """


In [56]:
#LOAD AND CLEAN DATA ---

try:
    main_df = pd.read_excel(MAIN_DATA_FILE)
    criteria_df = pd.read_excel(CRITERIA_FILE)
    print("Files read successfully.")
except FileNotFoundError as e:
    print(f"Error reading files: {e}. Make sure the paths are correct.")
    raise e

print("Cleaning source data...")
main_df = main_df.loc[:, ~main_df.columns.str.startswith('Unnamed')]
if 'Theme' in main_df.columns:
    main_df.rename(columns={'Theme': 'الفصل'}, inplace=True)
for col in main_df.select_dtypes(include=['object']).columns:
    main_df[col] = main_df[col].str.strip()

# Keep only rows with a non-null value before doing anything else
main_df.dropna(subset=['المؤشر'], inplace=True)
main_df = main_df[main_df['العدد'].notna()].copy()

# --- NEW BINNING LOGIC ---
def assign_bin_hardcoded(year):
    """Manually assigns a year to a specific, hardcoded bin."""
    if 2010 <= year < 2015:
        return '[2010-2015)'
    elif 2015 <= year < 2020:
        return '[2015-2020)'
    elif 2020 <= year <= 2025:  # NOTE: Includes 2025 as requested
        return '[2020-2025]'
    else:
        return np.nan

print("Creating year bins with hardcoded ranges...")
main_df['year_bins'] = main_df['السنة'].apply(assign_bin_hardcoded)
main_df.dropna(subset=['year_bins'], inplace=True) # Remove rows outside the defined bins

print("Data cleaning and binning complete.")

Files read successfully.
Cleaning source data...
Creating year bins with hardcoded ranges...
Data cleaning and binning complete.


In [57]:
#PREPARE FOR CALCULATION

criteria_dict_ar = create_criteria_dict(criteria_df, key_language='arabic')
print(f"Criteria dictionary created with {len(criteria_dict_ar)} entries.")

Criteria dictionary created with 85 entries.


In [58]:
# %% --- CELL 5: CALCULATE AVAILABILITY SCORES (CORRECTED LOGIC) ---

print("\n--- Calculating Availability Scores ---")

# The correct rule: Availability = 1 ONLY IF the indicator has data in all 3 defined time bins.

# --- General Availability ---
# Group by indicator/country and count the number of unique bins for each.
general_bin_counts = main_df.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
# The final score is 1 only if the count of bins is exactly 3.
final_general_scores = (general_bin_counts == 3).astype(int)

# --- Nationality Availability ---
df_nationality = main_df[main_df['المواطنة'].isin(['مواطنون', 'غير مواطنين'])]
nat_bin_counts = df_nationality.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
# The final score is 1 only if the count of bins is exactly 3.
final_nationality_scores = (nat_bin_counts == 3).astype(int)

# --- Area Availability ---
if 'المنطقة' in main_df.columns:
    df_area = main_df[main_df['المنطقة'].isin(['حضر', 'ريف'])]
    area_bin_counts = df_area.groupby(['المؤشر', 'الدولة'])['year_bins'].nunique()
    # The final score is 1 only if the count of bins is exactly 3.
    final_area_scores = (area_bin_counts == 3).astype(int)
else:
    final_area_scores = pd.Series(dtype=int)

# --- Combine the results ---
indicator_country_scores = pd.DataFrame({
    'التوفر كلي': final_general_scores,
    'التوفر حسب المواطنية': final_nationality_scores,
    'التوفر حسب المنطقة': final_area_scores
}).reset_index().fillna(0)

print("Availability scores calculated successfully.")
# You can now display `indicator_country_scores.head()` in a new cell to inspect the final result.


--- Calculating Availability Scores ---
Availability scores calculated successfully.


In [None]:
#CREATE MASTERFILE

print("\n--- Creating Detailed Masterfile ---")

source_data_with_scores = pd.merge(main_df, indicator_country_scores, on=['المؤشر', 'الدولة'], how='left')

categorical_cols = ['الفصل', 'المؤشر', 'الدولة', 'المواطنة', 'المنطقة']
existing_categorical_cols = [col for col in categorical_cols if col in source_data_with_scores.columns]

# Fill NaNs with a placeholder string to make them mergeable
for col in existing_categorical_cols:
    if source_data_with_scores[col].dtype == 'object':
        source_data_with_scores[col].fillna('N/A', inplace=True)

# Create a grid of all unique combinations of categories
unique_combinations = source_data_with_scores[existing_categorical_cols].drop_duplicates()
print(f"Found {len(unique_combinations)} unique categorical combinations for the grid.")

# Create a grid of all years
data_max_year = main_df['السنة'].max()
grid_max_year = max(data_max_year, 2025)
all_years = pd.DataFrame({'السنة': range(2010, grid_max_year + 1)})

# Combine categories and years to create the complete blank grid
complete_grid = unique_combinations.merge(all_years, how='cross')
print(f"Created a complete grid with {len(complete_grid)} rows.")

# Merge the actual data onto the complete grid
merge_keys = existing_categorical_cols + ['السنة']
masterfile_df = pd.merge(complete_grid, source_data_with_scores, on=merge_keys, how='left')

# Forward and back fill the availability scores to fill in the blank years
availability_cols = ['التوفر كلي', 'التوفر حسب المواطنية', 'التوفر حسب المنطقة']
availability_cols_exist = [col for col in availability_cols if col in masterfile_df.columns]

if availability_cols_exist:
    masterfile_df[availability_cols_exist] = masterfile_df.groupby(['المؤشر', 'الدولة'])[availability_cols_exist].ffill().bfill()
    masterfile_df[availability_cols_exist] = masterfile_df[availability_cols_exist].fillna(0).astype(int)

# Select and order the final columns
final_master_cols = ['الفصل', 'المؤشر', 'الدولة', 'السنة', 'العدد', 'المواطنة', 'المنطقة', 'year_bins'] + availability_cols
cols_to_keep = [col for col in final_master_cols if col in masterfile_df.columns]
masterfile_df = masterfile_df[cols_to_keep]

# Define the columns to check for duplicates (all final columns EXCEPT the value column 'العدد')
duplicate_check_cols = [col for col in cols_to_keep if col != 'العدد']

# Remove duplicate rows based on this subset, keeping the first unique row.
masterfile_df.drop_duplicates(subset=duplicate_check_cols, keep='first', inplace=True)
print("Duplicate rows (excluding the 'العدد' column) removed.")

# Save the masterfile
masterfile_df.to_excel('masterfile_detailed_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'masterfile_detailed_availability.xlsx'.")


--- Creating Detailed Masterfile ---
Found 253 unique categorical combinations for the grid.
Created a complete grid with 4048 rows.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  source_data_with_scores[col].fillna('N/A', inplace=True)


Successfully saved 'masterfile_detailed_availability.xlsx'.


In [60]:
#GENERATE AGGREGATED REPORTS

print("\n--- Generating and Saving All Aggregated Reports ---")
availability_cols = ['التوفر كلي', 'التوفر حسب المواطنية', 'التوفر حسب المنطقة']

# --- File 1: main_availability.xlsx ---
# This file shows the final 0/1 availability for each indicator/country pair.
main_availability_agg_df = masterfile_df.groupby(['المؤشر', 'الفصل', 'الدولة']).agg({
    'التوفر كلي': 'max',
    'التوفر حسب المواطنية': 'max',
    'التوفر حسب المنطقة': 'max'
}).reset_index()
main_availability_agg_df.to_excel('main_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'main_availability.xlsx'")


# --- File 2: main_availability_percentage.xlsx ---
# This file shows the percentage of available indicators per theme and country.
long_availability_df = main_availability_agg_df.melt(
    id_vars=['المؤشر', 'الفصل', 'الدولة'],
    value_vars=availability_cols,
    var_name='نوع التوفر',
    value_name='متوفر'
)
total_indicators = main_df['المؤشر'].nunique()
availability_sums = long_availability_df.groupby(['الفصل', 'الدولة', 'نوع التوفر'])['متوفر'].apply(
    lambda x: (x.sum() / total_indicators) * 100 if total_indicators > 0 else 0
).reset_index(name='نسبة التوفر')
availability_sums = availability_sums[availability_sums['نوع التوفر'] == 'التوفر كلي']
availability_sums.to_excel('main_availability_percentage.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'main_availability_percentage.xlsx'")


# --- File 3: theme_country_availability.xlsx ---
# This file shows the number and percentage of available indicators for each theme/country.
indicators_per_theme = main_df.groupby('الفصل')['المؤشر'].nunique().reset_index(name='total_indicators_in_theme')
theme_country_sums = main_availability_agg_df.groupby(['الفصل', 'الدولة'])[availability_cols].sum().reset_index()
theme_country_agg_df = pd.merge(theme_country_sums, indicators_per_theme, on='الفصل', how='left')
for col in availability_cols:
    theme_country_agg_df[f'{col}_نسبة'] = (theme_country_agg_df[col] / theme_country_agg_df['total_indicators_in_theme']) * 100
theme_country_agg_df.to_excel('theme_country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'theme_country_availability.xlsx'")


# --- File 4: indicator_country_availability.xlsx ---
# This file shows the number and percentage of countries that have data for each indicator.
indicator_sums = main_availability_agg_df.groupby('المؤشر')[availability_cols].sum().reset_index()
total_countries = main_df['الدولة'].nunique()
if total_countries > 0:
    for col in availability_cols:
        indicator_sums[f'{col}_نسبة'] = (indicator_sums[col] / total_countries) * 100
indicator_sums.to_excel('indicator_country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'indicator_country_availability.xlsx'")


# --- File 5: country_availability.xlsx ---
# This file shows the number and percentage of available indicators for each country.
country_sums = main_availability_agg_df.groupby('الدولة')[availability_cols].sum().reset_index()
if total_indicators > 0:
    for col in availability_cols:
        country_sums[f'{col}_نسبة'] = (country_sums[col] / total_indicators) * 100
country_sums['التوفر السابق'] = ''
country_sums.to_excel('country_availability.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'country_availability.xlsx'")


# --- File 6 & 7: Heatmap Files ---
# Disaggregated Heatmap
heatmap_df_disaggregated = masterfile_df.groupby(
    ['المؤشر', 'الدولة', 'السنة', 'المواطنة', 'المنطقة', 'الفصل']
).agg({'التوفر كلي': 'max'}).reset_index()
heatmap_df_disaggregated.to_excel('heatmap_disaggregated.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'heatmap_disaggregated.xlsx'")

# Total (Aggregated) Heatmap
heatmap_df_total = masterfile_df.groupby(
    ['المؤشر', 'الدولة', 'السنة', 'الفصل']
).agg({'التوفر كلي': 'max'}).reset_index()
heatmap_df_total.to_excel('heatmap_total.xlsx', index=False, engine='openpyxl')
print("Successfully saved 'heatmap_total.xlsx'")


print("\nAnalysis complete.")


--- Generating and Saving All Aggregated Reports ---
Successfully saved 'main_availability.xlsx'
Successfully saved 'main_availability_percentage.xlsx'
Successfully saved 'theme_country_availability.xlsx'
Successfully saved 'indicator_country_availability.xlsx'
Successfully saved 'country_availability.xlsx'
Successfully saved 'heatmap_disaggregated.xlsx'
Successfully saved 'heatmap_total.xlsx'

Analysis complete.
