In [1]:
import pandas as pd
import numpy as np
import json
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_colwidth', None) # Do not truncate column content
pd.set_option('display.width', 1000)        # Prevent wrapping of wide DataFrames

### Generating urls and gids for the shared google sheet tabs

In [2]:
# Path to your service account JSON
SERVICE_ACCOUNT_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/credentials_serviceaccount.json'

# Define API scopes
SCOPES = [
    'https://www.googleapis.com/auth/drive.readonly',
    'https://www.googleapis.com/auth/spreadsheets.readonly'
]

# Authenticate and build the Drive and Sheets services
creds = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)

drive_service = build('drive', 'v3', credentials=creds)
sheets_service = build('sheets', 'v4', credentials=creds)

# Main folder ID
MAIN_FOLDER_ID = '1nZ8l69GkB6EQQPoiOF-6CEVx2q3vAhcU'

keywords = ['Health', 'Education', 'Housing', 'Population', 'Labor', 'Poverty']
results = []

def list_files_in_folder(folder_id, folder_path):
    """
    Recursively lists Google Sheets in a folder and its subfolders.
    folder_path: list of folder names leading to current folder
    """

    # List subfolders
    query_folders = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder'"
    response_folders = drive_service.files().list(q=query_folders, fields="files(id, name)").execute()
    subfolders = response_folders.get('files', [])

    # Process subfolders recursively
    for subfolder in subfolders:
        subfolder_id = subfolder['id']
        subfolder_name = subfolder['name']
        list_files_in_folder(subfolder_id, folder_path + [subfolder_name])

    # List Google Sheets in the current folder
    query_files = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.spreadsheet'"
    response_files = drive_service.files().list(q=query_files, fields="files(id, name)").execute()
    files = response_files.get('files', [])

    for file in files:
        file_id = file['id']
        file_name = file['name']

        # Get sheet tabs and gids
        sheet_metadata = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
        sheets = sheet_metadata.get('sheets', [])

        for sheet in sheets:
            tab_name = sheet['properties']['title']

            if any(keyword.lower() in tab_name.lower() for keyword in keywords):
                gid = sheet['properties']['sheetId']
                url = f"https://docs.google.com/spreadsheets/d/{file_id}/edit#gid={gid}"

                results.append({
                    'folder_path': '/'.join(folder_path),
                    'file_name': file_name,
                    'tab_name': tab_name,
                    'gid': gid,
                    'url': url
                })

# NEW CHANGE: Manually define the list of folders you want to process.
# The script will look for these folder names inside the MAIN_FOLDER_ID.
submitted_folders = ["Tunisia"] # You can add more folder names here, e.g., ["Tunisia", "Algeria", "Egypt"]

# NEW CHANGE: Instead of processing everything in the main folder, we'll first find the folders
# that match the names in the 'submitted_folders' list.

# First, list all subfolders directly under the MAIN_FOLDER_ID
print("Searching for specified folders within the main directory...")
query_country_folders = f"'{MAIN_FOLDER_ID}' in parents and mimeType='application/vnd.google-apps.folder'"
response_country_folders = drive_service.files().list(q=query_country_folders, fields="files(id, name)").execute()
all_country_folders = response_country_folders.get('files', [])

# Now, loop through the folders we found
for country_folder in all_country_folders:
    folder_name = country_folder['name']
    folder_id = country_folder['id']

    # Check if this folder's name is in our manually created 'submitted_folders' list
    if folder_name in submitted_folders:
        # If it's a match, we start the recursive file search from this folder
        print(f"Found and processing folder: {folder_name}")
        # We pass ['Main', folder_name] to keep the folder path structure consistent with the original script
        list_files_in_folder(folder_id, ['Main', folder_name])

print("Finished searching for files in the specified folders.")


# Convert results to DataFrame
df_urls = pd.DataFrame(results)

#edit column url by replacing "/edit#gid=" with "/export?format=csv&gid="
df_urls['pandas_url']=df_urls['url'].apply(lambda x: x.replace('/edit#gid=', '/export?format=csv&gid='))

# Save to CSV
output_path = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/urls_gids.csv'
df_urls.to_csv(output_path, index=False)
print(f"URLs and GIDs saved to {output_path}")


Searching for specified folders within the main directory...
Found and processing folder: Tunisia
Finished searching for files in the specified folders.
URLs and GIDs saved to C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/urls_gids.csv


In [3]:
#Display head of the urls dataframe
print("\n--- URLs DataFrame Head ---")
print(df_urls.head(3))
print("-" * 25)


--- URLs DataFrame Head ---
           folder_path                              file_name        tab_name         gid                                                                                                      url                                                                                                            pandas_url
0  Main/Tunisia/Arabic  Tunisia_Population_Questionnaire 2025  Population_1_a  1667751247  https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/edit#gid=1667751247  https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/export?format=csv&gid=1667751247
1  Main/Tunisia/Arabic  Tunisia_Population_Questionnaire 2025  Population_1_b   592698394   https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/edit#gid=592698394   https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/export?format=csv&gid=592698394
2  Main/Tunisia/Arabic  Tunisia_

### Read in csvs from google drive

In [None]:
#read in the dataframe, separate Source row, reshape the dataframe and append the source row
def process_df(df, theme):
    #clean it from the null rows
    df1 = df.dropna(how='all')

    #get the row and column index where source appear
    source_row_index=df1[df1.isin(['المصدر','Source', 'source']).any(axis=1)].index[0]
    #column where source appear
    source_col=df.columns[df1.isin(['المصدر','Source', 'source']).any(axis=0)][0]
    #get the index of source_col
    source_col_index=df1.columns.get_loc(source_col)
    print(f'Source row and column index are: {source_row_index},{source_col_index}')

    #separating the dataframe between main dataframe and the source dataframe
    main_df=df1.iloc[0:source_row_index-1].reset_index(drop=True)
    source_df=df1.iloc[source_row_index-1 :,source_col_index :].reset_index(drop=True)


    #melt main dataframe, transpose the source dataframe and then merge together
    #get years columns
    year_columns = [col for col in df.columns if str(col).isdigit()]
    print(f'years columns are: {year_columns}, \n')

    id_variables= list(set(main_df.columns) - set(year_columns))

    df_main_long = pd.melt(main_df, 
                    id_vars=id_variables,     # columns to keep
                    value_vars=year_columns,  # columns to unpivot
                    var_name='Year', 
                    value_name='Value',
                    ignore_index=True)

    #transpose source dataframe
    source_df_T=source_df.T.reset_index()

    cols=['Year','Source']
    #remove the 1st row
    source_df_T=source_df_T.iloc[1:]
    source_df_T.columns=cols

    #merge the 2 dataframes
    merged_data=pd.merge(df_main_long, source_df_T, on='Year')

    merged_data['Theme'] = theme

    return merged_data

#########################################################################

df_ar=[]
df_en=[]
c=0
#loop over gid
# Extract the theme (e.g., 'Health', 'Education') from the tab_name.
for idx, row in df_urls.iterrows():

    theme = "Unknown" # Default value in case no keyword is found
    for keyword in keywords:
        if keyword.lower() in row['tab_name'].lower():
            theme = keyword
            break # Exit the loop once a match is found

    #for English questionnaires
    if 'English' in row['folder_path']:
        print(f"Processing English file: {row['folder_path']}/{row['file_name']} | Tab: {row['tab_name']} | Theme: {theme}")
        try:
            df=pd.read_csv(row['pandas_url'])
            # #################### START OF CODE CHANGE ####################
            # Pass the extracted 'theme' to the processing function.
            df_result=process_df(df, theme)
            # #################### END OF CODE CHANGE ####################
            if not df_result.empty:
                df_en.append(df_result)
        except Exception as e:
            print(f"Could not process English file {row['file_name']}. Error: {e}")

        c+=1
    
    #else process the Arabic questionnaires
    else:
        print(f"Processing Arabic file: {row['folder_path']}/{row['file_name']} | Tab: {row['tab_name']} | Theme: {theme}")
        try:
            df=pd.read_csv(row['pandas_url'])
            # #################### START OF CODE CHANGE ####################
            # Pass the extracted 'theme' to the processing function.
            df_result=process_df(df, theme)
            # #################### END OF CODE CHANGE ####################
            if not df_result.empty:
                df_ar.append(df_result)
        except Exception as e:
            print(f"Could not process Arabic file {row['file_name']}. Error: {e}")

        c+=1

In [None]:
# Concatenate the lists into final dataframes
if df_ar:
    final_ar_df = pd.concat(df_ar, ignore_index=True)
    #change the Theme in arabic
    final_ar_df['Theme']=final_ar_df['Theme'].map({'Population':'السكان', 'Health':'الصحة','Education':'التعليم','Housing':'السكن','Labor':'العمالة','Poverty':'الفقر'})
    #rename columns
    final_ar_df.rename(columns={'Year': 'السنة', 'Value': 'العدد','Source': 'المصدر', 'Theme':'الفصل'}, inplace=True)
    final_ar_df.to_excel("arabic_questionnaires.xlsx", index=False)

if df_en:
    final_en_df = pd.concat(df_en, ignore_index=True)
    final_en_df.to_excel("english_questionnaires.xlsx", index=False)

print("\nScript finished.")


Script finished.


In [7]:
final_ar_df.head()

Unnamed: 0,المؤشر,الجنس,المواطنة,الفئة العمرية,الدولة,السنة,العدد,المصدر,Theme,المنطقة,الحالة الزوجية,التصنيف الدولي لاسباب الوفاة,سبب الوفاة,أسباب البقاء خارج القوى العاملة,وضع العمالة,أقسام النشاط الإقتصادي,القطاع المؤسسي,أقسام المهن الرئيسية,نوع مكان الإقامة,نوع حيازة الوحدات السكنية,مصدر مياه الشرب,أنواع نظام التخلص من مياه الصرف الصحي,مصدر الإضاءة,الفئة,نوع الخدمات/المنتجات,المرحلة التعليمية
0,حجم السكان حسب المواطنة,ذكور,مجموع المواطنين وغير المواطنين,0-4 سنوات,تونس,2010,469637.27,المعهد الوطني للإحصاء,السكان,,,,,,,,,,,,,,,,,
1,حجم السكان حسب المواطنة,ذكور,مجموع المواطنين وغير المواطنين,5-9 سنوات,تونس,2010,440018.6,المعهد الوطني للإحصاء,السكان,,,,,,,,,,,,,,,,,
2,حجم السكان حسب المواطنة,ذكور,مجموع المواطنين وغير المواطنين,10-14 سنة,تونس,2010,452517.44,المعهد الوطني للإحصاء,السكان,,,,,,,,,,,,,,,,,
3,حجم السكان حسب المواطنة,ذكور,مجموع المواطنين وغير المواطنين,15-19 سنة,تونس,2010,473646.43,المعهد الوطني للإحصاء,السكان,,,,,,,,,,,,,,,,,
4,حجم السكان حسب المواطنة,ذكور,مجموع المواطنين وغير المواطنين,20-24 سنة,تونس,2010,478644.72,المعهد الوطني للإحصاء,السكان,,,,,,,,,,,,,,,,,


### Translate the datasets

In [None]:
#read in the dictionary excel file and make the English dimensions lower case
path='C:/Users/511232/United Nations/ESCWA-SD - Documents/General/SD/Teams/Demographic and SOCIAL S  Team/0- Compendium - Arab Society/Compendium 2025-2026'
df_translation=pd.read_excel(path+'/translation dict.xlsx')


#create a English to Arabic dictionary
En_Ar_dictionary={}

for dim in [d for d in df_translation['col_en'].unique() if d not in ['year', 'value', 'source']]:
    df_dim=df_translation[df_translation['col_en'].isin([dim.lower(), dim])].copy()
    En_Ar_dictionary.update(
        {dim:{'dim_values':dict(zip(df_dim['val_en'], df_dim['val_ar'])), 
              'dim': {df_dim['col_en'].unique()[0]:df_dim['col_ar'].unique()[0]}}})
    
Ar_En_dictionary={}

for dim in [d for d in df_translation['col_ar'].unique() if d not in ['السنة', 'العدد', 'المصدر']]:
    df_dim=df_translation[df_translation['col_ar'].isin([dim.lower(), dim])].copy()
    Ar_En_dictionary.update(
        {dim:{'dim_values':dict(zip(df_dim['val_ar'], df_dim['val_en'])), 
              'dim': {df_dim['col_ar'].unique()[0]:df_dim['col_en'].unique()[0]}}})


In [None]:
Ar_En_dictionary.keys()

dict_keys(['المؤشر', 'الدولة', 'المواطنة', 'الجنس', 'القطاع', 'المرحلة التعليمية', 'الفئة', 'نوع\xa0الخدمات/المنتجات', 'نوع مكان الإقامة', 'نوع حيازة الوحدات السكنية', 'المنطقة', 'مصدر مياه الشرب', 'أنواع نظام التخلص من مياه الصرف الصحي', 'مصدر الإضاءة', 'الفئة العمرية', 'أسباب البقاء خارج القوى العاملة', 'وضع العمالة', 'أقسام النشاط الإقتصادي', 'القطاع المؤسسي', 'أقسام المهن الرئيسية', 'أسباب الوفيات (نسبة مئوية) (ICD 10)', 'أسباب الوفيات (نسبة مئوية) (ICD 11)', 'الحالة الزوجية'])

In [None]:
class Translator:
    def __init__(self, translate_to, en_ar_dict=None,ar_en_dict=None):
        if translate_to.lower() not in ['english', 'arabic']:
            raise ValueError("Language must be 'english' or 'arabic'")

        self.translate_to = translate_to.lower()

        #Let the user pass their own translation dictionaries if they want, otherwise use the defaults:
        self.en_ar_dict = en_ar_dict if en_ar_dict is not None else En_Ar_dictionary
        self.ar_en_dict = ar_en_dict if ar_en_dict is not None else Ar_En_dictionary

    #translate the column names and values
    def translate(self, df):
        df_translated = df.copy()
        translation_dict = self.ar_en_dict if self.translate_to == 'english' else self.en_ar_dict

        for col, col_dict in translation_dict.items():
            if col in df_translated.columns:
                # Get the translated name
                new_col_name = list(col_dict['dim'].values())[0]  
                print(f'Column "{col}" is matched and translated to {col_dict['dim']}.')
                #rename the column values
                df_translated[col] = df_translated[col].replace(col_dict['dim_values'])
                #rename the column
                df_translated.rename(columns=col_dict['dim'], inplace=True)
                print(f'Translated values: {df_translated[new_col_name].unique()}\n')

        return df_translated

In [None]:
translator = Translator('english')
df_translated = translator.translate(df_combined)

In [None]:
df_translated.head()

Unnamed: 0,Country,Nationality,Sex,Indicator,Year,Value,Source,Sector,Education level
0,Somalia,Nationality Total,Male,Net enrolment rate in primary education (percent),2010,1,a,,
1,Somalia,Nationality Total,إناث,Net enrolment rate in primary education (percent),2010,1,a,,
2,Somalia,Nationality Total,Both sexes,Net enrolment rate in primary education (percent),2010,1,a,,
3,Somalia,Nationals,Male,Net enrolment rate in primary education (percent),2010,1,a,,
4,Somalia,Nationals,إناث,Net enrolment rate in primary education (percent),2010,1,a,,


### concatenate all arabic and english dataframes

### RESHAPE AND MERGE LOCALLY SAVED FILES

In [None]:
# import pandas as pd
# import numpy as np
# import os 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Do not truncate column content
# pd.set_option('display.width', 1000)        # Prevent wrapping of wide DataFrames

# # The script will read all Excel files (.xlsx, .xls) from this folder.
# LOCAL_FOLDER_PATH = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/Oman'

# # These keywords are used to identify relevant sheets by their name.
# keywords = ['Health', 'Education', 'Housing', 'Population', 'Labor', 'Poverty']

# # Read in sheets from the local Excel file
# #read in the dataframe, separate Source row, reshape the dataframe and append the source row
# def process_df(df, theme):
#     #clean it from the null rows
#     df1 = df.dropna(how='all')

#     #get the row and column index where source appear
#     try:
#         source_row_index=df1[df1.isin(['المصدر','Source', 'source']).any(axis=1)].index[0]
#     except IndexError:
#         print(f"Warning: 'Source' keyword not found in a sheet. Skipping processing for this sheet.")
#         return pd.DataFrame() # Return empty DataFrame if source is not found
        
#     #column where source appear
#     source_col=df.columns[df1.isin(['المصدر','Source', 'source']).any(axis=0)][0]
#     #get the index of source_col
#     source_col_index=df1.columns.get_loc(source_col)
#     print(f'Source row and column index are: {source_row_index},{source_col_index}')

#     #separating the dataframe between main dataframe and the source dataframe
#     main_df=df1.iloc[0:source_row_index-1].reset_index(drop=True)
#     source_df=df1.iloc[source_row_index-1 :,source_col_index :].reset_index(drop=True)


#     #melt main dataframe, transpose the source dataframe and then merge together
#     #get years columns
#     year_columns = [col for col in df.columns if str(col).isdigit()]
#     print(f'years columns are: {year_columns}, \n')

#     if not year_columns:
#         print("Warning: No year columns found to melt. Skipping this sheet.")
#         return pd.DataFrame()

#     id_variables= list(set(main_df.columns) - set(year_columns))

#     df_main_long = pd.melt(main_df, 
#                            id_vars=id_variables,    # columns to keep
#                            value_vars=year_columns, # columns to unpivot
#                            var_name='Year', 
#                            value_name='Value',
#                            ignore_index=True)

#     #transpose source dataframe
#     source_df_T=source_df.T.reset_index()

#     cols=['Year','Source']
#     #remove the 1st row
#     source_df_T=source_df_T.iloc[1:]
#     source_df_T.columns=cols

#     #merge the 2 dataframes
#     merged_data=pd.merge(df_main_long, source_df_T, on='Year')
#     merged_data['Theme'] = theme

#     return merged_data

# #########################################################################

# df_ar=[]
# df_en=[]
# c=0

# # reads all Excel files from the specified folder.
# try:
#     # Get a list of all files in the directory that end with .xlsx or .xls
#     excel_files = [f for f in os.listdir(LOCAL_FOLDER_PATH) if f.endswith(('.xlsx', '.xls'))]
    
#     if not excel_files:
#         print(f"Error: No Excel files found in the specified folder: {LOCAL_FOLDER_PATH}")
#     else:
#         print(f"Found Excel files to process: {excel_files}")

#     # Loop over each Excel file found in the folder
#     for excel_file in excel_files:
#         file_path = os.path.join(LOCAL_FOLDER_PATH, excel_file)
#         print(f"\n--- Processing File: {excel_file} ---")
        
#         # Load the excel file and get all sheet names
#         xls = pd.ExcelFile(file_path)
#         sheet_names = xls.sheet_names

#         print(f"Found sheets: {sheet_names}")

#         #loop over sheet names
#         for sheet_name in sheet_names:
#             # Check if the sheet name contains one of the keywords to see if we should process it
#             if any(keyword.lower() in sheet_name.lower() for keyword in keywords):
                
#                 # Determine the theme from the sheet name
#                 theme = "Unknown"
#                 for keyword in keywords:
#                     if keyword.lower() in sheet_name.lower():
#                         theme = keyword
#                         break # Exit the loop once a match is found

#                 # Read the specific sheet into a dataframe
#                 df = pd.read_excel(xls, sheet_name=sheet_name)

#                 # Determine language based on sheet name, defaulting to Arabic
#                 if 'english' in sheet_name.lower():
#                     print(f"Processing English sheet: {sheet_name} | Theme: {theme}")
#                     df_result = process_df(df, theme)
#                     if not df_result.empty:
#                         df_en.append(df_result)
#                 else:
#                     print(f"Processing Arabic sheet: {sheet_name} | Theme: {theme}")
#                     df_result = process_df(df, theme)
#                     if not df_result.empty:
#                         df_ar.append(df_result)
                
#                 c += 1
#             else:
#                 print(f"Skipping sheet '{sheet_name}' as it does not contain any target keywords.")

# except FileNotFoundError:
#     print(f"ERROR: The folder was not found at the specified path: {LOCAL_FOLDER_PATH}")
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")


# # Concatenate the lists into final dataframes
# if df_ar:
#     final_ar_df = pd.concat(df_ar, ignore_index=True)
#     #change the Theme in arabic
#     final_ar_df['Theme']=final_ar_df['Theme'].map({'Population':'السكان', 'Health':'الصحة','Education':'التعليم','Housing':'السكن','Labor':'العمالة','Poverty':'الفقر'})
#     #rename columns
#     final_ar_df.rename(columns={'Year': 'السنة', 'Value': 'العدد','Source': 'المصدر','Theme':'الفصل'}, inplace=True)
#     final_ar_df.to_excel("arabic_questionnaires.xlsx", index=False)


# if df_en:
#     final_en_df = pd.concat(df_en, ignore_index=True)
#     final_en_df.to_excel("english_questionnaires.xlsx", index=False)

# print(f"\nScript finished. Processed {c} relevant sheets.")

In [None]:
# final_ar_df.head()

In [None]:
# ### Translate the datasets
# #read in the dictionary excel file and make the English dimensions lower case
# path='C:/Users/511232/United Nations/ESCWA-SD - Documents/General/SD/Teams/Demographic and SOCIAL S  Team/0- Compendium - Arab Society/Compendium 2025-2026'

# try:
#     df_translation=pd.read_excel(path+'/translation dict.xlsx')

#     #create a English to Arabic dictionary
#     En_Ar_dictionary={}

#     for dim in [d for d in df_translation['col_en'].unique() if d not in ['year', 'value', 'source']]:
#         df_dim=df_translation[df_translation['col_en'].isin([dim.lower(), dim])].copy()
#         En_Ar_dictionary.update(
#             {dim:{'dim_values':dict(zip(df_dim['val_en'], df_dim['val_ar'])), 
#                   'dim': {df_dim['col_en'].unique()[0]:df_dim['col_ar'].unique()[0]}}})
        
#     Ar_En_dictionary={}

#     for dim in [d for d in df_translation['col_ar'].unique() if d not in ['السنة', 'العدد', 'المصدر']]:
#         df_dim=df_translation[df_translation['col_ar'].isin([dim.lower(), dim])].copy()
#         Ar_En_dictionary.update(
#             {dim:{'dim_values':dict(zip(df_dim['val_ar'], df_dim['val_en'])), 
#                   'dim': {df_dim['col_ar'].unique()[0]:df_dim['col_en'].unique()[0]}}})

# ###########################################################################################

#     class Translator:
#         def __init__(self, translate_to, en_ar_dict=None,ar_en_dict=None):
#             if translate_to.lower() not in ['english', 'arabic']:
#                 raise ValueError("Language must be 'english' or 'arabic'")

#             self.translate_to = translate_to.lower()

#             #Let the user pass their own translation dictionaries if they want, otherwise use the defaults:
#             self.en_ar_dict = en_ar_dict if en_ar_dict is not None else En_Ar_dictionary
#             self.ar_en_dict = ar_en_dict if ar_en_dict is not None else Ar_En_dictionary

#         #translate the column names and values
#         def translate(self, df):
#             if df.empty:
#                 print("Input DataFrame for translation is empty, skipping.")
#                 return df

#             df_translated = df.copy()
#             translation_dict = self.ar_en_dict if self.translate_to == 'english' else self.en_ar_dict

#             for col, col_dict in translation_dict.items():
#                 if col in df_translated.columns:
#                     # Get the translated name
#                     new_col_name = list(col_dict['dim'].values())[0]  
#                     print(f'Column "{col}" is matched and translated to {col_dict["dim"]}.')
#                     #rename the column values
#                     df_translated[col] = df_translated[col].replace(col_dict['dim_values'])
#                     #rename the column
#                     df_translated.rename(columns=col_dict['dim'], inplace=True)
#                     print(f'Translated values: {df_translated[new_col_name].unique()}\n')

#             return df_translated

#     # The Translator is set to 'english', so it expects an Arabic DataFrame as input.
#     # This now correctly translates the Arabic DataFrame if it exists.
#     if 'final_ar_df' in locals() and not final_ar_df.empty:
#         print("\n--- Translating Arabic DataFrame to English ---")
#         translator = Translator('english')
#         df_translated = translator.translate(final_ar_df)
#         print("\n--- Head of Translated DataFrame ---")
#         print(df_translated.head())
#         df_translated.to_csv("arabic_questionnaires_translated_to_english.csv", index=False)

# except FileNotFoundError:
#     print(f"\nWarning: Translation dictionary not found at '{path}/translation dict.xlsx'. Skipping translation.")
# except Exception as e:
#     print(f"\nAn error occurred during translation: {e}")

### concatenate all arabic and english dataframes

In [18]:
s = [2011, 2013, 2014, 2016, 2022, 2023, 2024]
bins = [2010, 2015, 2020, 2025]
binned_years = pd.cut(s, bins=bins, right=False)
print(binned_years)
window_counts = binned_years.value_counts()
print(window_counts)

[[2010, 2015), [2010, 2015), [2010, 2015), [2015, 2020), [2020, 2025), [2020, 2025), [2020, 2025)]
Categories (3, interval[int64, left]): [[2010, 2015) < [2015, 2020) < [2020, 2025)]
[2010, 2015)    3
[2015, 2020)    1
[2020, 2025)    3
Name: count, dtype: int64


#### check schema with pandera and rapidfuzz to match with possible different column names

In [None]:
import pandas as pd
from rapidfuzz import process
import pandera as pa
from pandera import Column, DataFrameSchema, Check
from pandera.errors import SchemaError
from textwrap import fill

In [None]:
STANDARD_SCHEMA = {
'Indicator': ['indicator','Ind','مؤشر','المؤشر'],
'Country': ['الدولة','البلد','country'],
'Nationality': ['المواطنة','المواطنية','nationality','nationalities'],
'Sex': ['sex','gender','الجنس'],
'Year': ['year','time','Time','سنة','السنة'],
'Value': ['value','number','Number','الرقم','عدد', 'العدد'],
'Comments': ['comments','notes','ملاحظة','المصدر','Source'],
'Sector': ['sector','القطاع'],
'Education level': ['edu level','Education_level','المرحلة التعليمية','التعليم']
}


In [None]:
#map columns to col names using fuzzy match
col_names = {}
for col, aliases in STANDARD_SCHEMA.items():
    match, score, _ = process.extractOne(col, df_translated.columns.tolist() + aliases)
    
    if score > 80:
        print(f'{match} has a score of {score} for column {col}')
        col_names[match] = col

#rename column names according to schema
df_ar_translated=df_translated.rename(col_names)

df_ar_translated.columns

Ind has a score of 90.0 for column Indicator
country has a score of 85.71428571428572 for column Country
nationality has a score of 90.9090909090909 for column Nationality
Year has a score of 100.0 for column Year
Value has a score of 100.0 for column Value
comments has a score of 87.5 for column Comments
sector has a score of 83.33333333333334 for column Sector
Education_level has a score of 93.33333333333333 for column Education level


Index(['الدولة', 'المواطنة', 'الجنس', 'المؤشر', 'Year', 'Value', 'Source', 'القطاع', 'المرحلة التعليمية'], dtype='object')

#### validate the dataframe column labels

In [None]:
df_ar_translated.dtypes

Indicator           object
Country             object
Nationality         object
Sex                 object
Year                 int64
Value              float64
Source              object
Sector              object
Education level     object
dtype: object

In [None]:
translation_df=pd.read_excel('translation dict.xlsx')
translation_df.head(2)

Unnamed: 0,col_en,val_en,col_ar,val_ar
0,Indicator,Net enrolment rate in primary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)
1,Indicator,Net enrolment rate in secondary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الثانوي (نسبة مئوية)


In [None]:
list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

['Primary', 'Preparatory', 'Secondary', 'Tertiary', 'All levels']

In [None]:
En_Ar_dictionary.keys()

In [None]:
allowed_indicator_values=list(translation_df.loc[translation_df['col_en']=='Indicator', 'val_en'].unique())
allowed_country_values=list(translation_df.loc[translation_df['col_en']=='Country', 'val_en'].unique())
allowed_nationality_values=list(translation_df.loc[translation_df['col_en']=='Nationality', 'val_en'].unique())
allowed_sex_values=list(translation_df.loc[translation_df['col_en']=='Sex', 'val_en'].unique())
allowed_year_values=[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024]
allowed_sector_values=list(translation_df.loc[translation_df['col_en']=='Sector', 'val_en'].unique())
allowed_edulevel_values=list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

#define schema
schema = DataFrameSchema({
    'Indicator': Column(str, checks=Check.isin(allowed_indicator_values)),
    'Country': Column(str, checks=Check.isin(allowed_country_values)),
    'Nationality': Column(str, checks=Check.isin(allowed_nationality_values),nullable=True),
    'Sex': Column(str, checks=Check.isin(allowed_sex_values),nullable=True),
    'Year': Column(int, checks=Check.isin(allowed_year_values)),
    'Sector': Column(str, checks=Check.isin(allowed_sector_values),nullable=True),
    'Education level': Column(str, checks=Check.isin(allowed_edulevel_values),nullable=True)     
})

#validate the dataframe
try:
    validated_df = schema.validate(df_ar_translated)
except SchemaError as err:
    wrapped_error = fill(str(err.args), width=150)
    print(wrapped_error)

("Column 'Indicator' failed element-wise validator number 0: isin(['Net enrolment rate in primary education (percent)', 'Net enrolment rate in
secondary education (percent)', 'Pupil-teacher ratio (percent)', 'Adult literacy rates', 'Youth literacy rates', 'Expenditure on education as a
percentage of total government expenditure  (percent)', 'Government expenditure on education as a percentage of GDP (percent)']) failure cases: wrong
ind",)


In [None]:
df_ar_translated.loc[0,'Indicator']='wrong ind'
df_ar_translated.head()

Unnamed: 0,Indicator,Country,Nationality,Sex,Year,Value,Source,Sector,Education level
0,wrong ind,Somalia,Nationality Total,Male,2010,1.0,a,,
1,Net enrolment rate in primary education (percent),Somalia,Nationality Total,,2010,1.0,a,,
2,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Both sexes,2010,1.0,a,,
3,Net enrolment rate in primary education (percent),Somalia,Nationals,Male,2010,1.0,a,,
4,Net enrolment rate in primary education (percent),Somalia,Nationals,,2010,1.0,a,,
