In [1]:
import pandas as pd
import numpy as np
import json
from google_auth_oauthlib.flow import InstalledAppFlow
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_colwidth', None) # Do not truncate column content
pd.set_option('display.width', 1000)        # Prevent wrapping of wide DataFrames

### Generating urls and gids for the shared google sheet tabs

In [4]:
# Path to your service account JSON
SERVICE_ACCOUNT_FILE = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/credentials_serviceaccount.json'

# Define API scopes
SCOPES = [
    'https://www.googleapis.com/auth/drive.readonly',
    'https://www.googleapis.com/auth/spreadsheets.readonly'
]

# Authenticate and build the Drive and Sheets services
creds = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)

drive_service = build('drive', 'v3', credentials=creds)
sheets_service = build('sheets', 'v4', credentials=creds)

# Main folder ID
MAIN_FOLDER_ID = '1nZ8l69GkB6EQQPoiOF-6CEVx2q3vAhcU'

keywords = ['Health', 'Education', 'Housing', 'Population', 'Labor', 'Poverty']
results = []

def list_files_in_folder(folder_id, folder_path):
    """
    Recursively lists Google Sheets in a folder and its subfolders.
    folder_path: list of folder names leading to current folder
    """

    # List subfolders
    query_folders = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder'"
    response_folders = drive_service.files().list(q=query_folders, fields="files(id, name)").execute()
    subfolders = response_folders.get('files', [])

    # Process subfolders recursively
    for subfolder in subfolders:
        subfolder_id = subfolder['id']
        subfolder_name = subfolder['name']
        list_files_in_folder(subfolder_id, folder_path + [subfolder_name])

    # List Google Sheets in the current folder
    query_files = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.spreadsheet'"
    response_files = drive_service.files().list(q=query_files, fields="files(id, name)").execute()
    files = response_files.get('files', [])

    for file in files:
        file_id = file['id']
        file_name = file['name']

        # Get sheet tabs and gids
        sheet_metadata = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
        sheets = sheet_metadata.get('sheets', [])

        for sheet in sheets:
            tab_name = sheet['properties']['title']

            if any(keyword.lower() in tab_name.lower() for keyword in keywords):
                gid = sheet['properties']['sheetId']
                url = f"https://docs.google.com/spreadsheets/d/{file_id}/edit#gid={gid}"

                results.append({
                    'folder_path': '/'.join(folder_path),
                    'file_name': file_name,
                    'tab_name': tab_name,
                    'gid': gid,
                    'url': url
                })

# Start recursion from MAIN_FOLDER_ID
main_folder_name = 'Main'  # optional, or you can retrieve its name via drive_service if needed
list_files_in_folder(MAIN_FOLDER_ID, [main_folder_name])

# Convert results to DataFrame
df_urls = pd.DataFrame(results)

#edit column url by replacing "/edit#gid=" with "/export?format=csv&gid="
df_urls['pandas_url']=df_urls['url'].apply(lambda x: x.replace('/edit#gid=', '/export?format=csv&gid='))

# Save to CSV
output_path = 'C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/urls_gids.csv'
df_urls.to_csv(output_path, index=False)
print(f"URLs and GIDs saved to {output_path}")


URLs and GIDs saved to C:/Users/511232/Desktop/DSS/MERGING GOOGLESHEETS QUESTIONNAIRES/online questionnnaires/urls_gids.csv


### loop though urls - read in dataframes, reshape, add the sources column and append to a list - concatenate

In [5]:
#Display head of the urls dataframe
print("\n--- URLs DataFrame Head ---")
print(df_urls.head(3))
print("-" * 25)



--- URLs DataFrame Head ---
           folder_path                              file_name        tab_name         gid                                                                                                      url                                                                                                            pandas_url
0  Main/Tunisia/Arabic  Tunisia_Population_Questionnaire 2025  Population_1_a  1667751247  https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/edit#gid=1667751247  https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/export?format=csv&gid=1667751247
1  Main/Tunisia/Arabic  Tunisia_Population_Questionnaire 2025  Population_1_b   592698394   https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/edit#gid=592698394   https://docs.google.com/spreadsheets/d/1FHAP3PBdEYi14K8MhJgUsVnT28FQUji8THTTaU_QS2Y/export?format=csv&gid=592698394
2  Main/Tunisia/Arabic  Tunisia_

### Read in csvs from google drive

In [6]:
# The function is updated to accept a 'theme' argument which will be added as a column.
def process_df(df, theme):
    """
    Cleans, reshapes a dataframe, merges it with its source information, and adds a theme column.
    """
    #clean it from the null rows
    df1 = df.dropna(how='all')

    #get the row and column index where source appear
    try:
        source_row_index=df1[df1.isin(['المصدر','Source', 'source']).any(axis=1)].index[0]
    except IndexError:
        print("Warning: 'Source' keyword not found. The sheet might have an unexpected format.")
        return pd.DataFrame() # Return empty DataFrame if source is not found

    #column where source appear
    source_col=df.columns[df1.isin(['المصدر','Source', 'source']).any(axis=0)][0]
    #get the index of source_col
    source_col_index=df1.columns.get_loc(source_col)
    print(f'Source row and column index are: {source_row_index},{source_col_index}')

    #separating the dataframe between main dataframe and the source dataframe
    main_df=df1.iloc[0:source_row_index-1].reset_index(drop=True)
    source_df=df1.iloc[source_row_index-1 :,source_col_index :].reset_index(drop=True)


    #melt main dataframe, transpose the source dataframe and then merge together
    #get years columns
    year_columns = [col for col in df.columns if str(col).isdigit()]
    print(f'years columns are: {year_columns}, \n')

    # Handle cases where there are no year columns
    if not year_columns:
        print("Warning: No year columns found to melt.")
        return pd.DataFrame()

    id_variables= list(set(main_df.columns) - set(year_columns))

    df_main_long = pd.melt(main_df, 
                           id_vars=id_variables,    # columns to keep
                           value_vars=year_columns, # columns to unpivot
                           var_name='Year', 
                           value_name='Value',
                           ignore_index=True)

    #transpose source dataframe
    source_df_T=source_df.T.reset_index()

    cols=['Year','Source']
    #remove the 1st row
    source_df_T=source_df_T.iloc[1:]
    source_df_T.columns=cols

    #merge the 2 dataframes
    merged_data=pd.merge(df_main_long, source_df_T, on='Year')

    # A new 'Theme' column is created and populated with the theme passed to the function.
    merged_data['Theme'] = theme

    return merged_data

    
#########################################################################

df_ar=[]
df_en=[]
c=0
#loop over gid
for idx, row in df_urls.iterrows():

    # Extract the theme (e.g., 'Health', 'Education') from the tab_name.
    # This loop finds which of your keywords is present in the tab name for the current row.
    theme = "Unknown" # Default value in case no keyword is found
    for keyword in keywords:
        if keyword.lower() in row['tab_name'].lower():
            theme = keyword
            break # Exit the loop once a match is found

    #for English questionnaires
    if 'English' in row['folder_path']:
        print(f"Processing English file: {row['folder_path']}/{row['file_name']} | Tab: {row['tab_name']} | Theme: {theme}")
        try:
            df=pd.read_csv(row['pandas_url'])
            # Pass the extracted 'theme' to the processing function.
            df_result=process_df(df, theme)
            if not df_result.empty:
                df_en.append(df_result)
        except Exception as e:
            print(f"Could not process English file {row['file_name']}. Error: {e}")
        c+=1
    
    #else process the Arabic questionnaires
    else:
        print(f"Processing Arabic file: {row['folder_path']}/{row['file_name']} | Tab: {row['tab_name']} | Theme: {theme}")
        try:
            df=pd.read_csv(row['pandas_url'])
            # Pass the extracted 'theme' to the processing function.
            df_result=process_df(df, theme)
            if not df_result.empty:
                df_ar.append(df_result)
        except Exception as e:
            print(f"Could not process Arabic file {row['file_name']}. Error: {e}")
        c+=1
    
    if c==5:
        break


Processing Arabic file: Main/Tunisia/Arabic/Tunisia_Population_Questionnaire 2025 | Tab: Population_1_a | Theme: Population
Source row and column index are: 163,4
years columns are: ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'], 

Processing Arabic file: Main/Tunisia/Arabic/Tunisia_Population_Questionnaire 2025 | Tab: Population_1_b | Theme: Population
Source row and column index are: 109,4
years columns are: ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'], 

Processing Arabic file: Main/Tunisia/Arabic/Tunisia_Population_Questionnaire 2025 | Tab: Population_2 | Theme: Population
Source row and column index are: 2,1
years columns are: ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'], 

Processing Arabic file: Main/Tunisia/Arabic/Tunisia_Population_Questionnaire 2025 | Ta

In [7]:
df_ar[0].head()

Unnamed: 0,الجنس,الدولة,الفئة العمرية,المؤشر,المواطنة,Year,Value,Source,Theme
0,ذكور,تونس,0-4 سنوات,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,469637.27,المعهد الوطني للإحصاء,Population
1,ذكور,تونس,5-9 سنوات,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,440018.6,المعهد الوطني للإحصاء,Population
2,ذكور,تونس,10-14 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,452517.44,المعهد الوطني للإحصاء,Population
3,ذكور,تونس,15-19 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,473646.43,المعهد الوطني للإحصاء,Population
4,ذكور,تونس,20-24 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,478644.72,المعهد الوطني للإحصاء,Population


In [12]:
# Concatenate the lists into final dataframes
if df_ar:
    final_ar_df = pd.concat(df_ar, ignore_index=True)
    #change the Theme in arabic
    final_ar_df['Theme']=final_ar_df['Theme'].map({'Population':'السكان', 'Health':'الصحة','Education':'التعليم','Housing':'السكن','Labor':'العمالة','Poverty':'الفقر'})
    #rename columns
    final_ar_df.rename(columns={'Year': 'السنة', 'Value': 'العدد','Source': 'المصدر', 'Theme':'الفصل'}, inplace=True)
    final_ar_df.to_excel("arabic_questionnaires.xlsx", index=False)


if df_en:
    final_en_df = pd.concat(df_en, ignore_index=True)
    final_en_df.to_excel("english_questionnaires.xlsx", index=False)

print("\nScript finished.")


Script finished.


In [11]:
final_ar_df.head()

Unnamed: 0,الجنس,الدولة,الفئة العمرية,المؤشر,المواطنة,السنة,العدد,المصدر,الفصل,المنطقة
0,ذكور,تونس,0-4 سنوات,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,469637.27,المعهد الوطني للإحصاء,السكان,
1,ذكور,تونس,5-9 سنوات,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,440018.6,المعهد الوطني للإحصاء,السكان,
2,ذكور,تونس,10-14 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,452517.44,المعهد الوطني للإحصاء,السكان,
3,ذكور,تونس,15-19 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,473646.43,المعهد الوطني للإحصاء,السكان,
4,ذكور,تونس,20-24 سنة,حجم السكان حسب المواطنة,مجموع المواطنين وغير المواطنين,2010,478644.72,المعهد الوطني للإحصاء,السكان,


In [None]:
len(housing_dfs)

<function len(obj, /)>

### Translate the datasets

In [47]:
#read in the dictionary excel file and make the English dimensions lower case
path='C:/Users/511232/United Nations/ESCWA-SD - Documents/General/SD/Teams/Demographic and SOCIAL S  Team/0- Compendium - Arab Society/Compendium 2025-2026'
df_translation=pd.read_excel(path+'/translation dict.xlsx')


#create a English to Arabic dictionary
En_Ar_dictionary={}

for dim in [d for d in df_translation['col_en'].unique() if d not in ['year', 'value', 'source']]:
    df_dim=df_translation[df_translation['col_en'].isin([dim.lower(), dim])].copy()
    En_Ar_dictionary.update(
        {dim:{'dim_values':dict(zip(df_dim['val_en'], df_dim['val_ar'])), 
              'dim': {df_dim['col_en'].unique()[0]:df_dim['col_ar'].unique()[0]}}})
    
Ar_En_dictionary={}

for dim in [d for d in df_translation['col_ar'].unique() if d not in ['السنة', 'العدد', 'المصدر']]:
    df_dim=df_translation[df_translation['col_ar'].isin([dim.lower(), dim])].copy()
    Ar_En_dictionary.update(
        {dim:{'dim_values':dict(zip(df_dim['val_ar'], df_dim['val_en'])), 
              'dim': {df_dim['col_ar'].unique()[0]:df_dim['col_en'].unique()[0]}}})


In [48]:
Ar_En_dictionary.keys()

dict_keys(['المؤشر', 'الدولة', 'المواطنة', 'الجنس', 'القطاع', 'المرحلة التعليمية', 'الفئة', 'نوع\xa0الخدمات/المنتجات', 'نوع مكان الإقامة', 'نوع حيازة الوحدات السكنية', 'المنطقة', 'مصدر مياه الشرب', 'أنواع نظام التخلص من مياه الصرف الصحي', 'مصدر الإضاءة', 'الفئة العمرية', 'أسباب البقاء خارج القوى العاملة', 'وضع العمالة', 'أقسام النشاط الإقتصادي', 'القطاع المؤسسي', 'أقسام المهن الرئيسية', 'أسباب الوفيات (نسبة مئوية) (ICD 10)', 'أسباب الوفيات (نسبة مئوية) (ICD 11)', 'الحالة الزوجية'])

In [51]:
class Translator:
    def __init__(self, translate_to, en_ar_dict=None,ar_en_dict=None):
        if translate_to.lower() not in ['english', 'arabic']:
            raise ValueError("Language must be 'english' or 'arabic'")

        self.translate_to = translate_to.lower()

        #Let the user pass their own translation dictionaries if they want, otherwise use the defaults:
        self.en_ar_dict = en_ar_dict if en_ar_dict is not None else En_Ar_dictionary
        self.ar_en_dict = ar_en_dict if ar_en_dict is not None else Ar_En_dictionary

    #translate the column names and values
    def translate(self, df):
        df_translated = df.copy()
        translation_dict = self.ar_en_dict if self.translate_to == 'english' else self.en_ar_dict

        for col, col_dict in translation_dict.items():
            if col in df_translated.columns:
                # Get the translated name
                new_col_name = list(col_dict['dim'].values())[0]  
                print(f'Column "{col}" is matched and translated to {col_dict['dim']}.')
                #rename the column values
                df_translated[col] = df_translated[col].replace(col_dict['dim_values'])
                #rename the column
                df_translated.rename(columns=col_dict['dim'], inplace=True)
                print(f'Translated values: {df_translated[new_col_name].unique()}\n')

        return df_translated

In [None]:
translator = Translator('english')
df_translated = translator.translate(df_combined)

In [53]:
df_translated.head()

Unnamed: 0,Country,Nationality,Sex,Indicator,Year,Value,Source,Sector,Education level
0,Somalia,Nationality Total,Male,Net enrolment rate in primary education (percent),2010,1,a,,
1,Somalia,Nationality Total,إناث,Net enrolment rate in primary education (percent),2010,1,a,,
2,Somalia,Nationality Total,Both sexes,Net enrolment rate in primary education (percent),2010,1,a,,
3,Somalia,Nationals,Male,Net enrolment rate in primary education (percent),2010,1,a,,
4,Somalia,Nationals,إناث,Net enrolment rate in primary education (percent),2010,1,a,,


#### check schema with pandera and rapidfuzz to match with possible different column names

In [13]:
import pandas as pd
from rapidfuzz import process
import pandera as pa
from pandera import Column, DataFrameSchema, Check
from pandera.errors import SchemaError
from textwrap import fill

In [14]:
STANDARD_SCHEMA = {
'Indicator': ['indicator','Ind','مؤشر','المؤشر'],
'Country': ['الدولة','البلد','country'],
'Nationality': ['المواطنة','المواطنية','nationality','nationalities'],
'Sex': ['sex','gender','الجنس'],
'Year': ['year','time','Time','سنة','السنة'],
'Value': ['value','number','Number','الرقم','عدد', 'العدد'],
'Comments': ['comments','notes','ملاحظة','المصدر','Source'],
'Sector': ['sector','القطاع'],
'Education level': ['edu level','Education_level','المرحلة التعليمية','التعليم']
}


In [15]:
#map columns to col names using fuzzy match
col_names = {}
for col, aliases in STANDARD_SCHEMA.items():
    match, score, _ = process.extractOne(col, df_translated.columns.tolist() + aliases)
    
    if score > 80:
        print(f'{match} has a score of {score} for column {col}')
        col_names[match] = col

#rename column names according to schema
df_ar_translated=df_translated.rename(col_names)

df_ar_translated.columns

Ind has a score of 90.0 for column Indicator
country has a score of 85.71428571428572 for column Country
nationality has a score of 90.9090909090909 for column Nationality
Year has a score of 100.0 for column Year
Value has a score of 100.0 for column Value
comments has a score of 87.5 for column Comments
sector has a score of 83.33333333333334 for column Sector
Education_level has a score of 93.33333333333333 for column Education level


Index(['الدولة', 'المواطنة', 'الجنس', 'المؤشر', 'Year', 'Value', 'Source', 'القطاع', 'المرحلة التعليمية'], dtype='object')

#### validate the dataframe column labels

In [94]:
df_ar_translated.dtypes

Indicator           object
Country             object
Nationality         object
Sex                 object
Year                 int64
Value              float64
Source              object
Sector              object
Education level     object
dtype: object

In [28]:
translation_df=pd.read_excel('translation dict.xlsx')
translation_df.head(2)

Unnamed: 0,col_en,val_en,col_ar,val_ar
0,Indicator,Net enrolment rate in primary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)
1,Indicator,Net enrolment rate in secondary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الثانوي (نسبة مئوية)


In [38]:
list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

['Primary', 'Preparatory', 'Secondary', 'Tertiary', 'All levels']

In [None]:
En_Ar_dictionary.keys()

In [43]:
allowed_indicator_values=list(translation_df.loc[translation_df['col_en']=='Indicator', 'val_en'].unique())
allowed_country_values=list(translation_df.loc[translation_df['col_en']=='Country', 'val_en'].unique())
allowed_nationality_values=list(translation_df.loc[translation_df['col_en']=='Nationality', 'val_en'].unique())
allowed_sex_values=list(translation_df.loc[translation_df['col_en']=='Sex', 'val_en'].unique())
allowed_year_values=[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024]
allowed_sector_values=list(translation_df.loc[translation_df['col_en']=='Sector', 'val_en'].unique())
allowed_edulevel_values=list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

#define schema
schema = DataFrameSchema({
    'Indicator': Column(str, checks=Check.isin(allowed_indicator_values)),
    'Country': Column(str, checks=Check.isin(allowed_country_values)),
    'Nationality': Column(str, checks=Check.isin(allowed_nationality_values),nullable=True),
    'Sex': Column(str, checks=Check.isin(allowed_sex_values),nullable=True),
    'Year': Column(int, checks=Check.isin(allowed_year_values)),
    'Sector': Column(str, checks=Check.isin(allowed_sector_values),nullable=True),
    'Education level': Column(str, checks=Check.isin(allowed_edulevel_values),nullable=True)     
})

#validate the dataframe
try:
    validated_df = schema.validate(df_ar_translated)
except SchemaError as err:
    wrapped_error = fill(str(err.args), width=150)
    print(wrapped_error)

("Column 'Indicator' failed element-wise validator number 0: isin(['Net enrolment rate in primary education (percent)', 'Net enrolment rate in
secondary education (percent)', 'Pupil-teacher ratio (percent)', 'Adult literacy rates', 'Youth literacy rates', 'Expenditure on education as a
percentage of total government expenditure  (percent)', 'Government expenditure on education as a percentage of GDP (percent)']) failure cases: wrong
ind",)


In [42]:
df_ar_translated.loc[0,'Indicator']='wrong ind'
df_ar_translated.head()

Unnamed: 0,Indicator,Country,Nationality,Sex,Year,Value,Source,Sector,Education level
0,wrong ind,Somalia,Nationality Total,Male,2010,1.0,a,,
1,Net enrolment rate in primary education (percent),Somalia,Nationality Total,,2010,1.0,a,,
2,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Both sexes,2010,1.0,a,,
3,Net enrolment rate in primary education (percent),Somalia,Nationals,Male,2010,1.0,a,,
4,Net enrolment rate in primary education (percent),Somalia,Nationals,,2010,1.0,a,,


In [None]:
# #for Algeria alone

# # Path to your service account JSON
# SERVICE_ACCOUNT_FILE = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires/online questionnnaires/credentials_serviceaccount.json'

# SCOPES = [
#     'https://www.googleapis.com/auth/drive.readonly',
#     'https://www.googleapis.com/auth/spreadsheets.readonly'
# ]

# FOLDER_ID = '1ftIKuPB_bnSKPiBNLblOtaz-11WQDUwq'  # Algeria folder ID

# # -----------------------------
# # AUTHENTICATE
# # -----------------------------
# creds = Credentials.from_service_account_file(
#     SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# drive_service = build('drive', 'v3', credentials=creds)
# sheets_service = build('sheets', 'v4', credentials=creds)

# # -----------------------------
# # LIST GOOGLE SHEETS IN ALGERIA
# # -----------------------------
# query = f"'{FOLDER_ID}' in parents and mimeType='application/vnd.google-apps.spreadsheet'"
# response = drive_service.files().list(q=query, fields="files(id, name)").execute()
# files = response.get('files', [])

# results = []

# keywords=['Health', 'Education','Housing', 'Population', 'Labor', 'Poverty']

# for file in files:
#     file_id = file['id']
#     file_name = file['name']

#     # Get sheet tabs
#     sheet_metadata = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
#     sheets = sheet_metadata.get('sheets', [])

#     for sheet in sheets:
#         tab_name = sheet['properties']['title']

#         # Filter tabs with required keywords
#         if any(keyword.lower() in tab_name.lower() for keyword in keywords):
#             gid = sheet['properties']['sheetId']
#             url = f"https://docs.google.com/spreadsheets/d/{file_id}/edit#gid={gid}"

#             results.append({
#                 'file_name': file_name,
#                 'tab_name': tab_name,
#                 'gid': gid,
#                 'url': url
#             })

# # Display results
# df = pd.DataFrame(results)
# df.head()


In [None]:
# # Path to your service account JSON
# SERVICE_ACCOUNT_FILE = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires/online questionnnaires/credentials_serviceaccount.json'

# # Define API scopes
# SCOPES = [
#     'https://www.googleapis.com/auth/drive.readonly',
#     'https://www.googleapis.com/auth/spreadsheets.readonly'
# ]

# # Authenticate and build the Drive and Sheets services
# creds = Credentials.from_service_account_file(
#     SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# # Initialize Drive and Sheets services
# drive_service = build('drive', 'v3', credentials=creds)
# sheets_service = build('sheets', 'v4', credentials=creds)
# #######################################################################################################

# # Set main folder ID
# MAIN_FOLDER_ID = '1nZ8l69GkB6EQQPoiOF-6CEVx2q3vAhcU'

# # List all country subfolders in main folder
# query = f"'{MAIN_FOLDER_ID}' in parents and mimeType='application/vnd.google-apps.folder'"
# response = drive_service.files().list(q=query, fields="files(id, name)").execute()
# country_folders = response.get('files', [])

# results = []

# # Loop through each country folder
# keywords=['Health', 'Education','Housing', 'Population', 'Labor', 'Poverty']

# for folder in country_folders:
#     folder_id = folder['id']
#     folder_name = folder['name']
    
#     # List Google Sheets in the country folder
#     query_files = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.spreadsheet'"
#     response_files = drive_service.files().list(q=query_files, fields="files(id, name)").execute()
#     files = response_files.get('files', [])

#     for file in files:
#         file_id = file['id']
#         file_name = file['name']
        
#         #Get sheet tabs and gids
#         sheet_metadata = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
#         sheets = sheet_metadata.get('sheets', [])

#         for sheet in sheets:
#             tab_name = sheet['properties']['title']

#             if any(keyword.lower() in tab_name.lower() for keyword in keywords):

#                 gid = sheet['properties']['sheetId']
#                 url = f"https://docs.google.com/spreadsheets/d/{file_id}/edit#gid={gid}"

#                 results.append({
#                     'country_folder': folder_name,
#                     'file_name': file_name,
#                     'tab_name': tab_name,
#                     'gid': gid,
#                     'url': url
#                 })

# # Display results
# df_urls = pd.DataFrame(results)

# #make the url csv readable

# # df_urls.to_csv('C:/Users/511232/Desktop/DSS/Consolidated questionnaires/online questionnnaires/sheet_tabs_urls_all_countries.csv', index=False)
