In [80]:
import pandas as pd
import numpy as np
import json


# Notebook-wide pandas display settings: render frames in full, without
# truncating columns, rows or cell text, and without wrapping wide frames.
display_options = {
    'display.max_columns': None,   # no cap on the number of columns shown
    'display.max_rows': None,      # no cap on the number of rows shown
    'display.max_colwidth': None,  # never shorten long cell contents
    'display.width': 1000,         # wide output so frames are not wrapped
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

### For the English questionnaire

In [81]:
# Folder and file name of the English questionnaire workbook
f_path = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires'
file = 'Somalia_Education_en.xlsx'

# sheet_name=None loads every sheet at once into a dict keyed by sheet name
workbook_path = f'{f_path}/{file}'
sheets_dict = pd.read_excel(workbook_path, sheet_name=None)
print(sheets_dict.keys())

dict_keys(['Education_1_a', 'Education_1_b', 'Education_2_a', 'Education_2_b', 'Education_2_c', 'Education_3', 'Education_4', 'Education_5', 'Education_6'])


In [82]:
# Convert every sheet of the questionnaire from wide to long format.
# For each sheet: drop fully empty rows, pull out the row that holds the
# per-year sources, melt the year columns into Year/Value pairs, attach the
# matching source to every melted row, and collect the result.
# Once all sheets are processed they are concatenated and written to a
# single Excel file.

df_list = []

for key, df in sheets_dict.items():

    print(f'processing sheet {key}')

    # clean it from the null rows
    df1 = df.dropna(how='all')

    # isolate the row(s) in which any cell mentions "source";
    # DataFrame.apply runs column by column here, .any(axis=1) folds per row
    mentions_source = df1.apply(
        lambda col: col.astype(str).str.contains('source', case=False)
    ).any(axis=1)
    source_row = df1[mentions_source]

    # remove it from the main DataFrame so it is not melted as data
    df_cleaned = df1.drop(source_row.index)

    # year columns are the ones whose label is made of digits only
    year_cols = [c for c in df_cleaned.columns if str(c).isdigit()]
    print(f'years columns: {year_cols}')

    # reshape into long: one row per (id columns, Year)
    df_long = df_cleaned.melt(
        id_vars=[c for c in df_cleaned.columns if c not in year_cols],
        value_vars=year_cols,
        var_name='Year',
        value_name='Value'
    )

    # attach the source of each year by looking it up in the source row,
    # so the result does not depend on the row order produced by melt
    if source_row.empty:
        # a sheet without a source row would otherwise fail with an
        # IndexError on .iloc[0]; keep the data and leave Source empty
        print(f'warning: no source row found in sheet {key}; Source left empty')
        df_final = df_long.assign(Source=np.nan)
    else:
        df_final = df_long.assign(Source=df_long['Year'].map(source_row.iloc[0]))

    print('finished processing')
    print(df_final.head(3), '\n')

    # append to df_list
    df_list.append(df_final)

# Build and write the consolidated file once, after every sheet is processed
# (doing this inside the loop rewrote the Excel file for each sheet).
# Rows with a missing Value are kept: no dropna on 'Value' is applied.
consolidated_data = pd.concat(df_list, ignore_index=True)
consolidated_data.to_excel('consolidated_data_en.xlsx', index=False)


processing sheet Education_1_a
years columns: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
finished processing
                                           Indicator  Country        Nationality         Sex  Year Value Source
0  Net enrolment rate in primary education (percent)  Somalia  Nationality Total        Male  2010     1      a
1  Net enrolment rate in primary education (percent)  Somalia  Nationality Total      Female  2010   NaN      a
2  Net enrolment rate in primary education (percent)  Somalia  Nationality Total  Both sexes  2010   NaN      a 

processing sheet Education_1_b
years columns: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
finished processing
                                             Indicator  Country        Nationality         Sex  Year Value Source
0  Net enrolment rate in secondary education (percent)  Somalia  Nationality Total        Male  2010     1      a
1  Net e

### For the Arabic questionnaire

In [83]:
# Folder and file name of the Arabic questionnaire workbook
f_path = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires'
file = 'Somalia_Education_ar.xlsx'

# sheet_name=None loads every sheet at once into a dict keyed by sheet name
workbook_path = f'{f_path}/{file}'
sheets_dict = pd.read_excel(workbook_path, sheet_name=None)
print(sheets_dict.keys())

dict_keys(['Education_1_a', 'Education_1_b', 'Education_2_a', 'Education_2_b', 'Education_2_c', 'Education_3', 'Education_4', 'Education_5', 'Education_6'])


In [84]:
# Convert every sheet of the questionnaire from wide to long format.
# For each sheet: drop fully empty rows, pull out the row that holds the
# per-year sources, melt the year columns into Year/Value pairs, attach the
# matching source to every melted row, and collect the result.
# Once all sheets are processed they are concatenated and written to a
# single Excel file.

df_list = []

for key, df in sheets_dict.items():

    print(f'processing sheet {key}')

    # clean it from the null rows
    df1 = df.dropna(how='all')

    # isolate the row(s) in which any cell contains the Arabic source marker;
    # DataFrame.apply runs column by column here, .any(axis=1) folds per row
    mentions_source = df1.apply(
        lambda col: col.astype(str).str.contains('المصدر', case=False)
    ).any(axis=1)
    source_row = df1[mentions_source]

    # remove it from the main DataFrame so it is not melted as data
    df_cleaned = df1.drop(source_row.index)

    # year columns are the ones whose label is made of digits only
    year_cols = [c for c in df_cleaned.columns if str(c).isdigit()]
    print(f'years columns: {year_cols}')

    # reshape into long: one row per (id columns, Year)
    df_long = df_cleaned.melt(
        id_vars=[c for c in df_cleaned.columns if c not in year_cols],
        value_vars=year_cols,
        var_name='Year',
        value_name='Value'
    )

    # attach the source of each year by looking it up in the source row,
    # so the result does not depend on the row order produced by melt
    if source_row.empty:
        # a sheet without a source row would otherwise fail with an
        # IndexError on .iloc[0]; keep the data and leave Source empty
        print(f'warning: no source row found in sheet {key}; Source left empty')
        df_final = df_long.assign(Source=np.nan)
    else:
        df_final = df_long.assign(Source=df_long['Year'].map(source_row.iloc[0]))

    print('finished processing')
    print(df_final.head(3), '\n')

    # append to df_list
    df_list.append(df_final)

# Build and write the consolidated file once, after every sheet is processed
# (doing this inside the loop rewrote the Excel file for each sheet).
# Rows with a missing Value are kept: no dropna on 'Value' is applied.
consolidated_data = pd.concat(df_list, ignore_index=True)
consolidated_data.to_excel('consolidated_data_ar.xlsx', index=False)


processing sheet Education_1_a
years columns: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
finished processing
                                                                   المؤشر   الدولة                        المواطنة        الجنس  Year Value Source
0  معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)  الصومال  مجموع المواطنين وغير المواطنين         ذكور  2010     1      a
1  معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)  الصومال  مجموع المواطنين وغير المواطنين         إناث  2010     1      a
2  معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)  الصومال  مجموع المواطنين وغير المواطنين  كلا الجنسين  2010     1      a 

processing sheet Education_1_b
years columns: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
finished processing
                                                       المؤشر   الدولة                        ال

### create a dictionary for translation

In [85]:
# Read the English and Arabic consolidated workbooks back in, in that order.
# NOTE(review): the cells above write these files with a relative path, so
# this presumably expects the notebook to run from the 'codes' folder - confirm.
df_en, df_ar = (
    pd.read_excel(consolidated_path)
    for consolidated_path in (
        'C:/Users/511232/Desktop/DSS/Consolidated questionnaires/codes/consolidated_data_en.xlsx',
        'C:/Users/511232/Desktop/DSS/Consolidated questionnaires/codes/consolidated_data_ar.xlsx',
    )
)

In [86]:
# ---- column-name dictionary (Arabic header -> English header) ----
# Headers are paired by position, so both frames must have the same number of
# columns; zip would otherwise silently drop the unmatched ones.
if len(df_ar.columns) != len(df_en.columns):
    raise ValueError(
        f'column count mismatch: {len(df_ar.columns)} Arabic vs '
        f'{len(df_en.columns)} English columns'
    )
column_dict = dict(zip(df_ar.columns, df_en.columns))

# save to JSON; ensure_ascii=False keeps non-ASCII characters (like Arabic)
# in their original form instead of escaping them as \u0627
with open('columns_dict_ar_to_en.json', 'w', encoding='utf-8') as f:
    json.dump(column_dict, f, ensure_ascii=False, indent=2)


# ---- value dictionary (per column: Arabic value -> English value) ----
# Year, Value and Source are data columns, not labels, so they are skipped.
cols_to_exclude = ['Year', 'Value', 'Source']
cols_list = [
    (c_ar, c_en)
    for c_ar, c_en in zip(df_ar.columns, df_en.columns)
    if c_en not in cols_to_exclude
]

# keys are the Arabic column names; each value maps the column's distinct
# Arabic entries to the English entries in the same order of first appearance
values_dict = {}
for ind_ar, ind_en in cols_list:
    # Missing values need no translation. Dropping them also keeps NaN out of
    # the JSON file (json would write it as the non-standard token NaN) and
    # stops a NaN in one language from shifting the pairing of the others.
    unique_ar = df_ar[ind_ar].dropna().unique()
    unique_en = df_en[ind_en].dropna().unique()

    # zip truncates to the shorter input; a count mismatch means the pairing
    # cannot be trusted, so fail loudly instead of saving a wrong dictionary
    if len(unique_ar) != len(unique_en):
        raise ValueError(
            f'cannot pair values of column {ind_en!r}: '
            f'{len(unique_ar)} distinct Arabic vs {len(unique_en)} distinct English values'
        )

    values_dict[ind_ar] = dict(zip(unique_ar, unique_en))

# save to JSON; ensure_ascii=False keeps non-ASCII characters (like Arabic)
# in their original form instead of escaping them as \u0627
with open('values_dict_ar_to_en.json', 'w', encoding='utf-8') as f:
    json.dump(values_dict, f, ensure_ascii=False, indent=2)

### Translate Arabic to English

In [87]:
# Swap Arabic cell values for their English equivalents, one column at a time.
# Only dictionary columns that are present in df_ar are touched; entries that
# have no key in the dictionary come back from .map as NaN.
columns_to_translate = [name for name in values_dict if name in df_ar.columns]
for arabic_name in columns_to_translate:
    lookup = values_dict[arabic_name]
    df_ar[arabic_name] = df_ar[arabic_name].map(lookup)

# Switch the column headers from Arabic to English
df_ar = df_ar.rename(columns=column_dict)

In [88]:
# Display df_ar after its values were mapped and its columns renamed to English.
df_ar

Unnamed: 0,Indicator,Country,Nationality,Sex,Year,Value,Source,Sector,Education level
0,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Male,2010,1.0,a,,
1,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Female,2010,1.0,a,,
2,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Both sexes,2010,1.0,a,,
3,Net enrolment rate in primary education (percent),Somalia,Nationals,Male,2010,1.0,a,,
4,Net enrolment rate in primary education (percent),Somalia,Nationals,Female,2010,1.0,a,,
5,Net enrolment rate in primary education (percent),Somalia,Nationals,Both sexes,2010,1.0,a,,
6,Net enrolment rate in primary education (percent),Somalia,Non-nationals,Male,2010,1.0,a,,
7,Net enrolment rate in primary education (percent),Somalia,Non-nationals,Female,2010,1.0,a,,
8,Net enrolment rate in primary education (percent),Somalia,Non-nationals,Both sexes,2010,1.0,a,,
9,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Male,2011,,,,
