In [3]:
import pandas as pd
import numpy as np
import json


pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_colwidth', None) # Do not truncate column content
pd.set_option('display.width', 1000)        # Prevent wrapping of wide DataFrames

### for english questionnaire

In [4]:
f_path = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires'
file='Somalia_Education_en.xlsx'

sheets_dict = pd.read_excel(f_path+'/'+file, sheet_name=None)  # Reads all sheets at once
print(sheets_dict.keys())

dict_keys(['Education_1_a', 'Education_1_b', 'Education_2_a', 'Education_2_b', 'Education_2_c', 'Education_3', 'Education_4', 'Education_5', 'Education_6'])


In [11]:
'''loop through the sheets, isolate the row where theres source, remove it from the dataframe, reshape the dataframe
repeat the source row equivalent to the non-reshaped dataframe size, append to the df_list and finally concatenate into a single file'''

df_list=[]

for key, df in sheets_dict.items():
    
    print(f'processing sheet {key}')
    #read in the dataframe
    df=sheets_dict[key]

    #clean it from the null rows
    df1 = df.dropna(how='all')

    #get the row and column index where source appear
    source_row_index=df1[df1.isin(['Source', 'source']).any(axis=1)].index[0]
    #column where source appear
    source_col=df.columns[df1.isin(['Source', 'source']).any(axis=0)][0]
    #get the index of source_col
    source_col_index=df1.columns.get_loc(source_col)
    print('Source row and column index are: {source_row_index},{source_col_index}')

    #separating the dataframe between main dataframe and the source dataframe
    main_df=df1.iloc[0:source_row_index-1].reset_index(drop=True)
    source_df=df1.iloc[source_row_index-1 :,source_col_index :].reset_index(drop=True)


    #action plan: melt main dataframe, transpose the source dataframe and then merge together
    #get years columns
    year_columns = [col for col in df.columns if str(col).isdigit()]
    print(f'years columns are: {year_columns}')

    id_variables= list(set(main_df.columns) - set(year_columns))

    df_main_long = pd.melt(main_df, 
                    id_vars=id_variables,     # columns to keep
                    value_vars=year_columns,  # columns to unpivot
                    var_name='Year', 
                    value_name='Value',
                    ignore_index=True)

    #transpose source dataframe
    source_df_T=source_df.T.reset_index()

    cols=['Year','Source']
    #remove the 1st row
    source_df_T=source_df_T.iloc[1:]
    source_df_T.columns=cols

    #merge the 2 dataframes
    merged_data=pd.merge(df_main_long, source_df_T, on='Year')
    merged_data.head(3)
    #####################################################################################################
    
    #go through columns and print out the unique  values
    for col in [c for c in merged_data.columns if c not in ['Source', 'Value']]:
        print(f'for column {col}: {merged_data[col].unique()}')

    print('--------------------------------------------------------------/n/n')


    

processing sheet Education_1_a
Source row and column index are: {source_row_index},{source_col_index}
years columns are: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
for column Country: ['Somalia']
for column Indicator: ['Net enrolment rate in primary education (percent)']
for column Nationality: ['Nationality Total' 'Nationals' 'Non-nationals']
for column Sex: ['Male' 'Female' 'Both sexes']
for column Year: [2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
 2024]
--------------------------------------------------------------/n/n
processing sheet Education_1_b
Source row and column index are: {source_row_index},{source_col_index}
years columns are: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
for column Country: ['Somalia']
for column Indicator: ['Net enrolment rate in secondary education (percent)']
for column Nationality: ['Nationality Total' 'Nationals' 'Non-nationals']
f

### for arabic questionnaire

In [4]:
f_path = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires'
file='Somalia_Education_ar.xlsx'

sheets_dict = pd.read_excel(f_path+'/'+file, sheet_name=None)  # Reads all sheets at once
print(sheets_dict.keys())

dict_keys(['Education_1_a', 'Education_1_b', 'Education_2_a', 'Education_2_b', 'Education_2_c', 'Education_3', 'Education_4', 'Education_5', 'Education_6'])


In [None]:
'''loop through the sheets, isolate the row where theres source, remove it from the dataframe, reshape the dataframe
repeat the source row equivalent to the non-reshaped dataframe size, append to the df_list and finally concatenate into a single file'''

df_list=[]

for key, df in sheets_dict.items():
    
    print(f'processing sheet {key}')
    #read in the dataframe
    df=sheets_dict[key]

    #clean it from the null rows
    df1 = df.dropna(how='all')

    #get the row and column index where source appear
    source_row_index=df1[df1.isin(['المصدر','Source', 'source']).any(axis=1)].index[0]
    #column where source appear
    source_col=df.columns[df1.isin(['المصدر','Source', 'source']).any(axis=0)][0]
    #get the index of source_col
    source_col_index=df1.columns.get_loc(source_col)
    print('Source row and column index are: {source_row_index},{source_col_index}')

    #separating the dataframe between main dataframe and the source dataframe
    main_df=df1.iloc[0:source_row_index-1].reset_index(drop=True)
    source_df=df1.iloc[source_row_index-1 :,source_col_index :].reset_index(drop=True)


    #action plan: melt main dataframe, transpose the source dataframe and then merge together
    #get years columns
    year_columns = [col for col in df.columns if str(col).isdigit()]
    print(f'years columns are: {year_columns}')

    id_variables= list(set(main_df.columns) - set(year_columns))

    df_main_long = pd.melt(main_df, 
                    id_vars=id_variables,     # columns to keep
                    value_vars=year_columns,  # columns to unpivot
                    var_name='Year', 
                    value_name='Value',
                    ignore_index=True)

    #transpose source dataframe
    source_df_T=source_df.T.reset_index()

    cols=['Year','Source']
    #remove the 1st row
    source_df_T=source_df_T.iloc[1:]
    source_df_T.columns=cols

    #merge the 2 dataframes
    merged_data=pd.merge(df_main_long, source_df_T, on='Year')
    merged_data.head(3)
    #####################################################################################################
    
    #go through columns and print out the unique  values
    for col in [c for c in merged_data.columns if c not in ['Source', 'Value']]:
        print(f'for column {col}: {merged_data[col].unique()}')

    print('--------------------------------------------------------------/n/n')

### create a dictionary for translation

In [6]:
df_en=pd.read_excel('C:/Users/511232/Desktop/DSS/Consolidated questionnaires/codes/consolidated_data_en.xlsx')
df_ar=pd.read_excel('C:/Users/511232/Desktop/DSS/Consolidated questionnaires/codes/consolidated_data_ar.xlsx')
df_ar_translated=df_ar.copy()

In [7]:
translation_df=pd.read_excel('translation dict.xlsx')
translation_df.head(2)

Unnamed: 0,col_en,val_en,col_ar,val_ar
0,Indicator,Net enrolment rate in primary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)
1,Indicator,Net enrolment rate in secondary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الثانوي (نسبة مئوية)


In [15]:
#create dictionaries for columns names
col_dict_en_ar=dict(zip(translation_df['col_en'].unique(),translation_df['col_ar'].unique()))
col_dict_ar_en={v:k for k,v in col_dict_en_ar.items()}

In [None]:
#create dictionaries for english to arabic and arabic to english lables for columns
labels_dict_en_ar=translation_df.groupby(['col_en']).apply(lambda x: dict(zip(x['val_en'],x['val_ar']))).to_dict()
labels_dict_ar_en=translation_df.groupby(['col_ar']).apply(lambda x: dict(zip(x['val_ar'],x['val_en']))).to_dict()

### translate arabic to english

In [17]:
df_ar.head(2)

Unnamed: 0,المؤشر,الدولة,المواطنة,الجنس,Year,Value,Source,القطاع,المرحلة التعليمية
0,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية),الصومال,مجموع المواطنين وغير المواطنين,ذكور,2010,1.0,a,,
1,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية),الصومال,مجموع المواطنين وغير المواطنين,إناث,2010,1.0,a,,


In [19]:
df_ar_translated=df_ar.copy()

#translate the values from arabic to english
for col in labels_dict_ar_en.keys():
        try:
                df_ar_translated[col] = df_ar_translated[col].map(labels_dict_ar_en[col])
        except:
                print(f'column {col} not in the dataframe columns ')

#rename columns to english
df_ar_translated = df_ar_translated.rename(columns=col_dict_ar_en)

column السنة not in the dataframe columns 
column العدد not in the dataframe columns 
column المصدر not in the dataframe columns 


In [20]:
df_ar_translated.head(2)

Unnamed: 0,Indicator,Country,Nationality,Sex,Year,Value,Source,Sector,Education level
0,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Male,2010,1.0,a,,
1,Net enrolment rate in primary education (percent),Somalia,Nationality Total,,2010,1.0,a,,


In [27]:
for col in df_ar_translated.columns.difference(['Year', 'Value', 'Source']):
    print(df_ar_translated[col].unique(),'\n')

['Somalia'] 

[nan 'Primary' 'Preparatory' 'Secondary' 'Tertiary' 'All levels'] 

['Net enrolment rate in primary education (percent)'
 'Net enrolment rate in secondary education (percent)'
 'Pupil-teacher ratio (percent)' 'Adult literacy rates'
 'Youth literacy rates'
 'Expenditure on education as a percentage of total government expenditure  (percent)'
 'Government expenditure on education as a percentage of GDP (percent)'] 

['Nationality Total' 'Nationals' 'Non-nationals' nan] 

[nan 'Public' 'Private' 'Sector Total'] 

['Male' nan 'Both sexes'] 



#### check schema with pandera and rapidfuzz to match with possible different column names

In [10]:
import pandas as pd
from rapidfuzz import process
import pandera as pa
from pandera import Column, DataFrameSchema, Check
from pandera.errors import SchemaError
from textwrap import fill

In [92]:
STANDARD_SCHEMA = {
'Indicator': ['indicator','Ind','مؤشر','المؤشر'],
'Country': ['الدولة','البلد','country'],
'Nationality': ['المواطنة','المواطنية','nationality','nationalities'],
'Sex': ['sex','gender','الجنس'],
'Year': ['year','time','Time','سنة','السنة'],
'Value': ['value','number','Number','الرقم','عدد', 'العدد'],
'Comments': ['comments','notes','ملاحظة','المصدر','Source'],
'Sector': ['sector','القطاع'],
'Education level': ['edu level','Education_level','المرحلة التعليمية','التعليم']
}


In [93]:
#map columns to col names using fuzzy match
col_names = {}
for col, aliases in STANDARD_SCHEMA.items():

    match, score, _ = process.extractOne(col, df_ar_translated.columns.tolist() + aliases)
    
    if score > 80:
        print(f'{match} has a score of {score} for column {col}')
        col_names[match] = col

#rename column names according to schema
df_ar_translated=df_ar_translated.rename(col_names)

df_ar_translated.columns

Indicator has a score of 100.0 for column Indicator
Country has a score of 100.0 for column Country
Nationality has a score of 100.0 for column Nationality
Sex has a score of 100.0 for column Sex
Year has a score of 100.0 for column Year
Value has a score of 100.0 for column Value
comments has a score of 87.5 for column Comments
Sector has a score of 100.0 for column Sector
Education level has a score of 100.0 for column Education level


Index(['Indicator', 'Country', 'Nationality', 'Sex', 'Year', 'Value', 'Source', 'Sector', 'Education level'], dtype='object')

#### validate the dataframe column labels

In [94]:
df_ar_translated.dtypes

Indicator           object
Country             object
Nationality         object
Sex                 object
Year                 int64
Value              float64
Source              object
Sector              object
Education level     object
dtype: object

In [28]:
translation_df=pd.read_excel('translation dict.xlsx')
translation_df.head(2)

Unnamed: 0,col_en,val_en,col_ar,val_ar
0,Indicator,Net enrolment rate in primary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الإبتدائي (الأساسي) (نسبة مئوية)
1,Indicator,Net enrolment rate in secondary education (percent),المؤشر,معدل الإلتحاق الصافي في مرحلة التعليم الثانوي (نسبة مئوية)


In [38]:
list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

['Primary', 'Preparatory', 'Secondary', 'Tertiary', 'All levels']

In [43]:
allowed_indicator_values=list(translation_df.loc[translation_df['col_en']=='Indicator', 'val_en'].unique())
allowed_country_values=list(translation_df.loc[translation_df['col_en']=='Country', 'val_en'].unique())
allowed_nationality_values=list(translation_df.loc[translation_df['col_en']=='Nationality', 'val_en'].unique())
allowed_sex_values=list(translation_df.loc[translation_df['col_en']=='Sex', 'val_en'].unique())
allowed_year_values=[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024]
allowed_sector_values=list(translation_df.loc[translation_df['col_en']=='Sector', 'val_en'].unique())
allowed_edulevel_values=list(translation_df.loc[translation_df['col_en']=='Education level', 'val_en'].unique())

#define schema
schema = DataFrameSchema({
    'Indicator': Column(str, checks=Check.isin(allowed_indicator_values)),
    'Country': Column(str, checks=Check.isin(allowed_country_values)),
    'Nationality': Column(str, checks=Check.isin(allowed_nationality_values),nullable=True),
    'Sex': Column(str, checks=Check.isin(allowed_sex_values),nullable=True),
    'Year': Column(int, checks=Check.isin(allowed_year_values)),
    'Sector': Column(str, checks=Check.isin(allowed_sector_values),nullable=True),
    'Education level': Column(str, checks=Check.isin(allowed_edulevel_values),nullable=True)     
})

#validate the dataframe
try:
    validated_df = schema.validate(df_ar_translated)
except SchemaError as err:
    wrapped_error = fill(str(err.args), width=150)
    print(wrapped_error)

("Column 'Indicator' failed element-wise validator number 0: isin(['Net enrolment rate in primary education (percent)', 'Net enrolment rate in
secondary education (percent)', 'Pupil-teacher ratio (percent)', 'Adult literacy rates', 'Youth literacy rates', 'Expenditure on education as a
percentage of total government expenditure  (percent)', 'Government expenditure on education as a percentage of GDP (percent)']) failure cases: wrong
ind",)


In [42]:
df_ar_translated.loc[0,'Indicator']='wrong ind'
df_ar_translated.head()

Unnamed: 0,Indicator,Country,Nationality,Sex,Year,Value,Source,Sector,Education level
0,wrong ind,Somalia,Nationality Total,Male,2010,1.0,a,,
1,Net enrolment rate in primary education (percent),Somalia,Nationality Total,,2010,1.0,a,,
2,Net enrolment rate in primary education (percent),Somalia,Nationality Total,Both sexes,2010,1.0,a,,
3,Net enrolment rate in primary education (percent),Somalia,Nationals,Male,2010,1.0,a,,
4,Net enrolment rate in primary education (percent),Somalia,Nationals,,2010,1.0,a,,
