In [2]:
import pandas as pd
import numpy as np


# Notebook-wide pandas display settings: render DataFrames in full rather
# than in pandas' abbreviated default view.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,   # show every column
    'display.max_rows': None,      # show every row
    'display.max_colwidth': None,  # never truncate cell contents
    'display.width': 1000,         # wide console so wide frames do not wrap
}

for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [41]:
# Folder and file name of the questionnaire workbook.
# NOTE(review): f_path is an absolute, machine-specific location; adjust it
# when running the notebook elsewhere.
f_path = 'C:/Users/511232/Desktop/DSS/Consolidated questionnaires'
file = 'Somalia_Education_en.xlsx'

# sheet_name=None makes read_excel load every sheet at once and return a
# dict keyed by sheet name, with one DataFrame per sheet.
workbook_path = f'{f_path}/{file}'
sheets_dict = pd.read_excel(workbook_path, sheet_name=None)
print(sheets_dict.keys())

dict_keys(['Education_1_a', 'Education_1_b', 'Education_2_a', 'Education_2_b', 'Education_2_c', 'Education_3', 'Education_4', 'Education_5', 'Education_6'])


In [None]:
# For every sheet of the workbook: isolate the row that holds the per-year
# "source" notes, remove it from the data, reshape the remaining rows from
# wide (one column per year) to long (one row per record and year), attach
# the matching source to each long row, and collect the result in df_list
# so the per-sheet frames can be concatenated afterwards.


def _sheet_to_long(sheet_df, sheet_name=''):
    """Reshape one questionnaire sheet from wide to long format.

    Parameters
    ----------
    sheet_df : pandas.DataFrame
        Raw sheet as returned by ``pd.read_excel``.
    sheet_name : str, optional
        Only used in the message printed when no source row is found.

    Returns
    -------
    pandas.DataFrame
        All non-year columns of the sheet, followed by ``Year``, ``Value``
        and ``Source``. ``Source`` is NaN for every row when the sheet has
        no row containing the text 'source'.
    """
    # Drop rows that are completely empty.
    non_empty = sheet_df.dropna(how='all')

    # Flag rows where any cell contains 'source' (case-insensitive).
    # DataFrame.apply hands the lambda one *column* at a time; casting to
    # str first lets the text search run on numeric and NaN cells too.
    is_source_row = non_empty.apply(
        lambda col: col.astype(str).str.contains('source', case=False)
    ).any(axis=1)
    source_rows = non_empty[is_source_row]
    data_rows = non_empty[~is_source_row]

    # Year columns are the ones whose label consists only of digits.
    year_cols = [c for c in data_rows.columns if str(c).isdigit()]
    print(f'years columns: {year_cols}')

    # Wide -> long: one output row per (original row, year column).
    long_df = data_rows.melt(
        id_vars=[c for c in data_rows.columns if c not in year_cols],
        value_vars=year_cols,
        var_name='Year',
        value_name='Value',
    )

    if source_rows.empty:
        # Without this guard, .iloc[0] below would raise IndexError and
        # abort the processing of every remaining sheet.
        print(f'warning: no source row found in sheet {sheet_name}; Source left empty')
        source_by_year = {}
    else:
        # Every matching row is removed from the data, but only the first
        # one supplies the source values.
        source_by_year = source_rows[year_cols].iloc[0].to_dict()

    # Look the source up by year label instead of relying on the row order
    # that melt happens to produce.
    long_df['Source'] = long_df['Year'].map(source_by_year)
    return long_df


df_list = []

for key, df in sheets_dict.items():
    print(f'processing sheet {key}')
    df_final = _sheet_to_long(df, sheet_name=key)
    print('finished processing')
    print(df_final.head(3), '\n')

    # Keep the per-sheet result for the later concatenation.
    df_list.append(df_final)
