In [23]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [24]:
# Collect the names of the HSA folders: every non-hidden directory directly under ../data
data_path = Path.cwd().parent.joinpath('data')
HSAs = [
    entry.name
    for entry in data_path.iterdir()
    if entry.is_dir()
    if not entry.name.startswith('.')
]

In [25]:
# Transpose and concat socioeconomic data for every subregion and HSA. Add columns - "Region" and "Sub Region".
# Every subregion CSV is reshaped into a single-row frame (one column per indicator).
# The rows are collected in a list and concatenated once at the end: growing the frame
# with pd.concat inside the loop re-copies all accumulated rows on each iteration and
# concatenates an empty frame on the first pass, which recent pandas versions deprecate.
subregion_rows = []
for region in HSAs:
    region_path = data_path/region
    for subregion_path in tqdm(region_path.glob("*.csv")):
        # Sub region name = file name up to its first "." (names containing extra dots are truncated).
        subregion = subregion_path.name.split(".")[0]
        # The last row of each file is discarded (presumably a footer/summary row - TODO confirm).
        subregion_df = pd.read_csv(subregion_path,index_col=False).iloc[:-1]

        # Keep only the indicator name and its value for this subregion.
        subregion_df = subregion_df.rename(columns={' Community Health Service Area_value':"Value"})
        subregion_df = subregion_df[['Indicator','Value']].reset_index(drop=True)

        # After transposing, row 0 holds the indicator names and row 1 the values:
        # promote row 0 to the header and keep only the values row.
        subregion_trans_df = subregion_df.transpose()
        subregion_trans_df.columns = subregion_trans_df.iloc[0]
        subregion_trans_df = subregion_trans_df.drop(subregion_trans_df.index[0]).reset_index(drop=True)

        subregion_trans_df['Region'] = region
        subregion_trans_df['Sub Region'] = subregion

        # pd.concat cannot align frames with repeated column labels, so keep the
        # first occurrence of any indicator that is listed more than once.
        subregion_trans_df = subregion_trans_df.loc[:,~subregion_trans_df.columns.duplicated()]

        subregion_rows.append(subregion_trans_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found.
if subregion_rows:
    socioeco_df = pd.concat(subregion_rows,axis=0,ignore_index=True)
    # The per-file frames carry 'Indicator' as their column-axis name; clear it so the
    # compiled frame has plain column labels.
    socioeco_df.columns.name = None
else:
    socioeco_df = pd.DataFrame()

42it [00:00, 46.62it/s]
43it [00:00, 47.80it/s]
28it [00:00, 45.00it/s]
43it [00:01, 37.93it/s]
29it [00:01, 28.24it/s]


In [27]:
# Make sure the "processed" folder exists next to the data folder (no error if it is already there)
processed_data_path = data_path.parent.joinpath('processed')
processed_data_path.mkdir(parents=False, exist_ok=True)

In [28]:
# Load the header-less CSV that pairs each indicator name ('col') with its column name ('red_col')
map_cols = pd.read_csv(
    processed_data_path / 'map_col_names.csv',
    header=None,
    names=['col', 'red_col'],
)

In [29]:
# Build a lookup dictionary from the mapper rows: first column -> second column
col2red_col = {
    indicator_name: column_name
    for indicator_name, column_name in map_cols.to_numpy()
}

In [30]:
# Rename the indicator columns to the column names given in the dictionary.
# rename() leaves any label without an entry in the mapping unchanged, whereas
# Index.map() would replace each such label with NaN (this would affect 'Region' and
# 'Sub Region' too if the mapping file does not list them - verify against the file).
# It also makes the cell safe to re-run: already-renamed labels are left as they are
# unless they are themselves keys of the mapping.
socioeco_df = socioeco_df.rename(columns=col2red_col)

In [32]:
# Write the compiled socioeconomic table to the processed folder, without the row index
socioeco_df.to_csv(
    processed_data_path.joinpath('socioeco_compiled.csv'),
    index=False,
)