In [1]:
import pandas as pd
import numpy as np
from pypdf import PdfReader
import re
import os

In [2]:
# Source documents: the Indonesian-language report (used for the table rows)
# and the English-language report (used for the table titles).
reader = PdfReader('./reference/Indonesian_health_survey_2023_indonesian_language.pdf')
reader_english = PdfReader('./reference/Indonesian_health_survey_2023_english_language.pdf')

# Province name as it may be spelled in the extracted text -> numeric code.
# Several spellings of one province share a code; 'INDONESIA' (the national
# total row) is given code 0.
province_code_mapping = {
    # codes 11-21
    "Aceh": 11,
    "Sumatera Utara": 12,
    "Sumatra Utara": 12,
    "Sumatera Barat": 13,
    "Sumatra Barat": 13,
    "Riau": 14,
    "Jambi": 15,
    "Sumatera Selatan": 16,
    "Sumatra Selatan": 16,
    "Bengkulu": 17,
    "Lampung": 18,
    "Kep.Bangka Belitung": 19,
    "Kep. Bangka Belitung": 19,
    "Bangka Belitung": 19,
    "Kepulauan Riau": 21,
    # codes 31-36
    "DKI Jakarta": 31,
    "Jawa Barat": 32,
    "Jawa Tengah": 33,
    "DI Yogyakarta": 34,
    "Jawa Timur": 35,
    "Banten": 36,
    # codes 51-53
    "Bali": 51,
    "Nusa Tenggara Barat": 52,
    "Nusa Tenggara Timur": 53,
    # codes 61-65
    "Kalimantan Barat": 61,
    "Kalimantan Tengah": 62,
    "Kalimantan Selatan": 63,
    "Kalimantan Timur": 64,
    "Kalimantan Utara": 65,
    # codes 71-76
    "Sulawesi Utara": 71,
    "Sulawesi Tengah": 72,
    "Sulawesi Selatan": 73,
    "Sulawesi Tenggara": 74,
    "Gorontalo": 75,
    "Sulawesi Barat": 76,
    # codes 81-82
    "Maluku Utara": 81,
    "Maluku": 82,
    # codes 91-96
    "Papua Barat Daya": 96,
    "Papua Barat": 92,
    "Papua Tengah": 94,
    "Papua Pegunungan": 95,
    "Papua Selatan": 93,
    "Papua": 91,
    # national total row
    "INDONESIA": 0,
}

# Numeric code -> the single spelling written to the output tables.
province_name_cleaning = {
    11: "Aceh",
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kep. Bangka Belitung",
    21: "Kepulauan Riau",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    65: "Kalimantan Utara",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
    76: "Sulawesi Barat",
    81: "Maluku Utara",
    82: "Maluku",
    96: "Papua Barat Daya",
    92: "Papua Barat",
    94: "Papua Tengah",
    95: "Papua Pegunungan",
    93: "Papua Selatan",
    91: "Papua",
    0: "INDONESIA",
}

# Names tried, in this order, against the start of each extracted row.
# Order matters: the row parsers take the FIRST entry a line starts with, so a
# longer name must come before any name that is its prefix
# ('Papua Barat Daya' -> 'Papua Barat' -> 'Papua', 'Maluku Utara' -> 'Maluku').
province_list = [
    "Aceh",
    "Sumatera Utara",
    "Sumatera Barat",
    "Sumatra Utara",
    "Sumatra Barat",
    "Riau",
    "Jambi",
    "Sumatera Selatan",
    "Sumatra Selatan",
    "Bengkulu",
    "Lampung",
    "Kep.Bangka Belitung",
    "Kep. Bangka Belitung",
    "Bangka Belitung",
    "Kepulauan Riau",
    "DKI Jakarta",
    "Jawa Barat",
    "Jawa Tengah",
    "DI Yogyakarta",
    "Jawa Timur",
    "Banten",
    "Bali",
    "Nusa Tenggara Barat",
    "Nusa Tenggara Timur",
    "Kalimantan Barat",
    "Kalimantan Tengah",
    "Kalimantan Selatan",
    "Kalimantan Timur",
    "Kalimantan Utara",
    "Sulawesi Utara",
    "Sulawesi Tengah",
    "Sulawesi Selatan",
    "Sulawesi Tenggara",
    "Gorontalo",
    "Sulawesi Barat",
    "Maluku Utara",
    "Maluku",
    "Papua Barat Daya",
    "Papua Barat",
    "Papua Tengah",
    "Papua Pegunungan",
    "Papua Selatan",
    "Papua",
    "INDONESIA",
]

def extract_filename(page_file):
    """Build a file-name-friendly table title from the English-language report.

    The text of page ``page_file - 1`` of ``reader_english`` is split into
    lines; the lines from the first one mentioning "Table"/"Tabel" up to and
    including the first one containing "SKI" are joined with spaces, and a
    handful of characters are rewritten ("/" -> " or ", "&" -> "and",
    ">" -> "more than", "<" -> "less than", stray dots/spaces in the table
    number removed).

    Args:
        page_file: Page number used to index ``reader_english.pages`` (minus
            one) and to look up a hand-written replacement title.

    Returns:
        str: The cleaned title, or the hand-written title when ``page_file``
        has one. Empty when the page has no "Table"/"Tabel" line and no
        hand-written title.
    """
    text_lines = reader_english.pages[page_file - 1].extract_text().splitlines()

    # Discard whatever precedes the first caption line (if there is one).
    caption_start = next(
        (pos for pos, text in enumerate(text_lines) if "Table" in text or "Tabel" in text),
        None,
    )
    if caption_start is not None:
        text_lines = text_lines[caption_start:]
    text_lines = [text.replace("Tabel", "Table") for text in text_lines]

    # Gather the caption: from the first "Table" line through the "SKI" line.
    title_parts = []
    inside_title = False
    for text in text_lines:
        inside_title = inside_title or "Table" in text
        if inside_title:
            title_parts.append(text.strip())
        if "SKI" in text:
            break

    title = " ".join(part for part in title_parts if part)

    # Applied strictly in this order; later patterns rely on earlier results.
    substitutions = (
        (r'/', ' or '),
        (r'(\bTable \d+\.\d+)\.', r'\1'),
        (r'&', 'and'),
        (r'(Table \d\.)\s', r'\1'),
        (r'>', 'more than'),
        (r'<', 'less than'),
        (r'Table (\d+)\.\s*(\d+)', r'Table \1.\2'),
    )
    for pattern, replacement in substitutions:
        title = re.sub(pattern, replacement, title)

    # Hand-written titles that take precedence over the extracted one.
    manual_titles = {
        362: 'Table 9.5 Proportion of Dental and Oral Problems in the last 1 year aged ≥ 3 who Received Treatment from Health Workers by Province, 2023 SKI',
        391: 'Table 10.3 Proportion of Respondents who Received Medicine Information from Health Workers and Types of Medicine Information Obtained by Province, 2023 SKI',
        394: 'Table 10.5 Proportion of Respondents who knew the Medicines Classification Purchased by Province, 2023 SKI'
    }
    return manual_titles.get(page_file, title)

def extract_province(save_filename, page_file, column_name):
    """Parse one per-province table and save it as ``./datasets/<save_filename>.csv``.

    The parsing is delegated to ``show_province`` (defined in this same cell),
    which previously was duplicated here line for line; this function only
    adds the CSV export.

    Args:
        save_filename: File name (without extension) for the CSV.
        page_file: Index into ``reader.pages`` of the page holding the table.
        column_name: Column names for the parsed rows; must include
            ``'province'``.

    Returns:
        The return value of ``DataFrame.to_csv`` (``None`` when writing to a
        path).
    """
    df = show_province(page_file, column_name)

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = "./datasets"
    os.makedirs(output_dir, exist_ok=True)

    filename_save = f"{output_dir}/{save_filename}.csv"
    return df.to_csv(filename_save, index=False)

# ----- function province to show data ---- 

# Regex clean-ups applied, in this order, to every extracted table line.
_LINE_FIXES = (
    (r'(\d),(?=\s|\D)', r'\1'),        # drop a comma that follows a digit but precedes a non-digit
    (r'\s{2,}', ' '),                  # collapse runs of whitespace
    (r'(?<!\s)-|-(?!\s)', ' - '),      # put spaces around a dash glued to a neighbour
    (r',\s*-', ' -'),                  # drop a comma sitting right before a dash
    (r"(?<=\d,\d),\d*", ""),           # drop a second ",ddd" group after "d,d"
)


def _province_table_lines(page_text):
    """Return the stripped lines from the first containing 'Aceh' through the first containing 'INDONESIA'."""
    collected = []
    started = False
    for raw in page_text.splitlines():
        if "Aceh" in raw:
            started = True
        if started:
            collected.append(raw.strip())
        if "INDONESIA" in raw:
            break
    return collected


def _clean_table_line(line, page_file):
    """Apply the generic regex clean-ups plus the page-specific repairs to one line."""
    for pattern, replacement in _LINE_FIXES:
        line = re.sub(pattern, replacement, line)

    # Two numbers fused together ("d,dd,d"): split them with a dash separator.
    if re.search(r'\b\d{1,3},\d{1,3},\d{1,3}\b', line):
        return re.sub(r'(\d{1,3},\d{1,1})(\d{1,3},\d{1,3})', r'\1 - \2', line)
    # Hard-coded repair for the DKI Jakarta row of page 116
    # (presumably values missing from the extracted text -- verify against the PDF).
    if page_file == 116 and re.match(r'\bDKI Jakarta\b', line):
        return re.sub(r'0,0 - 2,0', '0,0 0,0 - 0,0 2,0', line)
    return line


def _to_number(values):
    """Convert a column of number strings to float (decimal comma) or integer.

    "NA" cells become missing values. A column holding at least one comma is
    parsed as decimal-comma floats; otherwise dots are removed (thousands
    separators) and the column becomes ``int`` -- or nullable ``Int64`` when
    it has missing cells, which previously raised.
    """
    values = values.replace("NA", np.nan)
    if values.str.contains(",", regex=False, na=False).any():
        return values.str.replace(",", ".", regex=False).astype(float)

    digits = values.str.replace(".", "", regex=False)
    if digits.isna().any():
        return digits.astype(float).astype("Int64")
    return digits.astype(int)


def show_province(page_file,column_name):
    """Parse the per-province table on one page of ``reader`` into a DataFrame.

    Lines between the 'Aceh' row and the 'INDONESIA' row (inclusive) are
    cleaned, matched against ``province_list`` by prefix, split on whitespace
    and converted to numbers. Lines that start with no known province name are
    dropped.

    Args:
        page_file: Index into ``reader.pages``. Pages 116 and 248 receive
            hard-coded repairs.
        column_name: Column names for the parsed rows, ``'province'`` first;
            its length must match the number of values found per row.

    Returns:
        pandas.DataFrame: ``province_code`` followed by the columns of
        ``column_name``, with ``province`` rewritten to the spelling in
        ``province_name_cleaning``.

    Raises:
        ValueError: From pandas when the rows do not fit ``column_name``, or
            when a cell is not numeric.
    """
    page_text = reader.pages[page_file].extract_text()
    cleaned = [_clean_table_line(line, page_file) for line in _province_table_lines(page_text)]

    rows = []
    for line in cleaned:
        # First entry of province_list the line starts with (list order resolves prefixes).
        province = next((name for name in province_list if line.startswith(name)), None)
        if not province:
            continue
        tokens = [
            re.sub(r'[^a-zA-Z0-9\s.,]', '', token)
            for token in line[len(province):].strip().split()
        ]
        # Tokens that were pure punctuation (e.g. the CI dash) are now empty: drop them.
        rows.append([province] + [token for token in tokens if token != ''])

    df = pd.DataFrame(rows, columns=column_name)

    # Page 248 uses '.' where the other pages use ','; normalise all but the
    # first and last columns so the numeric conversion below treats them alike.
    if page_file == 248:
        for col in column_name[1:-1]:
            df[col] = df[col].str.replace('.', ',', regex=False)

    for col in column_name:
        if col == 'province':
            df[col] = df[col].astype(str)
        else:
            df[col] = _to_number(df[col])

    # Attach the numeric code, then normalise the province spelling through it.
    df['province_code'] = df['province'].map(province_code_mapping)
    df['province'] = df['province_code'].map(province_name_cleaning)

    # province_code goes first, remaining columns keep their order.
    ordered = ['province_code'] + [col for col in df.columns if col != 'province_code']
    return df[ordered]


In [3]:
province_dict = {
    49: ['province','census_block_target','census_block_visited','census_block_response_rate','household_target','household_visited','household_response_rate','household_member_visited','household_member_interview_measurement'],
    50: ['province','census_block_target','census_block_visited','census_block_response_rate','household_target','household_visited','household_response_rate','household_member_visited','household_member_examined'],
    51: ['province','census_block_target','census_block_visited','census_block_response_rate','household_target','household_visited','household_response_rate','household_member_visited','household_member_measured'],
    55: ['province','lowes_q1','lower_middle_q2','middle_q3','upper_middle_q4','highest_q5'],
    59: ['province','in_district_or_city','nearest_district_or_city','none','unaware','weighted_n'],
    61: ['province','easy_access','easy_access_95%_CI_lower','easy_access_95%_CI_upper','difficult_access','difficult_access_95%_CI_lower','difficult_access_95%_CI_upper','very_difficult_access','very_difficult_access_95%_CI_lower','very_difficult_access_95%_CI_upper','weighted_n'],
    63: ['province','in_district_or_city','nearest_district_or_city','none','unaware','weighted_n'],
    65: ['province','easy_access','easy_access_95%_CI_lower','easy_access_95%_CI_upper','difficult_access','difficult_access_95%_CI_lower','difficult_access_95%_CI_upper','very_difficult_access','very_difficult_access_95%_CI_lower','very_difficult_access_95%_CI_upper','weighted_n'],
    67: ['province','in_district_or_city','nearest_district_or_city','none','unaware','weighted_n'],
    69: ['province','easy_access','easy_access_95%_CI_lower','easy_access_95%_CI_upper','difficult_access','difficult_access_95%_CI_lower','difficult_access_95%_CI_upper','very_difficult_access','very_difficult_access_95%_CI_lower','very_difficult_access_95%_CI_upper','weighted_n'],
    71: ['province','in_district_or_city','nearest_district_or_city','none','unaware','weighted_n'],
    73: ['province','easy_access','easy_access_95%_CI_lower','easy_access_95%_CI_upper','difficult_access','difficult_access_95%_CI_lower','difficult_access_95%_CI_upper','very_difficult_access','very_difficult_access_95%_CI_lower','very_difficult_access_95%_CI_upper','weighted_n'],
    75: ['province','puskesmas','weighted_n_puskesmas','independent_clinic','weighted_n_independent_clinic','lab','weighted_n_lab','hospital','weighted_n_hospital'],
    77: ['province','puskesmas','independent_clinic','hospital','none','weighted_n'],
    80: ['province','easy_access','comprehensive_facilities','free_service_costs','communicative_health_workers','comfortable_room','quick_precise_accurate_services','services_meet_expectation','weighted_n'],
    82: ['province','easy_access','comprehensive_facilities','free_service_costs','communicative_health_workers','comfortable_room','quick_precise_accurate_services','services_meet_expectation','weighted_n'],
    84: ['province','easy_access','comprehensive_facilities','free_service_costs','communicative_health_workers','comfortable_room','quick_precise_accurate_services','services_meet_expectation','weighted_n'],
    88: ['province','puskesmas','independent_clinic','lab','hospital','weighted_n'],
    90: ['province','puskesmas','independent_clinic','lab','hospital','weighted_n'],
    93: ['province','ukbm','ukbm_95%_CI_lower','ukbm_95%_CI_upper','public_place','public_place_95%_CI_lower','public_place_95%_CI_upper','others','others_95%_CI_lower','others_95%_CI_upper','weighted_n'],
    95: ['province''ukbm''public_place''others''weighted_n'],
    97: ['province''yes_have_accessed''never_accessed''weighted_n'],
    99: ['province''yes_have_accessed''never_accessed''weighted_n'],
    102: ['province','bpjs_pbi','bpjs_non_pbi','jamkesda','private_insurance','others','bpjs_non_pbi_and_private_insurance','others_combinations','no_health_insurance','weighted_n'],
    105: ['province''have_utilized''never_utilized''weighted_n'],
    107: ['province','registration','healt_info_and_education','clinical_consultation','telepharmacy_and_preparation_lab_or_radiology','weighted_n'],
    110: ['province','bottled_water','refiled_water','piped_water','pump_well','protected_dug_well','unprotected_dug_well','protected_spring','unprotected_spring','rainwater_harvesting','surface_water','water_hydrant','water_terminal','purchased_retail_water','weighted_n'],
    112: ['province','in_house','within_premise','outside_premise','weighted_n'],
    114: ['province','less_than_equal_30_minute','more_than_30_minute','weighted_n'],
    116: ['province','no_access','no_access_95%_CI_lower','no_access_95%_CI_upper','inadequate_access','inadequate_access_95%_CI_lower','inadequate_access_95%_CI_upper','adequate_limited_access','adequate_limited_access_95%_CI_lower','adequate_limited_access_95%_CI_upper','adequate_basic_access','adequate_basic_access_95%_CI_lower','adequate_basic_access_95%_CI_upper','weighted_n'],
    118: ['province','turbid','colored','has_taste','foamy','smelly','weighted_n'],
    120: ['province','boiled','filtered','treated_with_alum','treated_with_clorine','uv_light','treated_with_lime','weighted_n'],
    122: ['province','covered_pot','uncovered_pot','container_with_small_open','gallon','others','weighted_n'],
    126: ['province','available_use_household_only','available_use_household_share','available_use_public','available_not_use','no_facility','weighted_n'],
    128: ['province','in_house','within_premise','outside_premise','weighted_n'],
    130: ['province''goose_neck''dammed_up_without_cover''weighted_n'],
    132: ['province','ipal','septic_tank','tradiotional_pit','covered_pit','uncovered_pit','lake_pond_river','field_or_garden','others','weighted_n'],
    134: ['province''from_1_to_6_times''more_than_6_times''weighted_n'],
    136: ['province','open_defecating','open_defecating_95%_CI_lower','open_defecating_95%_CI_upper','closed_defecating','closed_defecating_95%_CI_lower','closed_defecating_95%_CI_upper','inadequate_access','inadequate_access_95%_CI_lower','inadequate_access_95%_CI_upper','shared_inadequate_access','shared_inadequate_access_95%_CI_lower','shared_inadequate_access_95%_CI_upper','individual_adequate_access','individual_adequate_access_95%_CI_lower','individual_adequate_access_95%_CI_upper','safe_access','safe_access_95%_CI_lower','safe_access_95%_CI_upper','weighted_n'],
    139: ['province','closed_containment','open_containment','no_containment','direct_discharge_to_rivers','weighted_n'],
    141: ['province','closed_containment','open_containment','no_containment','direct_discharge_to_rivers','weighted_n'],
    143: ['province','yes','yes_95%_CI_lower','yes_95%_CI_upper','no','no_95%_CI_lower','no_95%_CI_upper','weighted_n'],
    146: ['province','closed_waste_containers_only','open_waste_containers_only','no_waste_containers','both_open_and_closed','weighted_n'],
    148: ['province','collected_by_personel','disposed_at_MRF','recycled','composted','deposited_at_waste_banks','disposed_in_rivers','burned','buried','disposed_improperly','weighted_n'],
    150: ['province','good','good_95%_CI_lower','good_95%_CI_upper','poor','poor_95%_CI_lower','poor_95%_CI_upper','weighted_n'],
    153: ['province','available_inside_house','available_outside_house','no_handwashing_facility','not_allowed_to_observe','weighted_n'],
    155: ['province','sink_tap','bucket_barrel_with_spout','bucket_barrel_with_water_dipper','others','weighted_n'],
    157: ['province','running_water','no_running_water','no_water','weighted_n'],
    159: ['province''yes''no''weighted_n'],
    161: ['province','no_access','no_access_95%_CI_lower','no_access_95%_CI_upper','limited_access','limited_access_95%_CI_lower','limited_access_95%_CI_upper','basic_hygiene_access','basic_hygiene_access_95%_CI_lower','basic_hygiene_access_95%_CI_upper','weighted_n'],
    164: ['province','concrete_or_asphalt','clay_tile','asbestos','zinc','bamboo','wood_shingles','straw_leaves_palm','other','proper_roof','weighted_n'],
    166: ['province','concrete_grc_board','gypsum','asbestos','wood_polywood_or_bamboo','pcv','others','none','weighted_n'],
    168: ['province','marble_granite','ceramic','parquet_vinyl_carpet','tiles_terazzo','wood_board','cement_red_brick','bamboo','soil','others','proper_flooring','weighted_n'],
    170: ['province','wall','plastered_woven_bamboo_wire','wood_board','bamboo_woven_bamboo','others','proper_wall','weighted_n'],
    172: ['province','living_space_more_than_7m_square_percapita','safe_drinking_water','adequate_sanitation','adequate_house_durability','adequate_housing','adequate_housing_95%_CI_lower','adequate_housing_95%_CI_upper','weighted_n'],
    174: ['province','springkling_larvicide_powder_in_water_storage','installing_mosquito_screens_on_home_ventilation','draining_bathub_buckets','covering_household_water_storage','eliminating_used_items','weighted_n'],
    176: ['province','no_3m_or_3m_plus_efforts','no_3m_or_3m_plus_efforts_95%_CI_lower','no_3m_or_3m_plus_efforts_95%_CI_upper','3m_efforts','3m_efforts_95%_CI_lower','3m_efforts_95%_CI_upper','3m_plus_efforts','3m_plus_efforts_95%_CI_lower','3m_plus_efforts_95%_CI_upper','weighted_n'],
    178: ['province','less_than_once_a_week','once_in_a_week','once_to_twice_in_month','n_a','weighted_n'],
    182: ['province','symptoms','symptoms_95%_CI_lower','symptoms_95%_CI_upper','symptoms_plus_diagnosis','symptoms_plus_diagnosis_95%_CI_lower','symptoms_plus_diagnosis_95%_CI_upper','weighted_n'],
    185: ['province','depression','depression_95%_CI_lower','depression_95%_CI_upper','weighted_n'],
    188: ['province','mental_health_issues','mental_health_issues_95%_CI_lower','mental_health_issues_95%_CI_upper','weighted_n'],
    190: ['province','having_suicidal_thoughts','having_suicidal_thoughts_95%_CI_lower','having_suicidal_thoughts_95%_CI_upper','weighted_n'],
    195: ['province','depress_receiving_treatment','depress_receiving_treatment_95%_CI_lower','depress_receiving_treatment_95%_CI_upper','weighted_n'],
    199: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    201: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    204: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    206: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    209: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    211: ['province','diagnosis','diagnosis_95%_CI_lower','diagnosis_95%_CI_upper','diagnosis_or_symptoms','diagnosis_or_symptoms_95%_CI_lower','diagnosis_or_symptoms_95%_CI_upper','weighted_n'],
    213: ['province','prevalence_pulmonary_tb','prevalence_pulmonary_tb_95%_CI_lower','prevalence_pulmonary_tb_95%_CI_upper','weighted_n'],
    220: ['province','contacts_pulmonary_tb','contacts_pulmonary_tb_95%_CI_lower','contacts_pulmonary_tb_95%_CI_upper','weighted_n'],
    225: ['province','prevalence_hepatitis','prevalence_hepatitis_95%_CI_lower','prevalence_hepatitis_95%_CI_upper','weighted_n'],
    232: ['province','prevalence_malaria','prevalence_malaria_95%_CI_lower','prevalence_malaria_95%_CI_upper','weigted_n_malaria','treat_ACT_1day','treat_ACT_14day','others','weighted_n_treatment'],
    239: ['province','prevalence_dhf','prevalence_dhf_95%_CI_lower','prevalence_dhf_95%_CI_upper','weighted_n'],
    242: ['province','prevalence_faliariasis','prevalence_faliariasis_95%_CI_lower','prevalence_faliariasis_95%_CI_upper','weighted_n'],
    244: ['province','receive_mda_faliariasis','receive_mda_faliariasis_95%_CI_lower','receive_mda_faliariasis_95%_CI_upper','weighted_n'],
    246: ['province','take_medicine_faliariasis','take_medicine_faliariasis_95%_CI_lower','take_medicine_faliariasis_95%_CI_upper','weighted_n'],
    248: ['province','didnt_feel_sick','afraid_side_effect','cant_take_large_pils','curently_iil','forgot','weighted_n'],
    250: ['province','currently_pregnant','not_born_yet','not_present','not_given','unaware_distribution','afraid','refused','weighted_n'],
    262: ['province','asthma_based_on_doctor_diagnosis','asthma_based_on_doctor_diagnosis_95%_CI_lower','asthma_based_on_doctor_diagnosis_95%_CI_upper','weighted_n'],
    264: ['province','asthma_recurrence_in_12_month','asthma_recurrence_in_12_month_95%_CI_lower','asthma_recurrence_in_12_month_95%_CI_upper','weighted_n'],
    267: ['province','cancer_based_doctor_diagnosis','cancer_based_doctor_diagnosis_95%_CI_lower','cancer_based_doctor_diagnosis_95%_CI_upper','weighted_n'],
    273: ['province','diabetes_mellitus_based_doctor_diagnosis','diabetes_mellitus_based_doctor_diagnosis_95%_CI_lower','diabetes_mellitus_based_doctor_diagnosis_95%_CI_upper','weighted_n'],
    275: ['province','diabetes_mellitus_based_doctor_diagnosis','diabetes_mellitus_based_doctor_diagnosis_95%_CI_lower','diabetes_mellitus_based_doctor_diagnosis_95%_CI_upper','weighted_n'],
    279: ['province','diabetes_type_1','diabetes_type_2','gestational_diabetes','unknown','weighted_n'],
    281: ['province','anti_dm_medication_by_medical_personel','anti_dm_medication_by_self_purchase','insulin_injection','combination_anti_dm_and_injection','no_need_dm_medication_yet','weighted_n'],
    283: ['province','receive_dm_treatment_education','receive_dm_treatment_education_95%_CI_lower','receive_dm_treatment_education_95%_CI_upper','adherence_dm_treatment','adherence_dm_treatment_95%_CI_lower','adherence_dm_treatment_95%_CI_upper','weighted_n'],
    286: ['province','diet_management','exercise','herbal_alternatives','weighted_n'],
    288: ['province','regular_control','occasional_control','no_control_visit','weighted_n'],
    291: ['province','frequent_hunger','frequent_thirst','frequent_unarination','weight_loss','weighted_n'],
    293: ['province','heart_disease_by_doctor_diagnosis','heart_disease_by_doctor_diagnosis_95%_CI_lower','heart_disease_by_doctor_diagnosis_95%_CI_upper','weighted_n'],
    301: ['province','hypertension_by_doctor_diagnosis','hypertension_by_doctor_diagnosis_95%_CI_lower','hypertension_by_doctor_diagnosis_95%_CI_upper','weighted_n_diagnosis','hypertension_by_measurement','hypertension_by_measurement_95%_CI_lower','hypertension_by_measurement_95%_CI_upper','weighted_n'],
    303: ['province','hypertension_by_doctor_diagnosis','hypertension_by_doctor_diagnosis_95%_CI_lower','hypertension_by_doctor_diagnosis_95%_CI_upper','weighted_n_diagnosis','hypertension_by_measurement','hypertension_by_measurement_95%_CI_lower','hypertension_by_measurement_95%_CI_upper','weighted_n'],
    305: ['province','receive_hypertension_treatment_and_education','taking_anti_hypertension_regularly','taking_anti_hypertension_irregularly','taking_no_medication_anti_hypertension','weighted_n'],
    307: ['province','feeling_weel','no_medicine','cannot_tolerate_side_effect','taking_traditional_medicine','bored_lazy_forgot','taken_only_during_pregnancy','others','weighted_n'],
    309: ['province','regularly_hypertension_control','occasionally_hypertension_control','disobeyed_hypertension_control','weighted_n'],
    311: ['province','controlled_hypertension','controlled_hypertension_95%_CI_lower','controlled_hypertension_95%_CI_upper','weighted_n'],
    314: ['province','stroke_based_doctor_diagnosis','stroke_based_doctor_diagnosis_95%_CI_lower','stroke_based_doctor_diagnosis_95%_CI_upper','weighted_n'],
    318: ['province','ckd_based_doctor_diagnosis','ckd_based_doctor_diagnosis_95%_CI_lower','ckd_based_doctor_diagnosis_95%_CI_upper','weighted_n'],
    324: ['province','disabilities','disabilities_95%_CI_lower','disabilities_95%_CI_upper','weighted_n'],
    327: ['province','vision_impairment','vision_impairment_95%_CI_lower','vision_impairment_95%_CI_upper','hearing_loss','hearing_loss_95%_CI_lower','hearing_loss_95%_CI_upper','gait_disorder','gait_disorder_95%_CI_lower','gait_disorder_95%_CI_upper','weighted_n'],
    329: ['province','visual_aids','visual_aids_95%_CI_lower','visual_aids_95%_CI_upper','weighted_n_visual_aids','hearing_aids','hearing_aids_95%_CI_lower','hearing_aids_95%_CI_upper','weighted_n_hearing_aids','walking_aids','walking_aids_95%_CI_lower','walking_aids_95%_CI_upper','weighted_n_walking_aids'],
    330: ['province','congenital_abnormalities','congenital_abnormalities_95%_CI_lower','congenital_abnormalities_95%_CI_upper','acident_injuries','acident_injuries_95%_CI_lower','acident_injuries_95%_CI_upper','disease','disease_95%_CI_lower','disease_95%_CI_upper','unknown','unknown_95%_CI_lower','unknown_95%_CI_upper','weighted_n'],
    331: ['province','stroke','diabetes','cancer','hypertension','cataract','others','weighted_n'],
    332: ['province','developmental_delays','autism','asperger_syndrome','add_adhd_gpph','celebral_palsy','dyslexia','conduct_disorder','mental_retardation','down_syndrome','weighted_n'],
    334: ['province','disability','disability_95%_CI_lower','disability_95%_CI_upper','weighted_n'],
    337: ['province','physical_disabilities','physical_disabilities_95%_CI_lower','physical_disabilities_95%_CI_upper','intelectual_disabilities','intelectual_disabilities_95%_CI_lower','intelectual_disabilities_95%_CI_upper','mental_disabilities','mental_disabilities_95%_CI_lower','mental_disabilities_95%_CI_upper','sensory_disabilities','sensory_disabilities_95%_CI_lower','sensory_disabilities_95%_CI_upper','communication_disabilities','communication_disabilities_95%_CI_lower','communication_disabilities_95%_CI_upper','weighted_n'],
    340: ['province','disability','disability_95%_CI_lower','disability_95%_CI_upper','weighted_n'],
    344: ['province','cognitive_domain','cognitive_domain_95%_CI_lower','cognitive_domain_95%_CI_upper','mobility_domain','mobility_domain_95%_CI_lower','mobility_domain_95%_CI_upper','self_care_domain','self_care_domain_95%_CI_lower','self_care_domain_95%_CI_upper','getting_along_domain','getting_along_domain_95%_CI_lower','getting_along_domain_95%_CI_upper','daily_activity_domain','daily_activity_domain_95%_CI_lower','daily_activity_domain_95%_CI_upper','participation_domain','participation_domain_95%_CI_lower','participation_domain_95%_CI_upper','weighted_n'],
    346: ['province','no_dificulty','no_dificulty_95%_CI_lower','no_dificulty_95%_CI_upper','mild_dificulty','mild_dificulty_95%_CI_lower','mild_dificulty_95%_CI_upper','moderate_dificulty','moderate_dificulty_95%_CI_lower','moderate_dificulty_95%_CI_upper','severe_dificulty','severe_dificulty_95%_CI_lower','severe_dificulty_95%_CI_upper','weighted_n'],
    349: ['province','community_health_center','hospital','private_health_center','clinic','drug_store','traditional_medicine','dont_use_health_facilities','weighted_n'],
    351: ['province','independent','independent_95%_CI_lower','independent_95%_CI_upper','mild_dependency','mild_dependency_95%_CI_lower','mild_dependency_95%_CI_upper','moderate_dependency','moderate_dependency_95%_CI_lower','moderate_dependency_95%_CI_upper','severe_dependency','severe_dependency_95%_CI_lower','severe_dependency_95%_CI_upper','total_dependency','total_dependency_95%_CI_lower','total_dependency_95%_CI_upper','weighted_n'],
    353: ['province','closest_family','closest_family_95%_CI_lower','closest_family_95%_CI_upper','domestic_helper','domestic_helper_95%_CI_lower','domestic_helper_95%_CI_upper','eldercare_worker','eldercare_worker_95%_CI_lower','eldercare_worker_95%_CI_upper','nurse','nurse_95%_CI_lower','nurse_95%_CI_upper','none','none_95%_CI_lower','none_95%_CI_upper','weighted_n'],
    357: ['province','toothache','toothache_95%_CI_lower','toothache_95%_CI_upper','missing_pulled_came_out_teeth','missing_pulled_came_out_teeth_95%_CI_lower','missing_pulled_came_out_teeth_95%_CI_upper','filed_teeth_due_cavities','filed_teeth_due_cavities_95%_CI_lower','filed_teeth_due_cavities_95%_CI_upper','loose_teeth','loose_teeth_95%_CI_lower','loose_teeth_95%_CI_upper','sensitive_teeth','sensitive_teeth_95%_CI_lower','sensitive_teeth_95%_CI_upper','weighted_n'],
    359: ['province','abscesses','abscesses_95%_CI_lower','abscesses_95%_CI_upper','bleeding_gums','bleeding_gums_95%_CI_lower','bleeding_gums_95%_CI_upper','canker_sores_recur_min4x','canker_sores_recur_min4x_95%_CI_lower','canker_sores_recur_min4x_95%_CI_upper','canker_sores_persist_dont_heal_atleast_1_month','canker_sores_persist_dont_heal_atleast_1_month_95%_CI_lower','canker_sores_persist_dont_heal_atleast_1_month_95%_CI_upper','weighted_n'],
    362: ['province','dental_oral_problems','dental_oral_problems_95%_CI_lower','dental_oral_problems_95%_CI_upper','weighted_n_problems','receive_treatment_from_dental_workers','receive_treatment_from_dental_workers_95%_CI_lower','receive_treatment_from_dental_workers_95%_CI_upper','received_treatment_from_dental_therapist','received_treatment_from_dental_therapist_95%_CI_lower','received_treatment_from_dental_therapist_95%_CI_upper','reveive_treatment_from_general_practioner','reveive_treatment_from_general_practioner_95%_CI_lower','reveive_treatment_from_general_practioner_95%_CI_upper','weighted_n_treatment'],
    365: ['province','never_seek_treatment_from_health_profesional','never_seek_treatment_from_health_profesional_95%_CI_lower','never_seek_treatment_from_health_profesional_95%_CI_upper','weighted_n_never_seek_treatment','reason_not_having_toothache','reason_not_having_toothache_95%_CI_lower','reason_not_having_toothache_95%_CI_upper','reason_no_urgency','reason_no_urgency_95%_CI_lower','reason_no_urgency_95%_CI_upper','reason_going_to_unlicensed_dentist','reason_going_to_unlicensed_dentist_95%_CI_lower','reason_going_to_unlicensed_dentist_95%_CI_upper','reason_self_treatment','reason_self_treatment_95%_CI_lower','reason_self_treatment_95%_CI_upper','weighted_n_reason'],
    368: ['province','taking_medicine','taking_medicine_95%_CI_lower','taking_medicine_95%_CI_upper','dental_hygine_and_health_counseling','dental_hygine_and_health_counseling_95%_CI_lower','dental_hygine_and_health_counseling_95%_CI_upper','tooth_filling','tooth_filling_95%_CI_lower','tooth_filling_95%_CI_upper','tooth_pulling','tooth_pulling_95%_CI_lower','tooth_pulling_95%_CI_upper','dentures_installation','dentures_installation_95%_CI_lower','dentures_installation_95%_CI_upper','scaling','scaling_95%_CI_lower','scaling_95%_CI_upper','weighted_n'],
    370: ['province','affected','affected_95%_CI_lower','affected_95%_CI_upper','unaffected','unaffected_95%_CI_lower','unaffected_95%_CI_upper','weighted_n'],
    373: ['province','not_brushing_teeth_daily','not_brushing_teeth_daily_95%_CI_lower','not_brushing_teeth_daily_95%_CI_upper','brushing_teeth_daily_1x','brushing_teeth_daily_1x_95%_CI_lower','brushing_teeth_daily_1x_95%_CI_upper','brushing_teeth_daily_2x','brushing_teeth_daily_2x_95%_CI_lower','brushing_teeth_daily_2x_95%_CI_upper','brush_teeth_in_correct_time','brush_teeth_in_correct_time_95%_CI_lower','brush_teeth_in_correct_time_95%_CI_upper','weighted_n'],
    375: ['province','owned_toothbrush_individually','owned_toothbrush_individually_95%_CI_lower','owned_toothbrush_individually_95%_CI_upper','owned_toothbrush_collectively','owned_toothbrush_collectively_95%_CI_lower','owned_toothbrush_collectively_95%_CI_upper','always_using_toothpaste','always_using_toothpaste_95%_CI_lower','always_using_toothpaste_95%_CI_upper','occasionally_using_toothpaste','occasionally_using_toothpaste_95%_CI_lower','occasionally_using_toothpaste_95%_CI_upper','weighted_n'],
    388: ['province','obtain_medicine_without_doctor_prescription','obtain_medicine_without_doctor_prescription_95%_CI_lower','obtain_medicine_without_doctor_prescription_95%_CI_upper','weighted_n_without_doctor_prescription','licensed_pharmacy','shop_or_supermarkets','gift_from_others','online_purchasing','others','weighted_n_source_medicine'],
    391: ['province','receive_med_information','receive_med_information_95%_CI_lower','receive_med_information_95%_CI_upper','weighted_n_receive_med_information','medicine_name_and_content','med_indication','med_dosage','med_instruction','side_effect','expiration_time','med_storage','weighted_n_types_information'],
    394: ['province','respondent_knows_med_clasification_etc','respondent_knows_med_clasification_etc_95%_CI_lower','respondent_knows_med_clasification_etc_95%_CI_upper','weighted_n'],
    397: ['province','using_antibiotic_oral_within_1_year','using_antibiotic_oral_within_1_year_95%_CI_lower','using_antibiotic_oral_within_1_year_95%_CI_upper','weighted_n_using_antibiotic_oral_within_1_year','obtain_antibiotic_with_doctor_prescription','obtain_antibiotic_without_doctor_prescription','weighted_n_obtain_antibiotic'],
    400: ['province','hospital_or_health_centers_or_independent_doctor_practices','independent_practices_of_healt_workers_but_not_doctor','licensed_pharmacy','shop','gifts_from_others','online_purchasing','weighted_n'],
    403: ['province','purchase_antibiotic_without_doctor_prescript_and_treat_symptoms_within_1_year','purchase_antibiotic_without_doctor_prescript_and_treat_symptoms_within_1_year_95%_CI_lower','purchase_antibiotic_without_doctor_prescript_and_treat_symptoms_within_1_year_95%_CI_upper','weighted_n'],
    406: ['province','antibiotic_must_finished','antibiotic_taken_acording_instruction','antibiotics_may_not_be_purchased_without_doctor_prescription','remaining_antibiotic_cant_be_reused','germs_become_resistant_if_not_used_accordingly','antibiotics_only_used_for_bacterial_infections','weighted_n'],
    409: ['province','med_breaks_has_holes','change_color_smell_taste','container_damaged','creams_become_murky','capsules_moist_mushy','illegible_torn_label','expired_medicine','weighted_n'],
    412: ['province','separating_medicine','destroy_medicine','throw_med_directly','burned_buried','entrusted_to_pharmacy','medicine_remains_stored','weighted_n'],
    415: ['province','yankestrad_utilization','yankestrad_utilization_95%_CI_lower','yankestrad_utilization_95%_CI_upper','weighted_n'],
    419: ['province','health_worker_in_hospital_or_clinic','independent_practice_of_health_personel','nakestrad_in_griya_sehat','traditional_healers','self_effort','weighted_n'],
    424: ['province','finished_concoction','homemade_concoction_yankestrad','homemade_concoction_self_made','skill_thought','skill_energy','manual_skill_massage','manual_skill_fractured_bone','manual_skill_acupunture','manual_skill_cupping','manual_skill_others','weighted_n'],
    426: ['province','toga_utilization','toga_utilization_95%_CI_lower','toga_utilization_95%_CI_upper','weighted_n'],
    430: ['province','correct_knowledge_of_stunted_children','correct_knowledge_of_stunted_children_95%_CI_lower','correct_knowledge_of_stunted_children_95%_CI_upper','weighted_n'],
    432: ['province','doesnt_gain_weight','failure_to_grow','dwarfism','malnutrition_or_wasting','short_stunted','prolonged_malnutrition','under_weight','chronic_lack_of_energy','anemia','weighted_n'],
    434: ['province','health_workers','health_workers_95%_CI_lower','health_workers_95%_CI_upper','family_parent_friends','family_parent_friends_95%_CI_lower','family_parent_friends_95%_CI_upper','mass_media','mass_media_95%_CI_lower','mass_media_95%_CI_upper','others','others_95%_CI_lower','others_95%_CI_upper','weighted_n'],
    436: ['province','less_nutrition_intake','insufficient_maternal_nutritional_during_pregnancy','lack_of_sanitation','children_often_suffer_from_illness','poverty','hereditary_disease','weighted_n'],
    438: ['province','risk_of_suffering_from_NCD','decrease_level_of_intelligence','stunted_physical_growth','hampered_brain_development','low_productivity_level','has_no_impact','weighted_n'],
    440: ['province','children_are_given_exclusive_breast_milk','children_given_breast_milk_for_2_years','children_get_mpasi','immunize_babies','monitor_weight_height_gain_each_month','pregnant_women_drink_blood_increasement_tablet','pregnant_women_regulary_check_pregnancy_6_times','pregnant_women_consume_animal_protein','weighted_n'],
    444: ['province','llins_mosquito_nets_usage','llins_mosquito_nets_usage_95%_CI_lower','llins_mosquito_nets_usage_95%_CI_upper','weighted_n_llins_mosquito_nets_usage','llins_musquito_nets_usage_on_child_under_5','llins_musquito_nets_usage_on_child_under_5_95%_CI_lower','llins_musquito_nets_usage_on_child_under_5_95%_CI_upper','weighted_n_llins_musquito_nets_usage_on_child_under_5'],
    450: ['province','sleep_using_musquito_net_without_insecticide','sleep_using_long_lasting_insecticide_nets_less_than_3_years','sleep_using_long_lasting_insecticide_nets_more_than_3_years','using_repellents_prevent_mosquito_bites','using_mosquito_repellent','weighted_n'],
    454: ['province','proper_defecation_behavior','proper_defecation_behavior_95%_CI_lower','proper_defecation_behavior_95%_CI_upper','weighted_n'],
    457: ['province','proper_hand_washing_behavior','proper_hand_washing_behavior_95%_CI_lower','proper_hand_washing_behavior_95%_CI_upper','weighted_n'],
    461: ['province','daily_smoker','daily_smoker_95%_CI_lower','daily_smoker_95%_CI_upper','occasional_smoker','occasional_smoker_95%_CI_lower','occasional_smoker_95%_CI_upper','ex_smoker','ex_smoker_95%_CI_lower','ex_smoker_95%_CI_upper','non_smoker','non_smoker_95%_CI_lower','non_smoker_95%_CI_upper','weighted_n'],
    463: ['province','daily_smoker','daily_smoker_95%_CI_lower','daily_smoker_95%_CI_upper','occasional_smoker','occasional_smoker_95%_CI_lower','occasional_smoker_95%_CI_upper','ex_smoker','ex_smoker_95%_CI_lower','ex_smoker_95%_CI_upper','non_smoker','non_smoker_95%_CI_lower','non_smoker_95%_CI_upper','weighted_n'],
    465: ['province','avg_num_cigarettes_per_day','SD_num_cigarettes_per_day','weighted_n_cigarettes_per_day','avg_num_cigarettes_per_week','SD_num_cigarettes_per_week','weighted_n_cigarettes_per_week'],
    467: ['province','avg_cigarette_price','SD_cigarette_price','weighted_n'],
    469: ['province','age_4_to_9','age_4_to_9_95%_CI_lower','age_4_to_9_95%_CI_upper','age_10_to_14','age_10_to_14_95%_CI_lower','age_10_to_14_95%_CI_upper','age_15_to_19','age_15_to_19_95%_CI_lower','age_15_to_19_95%_CI_upper','age_20_to_24','age_20_to_24_95%_CI_lower','age_20_to_24_95%_CI_upper','age_25_to_29','age_25_to_29_95%_CI_lower','age_25_to_29_95%_CI_upper','age_more_then_30','age_more_then_30_95%_CI_lower','age_more_then_30_95%_CI_upper','weighted_n'],
    471: ['province','age_4_to_9','age_4_to_9_95%_CI_lower','age_4_to_9_95%_CI_upper','age_10_to_14','age_10_to_14_95%_CI_lower','age_10_to_14_95%_CI_upper','age_15_to_19','age_15_to_19_95%_CI_lower','age_15_to_19_95%_CI_upper','age_20_to_24','age_20_to_24_95%_CI_lower','age_20_to_24_95%_CI_upper','age_25_to_29','age_25_to_29_95%_CI_lower','age_25_to_29_95%_CI_upper','age_more_then_30','age_more_then_30_95%_CI_lower','age_more_then_30_95%_CI_upper','weighted_n'],
    473: ['province','kretek','kretek_95%_CI_lower','kretek_95%_CI_upper','white','white_95%_CI_lower','white_95%_CI_upper','hand_rolled','hand_rolled_95%_CI_lower','hand_rolled_95%_CI_upper','electronic','electronic_95%_CI_lower','electronic_95%_CI_upper','shisa','shisa_95%_CI_lower','shisa_95%_CI_upper','weighted_n'],
    475: ['province','smooking_in_buildings','smooking_in_buildings_95%_CI_lower','smooking_in_buildings_95%_CI_upper','weighted_n'],
    477: ['province','smoking_in_closed_room_every_day','smoking_in_closed_room_every_day_95%_CI_lower','smoking_in_closed_room_every_day_95%_CI_upper','smoking_in_closed_room_occasionally','smoking_in_closed_room_occasionally_95%_CI_lower','smoking_in_closed_room_occasionally_95%_CI_upper','smoking_in_closed_room_never','smoking_in_closed_room_never_95%_CI_lower','smoking_in_closed_room_never_95%_CI_upper','weighted_n'],
    479: ['province','chewing_tobaco_every_day','chewing_tobaco_every_day_95%_CI_lower','chewing_tobaco_every_day_95%_CI_upper','chewing_tobaco_occasionally','chewing_tobaco_occasionally_95%_CI_lower','chewing_tobaco_occasionally_95%_CI_upper','former_chewing_tobaco','former_chewing_tobaco_95%_CI_lower','former_chewing_tobaco_95%_CI_upper','never_chewing_tobaco','never_chewing_tobaco_95%_CI_lower','never_chewing_tobaco_95%_CI_upper','weighted_n'],
    482: ['province','sweet_consumption_more_than_1_times_per_day','sweet_consumption_1_to_6_times_per_week','sweet_consumption_less_than_3_times_per_month','weighted_n'],
    484: ['province','sweet_consumption_more_than_1_times_per_day','sweet_consumption_1_to_6_times_per_week','sweet_consumption_less_than_3_times_per_month','weighted_n'],
    486: ['province','salty_consumption_more_than_1_times_per_day','salty_consumption_1_to_6_times_per_week','salty_consumption_less_than_3_times_per_month','weighted_n'],
    488: ['province','fatty_consumption_more_than_1_times_per_day','fatty_consumption_1_to_6_times_per_week','fatty_consumption_less_than_3_times_per_month','weighted_n'],
    490: ['province','roasted_food_consumption_more_than_1_times_per_day','roasted_food_consumption_1_to_6_times_per_week','roasted_food_consumption_less_than_3_times_per_month','weighted_n'],
    492: ['province','processed_food_consumption_more_than_1_times_per_day','processed_food_consumption_1_to_6_times_per_week','processed_food_consumption_less_than_3_times_per_month','weighted_n'],
    494: ['province','flavor_enhancer_consumption_more_than_1_times_per_day','flavor_enhancer_consumption_1_to_6_times_per_week','flavor_enhancer_consumption_less_than_3_times_per_month','weighted_n'],
    496: ['province','soft_drink_consumption_more_than_1_times_per_day','soft_drink_consumption_1_to_6_times_per_week','soft_drink_consumption_less_than_3_times_per_month','weighted_n'],
    498: ['province','energy_drink_consumption_more_than_1_times_per_day','energy_drink_consumption_1_to_6_times_per_week','energy_drink_consumption_less_than_3_times_per_month','weighted_n'],
    500: ['province','instant_noodle_consumption_more_than_1_times_per_day','instant_noodle_consumption_1_to_6_times_per_week','instant_noodle_consumption_less_than_3_times_per_month','weighted_n'],


}

### Extract File

# Names (extension stripped) of the CSV files that already exist in ./datasets.
file_names = []
for entry in os.listdir('./datasets'):
    if not entry.endswith('.csv'):
        continue
    if not os.path.isfile(os.path.join('./datasets', entry)):
        continue
    file_names.append(os.path.splitext(entry)[0])

# Walk every (page, column labels) pair and extract only the tables whose
# derived file name is not among the CSVs found above.
for page_file, column_name in province_dict.items():
    file_name_result = extract_filename(page_file=page_file)
    if file_name_result in file_names:
        continue
    extract_province(save_filename=file_name_result, page_file=page_file, column_name=column_name)
    print(page_file,' ',file_name_result)


486   Table 11.51 Proportion of Salty Food Consumption Habit in Population Aged ≥3 by Province, 2023 SKI
488   Table 11.53 Proportion of Fatty or Cholesterol or Fried Food Consumption Habits in Population Aged ≥ 3 by Province, 2023 SKI
490   Table 11.55 Proportion of Roasted Food Consumption Habit in Population Aged ≥ 3 by Province, 2023 SKI
492   Table 11.57 Proportions of Consumption Habit of Processed Meat or Chicken or Fish with Preservatives among Population Aged ≥ 3 by Province, 2023 SKI
494   Table 11.59 Proportion of Flavor Enhancer Consumption Habits in Population Aged ≥ 3 by Province, 2023 SKI
496   Table 11.61 Proportion of Soft Drink or Carbonated Drink Consumption Habit in Population Aged ≥ 3 by Province, 2023 SKI
498   Table 11.63 Proportion of Energy Drink Consumption Habits in Population Aged ≥ 3 by Province, 2023 SKI
500   Table 11.65 Proportion of Instant Noodle or Other Instant Food Consumption Habit in Population Aged ≥ 3 by Province, 2023 SKI


In [None]:
# Scratch check: write a tiny throwaway frame under a very long table-title file name.
# NOTE(review): presumably probing whether such a long path can be written — confirm.
# NOTE(review): this leaves a dummy CSV in ./datasets; remove it after the check so it
# is not mistaken for a real extracted table.
long_title_csv = './datasets/Table 10.5 Proportion of Respondents who knew the Medicines Classification Purchased, Including Over the Counter Medicines, Limited Over the Counter Medicines, or Potent Medicines by Province, 2023 SKI.csv'
df = pd.DataFrame([[1, 3], [2, 4]], columns=['a', 'b'])
df.to_csv(long_title_csv)

In [13]:
# Character count of a candidate output path built from the Table 10.5 title
# (the recorded result for this cell is 216).
# NOTE(review): presumably a check against a file-system path-length limit — confirm.
len('./datasets/Table 10.5 Proportion of Respondents who knew the Medicines Classification Purchased, Including Over-the-Counter Medicines, Limited Over-the-Counter Medicines, or Potent Medicines by Province, 2023 SKI.csv')

216

In [None]:
# Preview a single table before extracting it: print the file name derived for
# the page and display the parsed province rows for the SAME page.
# The file name used to be derived from page 365 while the rows (and the
# commented-out save call) used page 362, so re-enabling the save would have
# stored page 362's table under page 365's title.
preview_page = 362
test = extract_filename(page_file=preview_page)
print(test)
show_province(page_file=preview_page, column_name=province_dict[preview_page])
# extract_province(save_filename=test, page_file=preview_page, column_name=province_dict[preview_page])

In [6]:
# Scratch prototype of the province-table parser, run against one PDF page:
# pull the page text, keep the table rows, normalise the number formatting
# with a fixed sequence of regex passes, then split every row into
# [province, value, value, ...] and show the result as an unlabeled DataFrame.
page_file = 469
# Take the labels from province_dict so they always describe the page being
# parsed. The previous hard-coded list (17 labels, copied from the dental
# table) did not fit this page, whose rows carry 20 fields.
column_name = list(province_dict[page_file])

page = reader.pages[page_file].extract_text()
# Initialize variables
start_found = False
lines = []

page_line = page.splitlines()

# Keep the lines from the first one mentioning "Aceh" through the first later
# one mentioning "INDONESIA" (inclusive).
for line in page_line:
    if "Aceh" in line:
        start_found = True
    if start_found:
        lines.append(line.strip())
        # Only an "INDONESIA" seen after the table has started ends the scan;
        # an earlier mention (e.g. in a page header) must not abort it with
        # nothing collected.
        if "INDONESIA" in line:
            break

# Ordered (pattern, replacement) passes. The order matters: each pass works on
# the output of the previous one.
# NOTE(review): the last two passes cut everything after the first decimal
# digit, so a fused value such as '87,792,1' is already reduced to '87,7'
# before the fused-number repair further down can split it — confirm that this
# order is intended.
cleanup_steps = [
    # comma after a digit when whitespace or another non-digit follows -> dropped
    (r'(\d),(?=\s|\D)', r'\1'),
    # runs of two or more whitespace characters -> single space
    (r'\s{2,}', ' '),
    # hyphen lacking whitespace on at least one side -> ' - '
    (r'(?<!\s)-|-(?!\s)', ' - '),
    # comma (and optional whitespace) directly before a hyphen -> ' -'
    (r',\s*-', ' -'),
    # keep only the first digit after a decimal comma
    (r"(\d+,\d)\d*", r"\1"),
    # drop a second ',digits' group glued to a 'd,d' value
    (r"(?<=\d,\d),\d*", ""),
]
for pattern, replacement in cleanup_steps:
    lines = [re.sub(pattern, replacement, text) for text in lines]


## cleaned lines for specific case
clean_lines = []
for line in lines:
    if re.search(r'\b\d{1,3},\d{1,3},\d{1,3}\b', line):
        # Two numbers fused together: re-insert the ' - ' after the first
        # number's single decimal digit.
        fixed_line = re.sub(r'(\d{1,3},\d{1,1})(\d{1,3},\d{1,3})', r'\1 - \2', line)
        clean_lines.append(fixed_line)
    elif page_file == 116 and next(
        (prov for prov in province_list
         if re.match(r'^\b' + re.escape(prov) + r'\b', line) and prov.split()),
        None,
    ) == 'DKI Jakarta':
        # One-off repair for the DKI Jakarta row on page 116.
        fixed_line = re.sub(r'0,0 - 2,0', '0,0 0,0 - 0,0 2,0', line)
        clean_lines.append(fixed_line)
    else:
        clean_lines.append(line)

data=[]
for line in clean_lines:

    # Pick the LONGEST province name the line starts with, so the match does
    # not depend on the order of province_list (e.g. 'Papua' vs 'Papua Barat').
    province = max(
        (prov for prov in province_list if line.startswith(prov)),
        key=len,
        default=None,
    )
    if province:
        # Remove the province name from the start of the line
        line_without_province = line[len(province):].strip()
        # Strip every character that is not a letter, digit, whitespace, '.' or ','
        rest = [re.sub(r'[^a-zA-Z0-9\s.,]', '', val) for val in line_without_province.split()]
        rest_cleaned =  [item for item in rest if item!='']

        # Combine province name with the rest of the data
        data.append([province] + rest_cleaned)

pd.DataFrame(data)
# df = pd.DataFrame(data,columns = column_name)
# # for i in column_name:
# #     if df[i].str.contains(r",", regex=True).sum() > 0:
# #         try:
# #             df[i] = df[i].replace("NA", np.nan).str.replace(",",".").astype(float)
# #         except ValueError as e :
# #             print(f"Error in column: {i} - {e}")

# ## exceptional pages difference
# if page_file == 248:
#     for i in column_name[1:len(column_name)-1]:
#         df[i] =  df[i].str.replace('.',',')

# for i in column_name:
#     if i == 'province':
#         df[i] = df[i].astype(str)
#     elif df[i].str.contains(r",", regex=True).sum() > 0:
#         df[i] = df[i].replace("NA", np.nan).str.replace(",",".").astype(float)
#     else:
#         df[i] = df[i].str.replace(".","").astype(int)
# df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,Aceh,5,3,8,133,114,154,646,621,669,180,163,198,27,22,34,9,6,15,1.91
1,Sumatera Utara,8,6,12,124,111,138,617,596,639,201,183,220,31,25,37,19,15,25,5.996
2,Sumatera Barat,25,21,30,280,261,300,548,528,567,108,97,121,24,20,30,15,11,19,3.129
3,Riau,9,6,15,173,155,192,638,615,660,142,127,159,22,18,28,15,11,21,2.451
4,Jambi,7,4,13,159,134,188,617,587,646,176,155,199,28,21,38,13,9,20,1.064
5,Sumatera Selatan,13,9,17,179,164,195,619,600,638,149,135,164,27,22,33,14,11,18,3.784
6,Bengkulu,14,10,21,194,171,219,627,602,652,115,101,131,33,26,42,16,12,22,1.011
7,Lampung,11,8,16,147,130,165,621,599,642,176,160,193,29,23,38,16,12,21,4.361
8,Bangka Belitung,11,7,17,180,158,203,607,580,633,163,144,185,21,15,29,19,14,26,647.0
9,Kepulauan Riau,11,7,19,161,136,190,578,528,627,196,162,235,33,23,48,21,13,32,916.0


In [8]:
def normalize_table_title(title):
    """Turn a table title taken from the report into a file-name friendly string.

    Steps, in order:
      1. '/' becomes ' or '.
      2. Stray whitespace inside the table number is removed
         ('Table 10. 3' -> 'Table 10.3').
      3. A dot directly after the table number is dropped
         ('Table 2.5. Profile' -> 'Table 2.5 Profile').
      4. '&', '>' and '<' are spelled out as 'and', 'more than', 'less than'.

    The number spacing is repaired BEFORE the trailing dot is removed;
    otherwise a title such as 'Table 10. 3. ...' keeps its dot, because the
    dot pattern only recognises an already well-formed 'Table N.M.'.

    Args:
        title: raw table title.

    Returns:
        The cleaned title.
    """
    title = title.replace('/', ' or ')
    title = re.sub(r'(Table \d\.)\s', r'\1', title)
    title = re.sub(r'Table (\d+)\.\s*(\d+)', r'Table \1.\2', title)
    title = re.sub(r'(\bTable \d+\.\d+)\.', r'\1', title)
    title = title.replace('&', 'and')
    title = title.replace('>', 'more than')
    title = title.replace('<', 'less than')
    return title


save_filename = 'Table 10. 3 Proportion of Respondents who Received Medicine Information from Health Workers and Types of Medicine Information Obtained by Province, 2023 SKI'
save_filename = normalize_table_title(save_filename)
print(save_filename)

Table 10.3 Proportion of Respondents who Received Medicine Information from Health Workers and Types of Medicine Information Obtained by Province, 2023 SKI


In [55]:
# Scratch cell: inspect the raw text lines of the English-report page that is
# read for the table title of `page_file`.
page_file = 55

# Every indicator appears three times: the bare name, then the name with the
# '_95%_CI_lower' and '_95%_CI_upper' suffixes.
pct_column = [
    indicator + suffix
    for indicator in ('no_access', 'inadequate_access', 'adequate_limited_access', 'adequate_basic_access')
    for suffix in ('', '_95%_CI_lower', '_95%_CI_upper')
]
column_name = ['province'] + pct_column + ['weighted_n']

#### Title file
# The title text is read from index `page_file-1` of the English PDF.
page_title = reader_english.pages[page_file-1].extract_text()

# Initialize variables
start_title = False
list_title = []

split_title = page_title.splitlines()

# Earlier experiments that used to sit here as commented-out code:
#   * trimming split_title so that it starts at the first line containing "Table";
#   * collecting the lines from the first "Table" line through the first line
#     containing "SKI" into list_title;
#   * joining those lines into save_filename and replacing '/' with ' or ',
#     dropping the dot after 'Table N.M' and replacing '&' with 'and'.

# Display the raw lines of the page.
split_title

['2023 Indonesian Health Survey | 17  ',
 'ECONOMIC STATUS PROFILE ',
 ' ',
 'Table 2.5. Profile of the Economic Status Distribution by Province, 2023 SKI ',
 ' ',
 '  Economic Status (%)  ',
 'Province Lowest ',
 '(Q1) ',
 'Lower ',
 'Middle (Q2) ',
 'Middle ',
 '(Q3) ',
 'Upper Middle ',
 '(Q4) ',
 'Highest ',
 '(Q5) ',
 'Aceh 18.0 23.0 20.5 19.7 18.9 ',
 'North Sumatra 29.5 19.8 19.8 15.8 15.1 ',
 'West Sumatra 14.5 22.4 17.6 23.8 21.8 ',
 'Riau 9.2 18.4 26.6 23.4 22.4 ',
 'Jambi 11.1 21.5 18.3 24.1 25.0 ',
 'South Sumatra 20.9 22.9 20.9 17.1 18.1 ',
 'Bengkulu 15.1 20.5 20.5 27.2 16.8 ',
 'Lampung 20.7 27.0 21.3 16.7 14.3 ',
 'Bangka Belitung Islands 4.4 9.8 19.7 27.6 38.5 ',
 'Riau Islands 10.3 13.4 15.7 26.4 34.1 ',
 'DKI Jakarta 1.2 6.2 16.7 34.1 41.9 ',
 'West Java 9.7 14.5 22.5 28.1 25.3 ',
 'Central Java 16.3 21.7 22.9 20.6 18.6 ',
 'DI Yogyakarta 12.0 19.9 15.2 25.7 27.2 ',
 'East Java 15.9 20.3 23.3 21.8 18.7 ',
 'Banten 8.1 11.7 20.6 26.9 32.8 ',
 'Bali 6.7 16.7 21.7 27.3 

In [61]:
# Sample text
# Sample text
lines = [
    "Kepulauan Riau 0,4 0,1 - 2,1 7,5 6,1 - 9,2 2,0 1,1 - 3,6 90,1 87,792,1 2.514",
    "Another line without matching pattern",
    "Line with 34,539,2 and more"
]

# Three comma-separated digit groups in a row mark two numbers fused together.
fused_pair = re.compile(r'\b\d{1,3},\d{1,3},\d{1,3}\b')
# Group 1 is the first number (one decimal digit), group 2 the second number.
split_point = re.compile(r'(\d{1,3},\d{1,1})(\d{1,3},\d{1,3})')

clean_lines = []
for raw in lines:
    if not fused_pair.search(raw):
        # Nothing fused on this line: keep it untouched.
        clean_lines.append(raw)
        continue
    # Put the missing ' - ' back between the two numbers.
    repaired = split_point.sub(r'\1 - \2', raw)
    clean_lines.append(repaired)
    print("Original line:", raw)
    print("Fixed line:", repaired)

# To print the final cleaned lines
print("\nCleaned lines:")
for cleaned in clean_lines:
    print(cleaned)

Original line: Kepulauan Riau 0,4 0,1 - 2,1 7,5 6,1 - 9,2 2,0 1,1 - 3,6 90,1 87,792,1 2.514
Fixed line: Kepulauan Riau 0,4 0,1 - 2,1 7,5 6,1 - 9,2 2,0 1,1 - 3,6 90,1 87,7 - 92,1 2.514
Original line: Line with 34,539,2 and more
Fixed line: Line with 34,5 - 39,2 and more

Cleaned lines:
Kepulauan Riau 0,4 0,1 - 2,1 7,5 6,1 - 9,2 2,0 1,1 - 3,6 90,1 87,7 - 92,1 2.514
Another line without matching pattern
Line with 34,5 - 39,2 and more


In [173]:
# Preview, through show_province, the rows parsed from page 65.
page_file = 65
# NOTE(review): this title names the economic-status table while the columns
# below describe access levels — confirm which title belongs to this page.
save_filename = 'Table 2.5. Profile of the Economic Status Distribution by Province, 2023 SKI'

# Every access level appears three times: the bare name, then the name with
# the '_95%_CI_lower' and '_95%_CI_upper' suffixes.
pct_column = [
    level + suffix
    for level in ('easy_access', 'difficult_access', 'very_difficult_access')
    for suffix in ('', '_95%_CI_lower', '_95%_CI_upper')
]
column_name = ['province'] + pct_column + ['weighted_n']

show_province(save_filename=save_filename, page_file=page_file, column_name=column_name, pct_column=pct_column)


Unnamed: 0,province,easy_access,easy_access_95%_CI_lower,easy_access_95%_CI_upper,difficult_access,difficult_access_95%_CI_lower,difficult_access_95%_CI_upper,very_difficult_access,very_difficult_access_95%_CI_lower,very_difficult_access_95%_CI_upper,weighted_n
0,Aceh,27.0,25.0,29.2,48.4,46.1,50.7,24.5,22.5,26.7,4653
1,Sumatera Utara,36.0,33.7,38.3,47.1,44.6,49.5,16.9,15.5,18.4,13087
2,Sumatera Barat,44.2,41.5,46.9,43.0,40.5,45.5,12.8,11.5,14.2,4883
3,Riau,36.6,33.9,39.4,45.9,43.2,48.6,17.5,15.4,19.9,6124
4,Jambi,35.2,32.5,38.0,46.7,44.0,49.5,18.1,16.1,20.3,3440
5,Sumatera Selatan,42.9,40.6,45.2,42.3,40.1,44.6,14.8,13.2,16.6,7928
6,Bengkulu,41.4,38.0,44.8,42.7,39.6,45.9,15.9,13.6,18.5,1950
7,Lampung,35.7,32.9,38.5,50.3,47.6,53.0,14.0,12.3,16.0,9044
8,Bangka Belitung,33.7,30.6,36.9,48.9,45.8,52.0,17.4,14.8,20.4,1409
9,Kepulauan Riau,35.4,30.9,40.3,50.3,46.0,54.6,14.2,11.9,17.0,2079


In [68]:
# Run extract_province for the Table 2.1 response-rate table on page 49.
save_filename = 'Table 2.1. Community Health Data Response Rate (Interview & Measurement) by Province, 2023 SKI'
page_file = 49

# Columns grouped by the prefix they share; each of the first two groups ends
# with its response-rate column.
census_block_cols = ['census_block_target', 'census_block_visited', 'census_block_response_rate']
household_cols = ['household_target', 'household_visited', 'household_response_rate']
member_cols = ['household_member_visited', 'household_member_interview_measurement']

column_name = ['province'] + census_block_cols + household_cols + member_cols
# Only the two response-rate columns are passed as pct_column.
pct_column = [census_block_cols[-1], household_cols[-1]]

extract_province(save_filename=save_filename,page_file=page_file,column_name=column_name,pct_column=pct_column)