In [None]:
import json
from typing import Dict, Set

import pandas as pd
import yaml
from IPython.display import display

# Notebook display limits: render at most 50 rows and 50 columns per frame.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 50)

# Location of the PSGC extract. The directory keeps its trailing slash because
# the two pieces are joined by plain string concatenation when the file is read.
psg_data_file = "psgc_2026-01-13.csv"
psg_directory = "../resources/"

In [None]:
# Load the raw PSGC table. `psgc_id` is read as text so the identifier is never
# parsed as a number (which would drop leading zeros, or turn the column into
# floats when any value is missing).
df = pd.read_csv(psg_directory + psg_data_file, dtype={"psgc_id": str})

# `DataFrame.info()` prints its summary and returns None, so it is called
# directly; wrapping it in `display()` rendered a stray "None" below the summary.
df.info()
display(df)

In [None]:
# Left-pad the identifier with zeros so it is at least 10 characters long.
psgc_id_text = df["psgc_id"].astype(str)
df["psgc_id"] = psgc_id_text.str.zfill(10)


def _strip_if_text(value):
    """Return `value` with surrounding whitespace removed when it is a string."""
    return value.strip() if isinstance(value, str) else value


# Element-wise clean-up of every cell; non-string values pass through untouched.
df = df.map(_strip_if_text)

In [None]:
# Long-form labels for the abbreviated level codes found in `geographic_level`.
# Values without an entry here are left as they are by `replace`.
geographic_level_map = dict(
    Reg="region",
    City="city",
    Mun="municipality",
    Prov="province",
    SubMun="submunicipality",
    Bgy="barangay",
)
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

# Character windows of `psgc_id`, counted from the right-hand end.
# `*_code` columns hold one segment of the identifier; `*_mapper` columns hold
# the prefix running from 10 characters before the end up to that segment.
psgc_id_windows = {
    "barangay_code": slice(-3, None),
    "municipal_or_city_code": slice(-5, -3),
    "province_or_huc_code": slice(-8, -5),
    "region_code": slice(-10, -8),
    "barangay_mapper": slice(-10, None),
    "municipal_or_city_mapper": slice(-10, -3),
    "province_or_huc_mapper": slice(-10, -5),
    "region_mapper": slice(-10, -8),
}
for derived_column, window in psgc_id_windows.items():
    df[derived_column] = df["psgc_id"].str[window]

df.sample(10)

In [None]:
# Rows whose province/HUC, municipality/city and barangay segments are all zero.
regions_filter = (
    df["province_or_huc_code"].eq("000")
    & df["municipal_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)

# Lookup: region prefix -> name of the matching row.
region_rows = df.loc[regions_filter, ["region_mapper", "name"]]
region_names = region_rows.sort_values("region_mapper").set_index("region_mapper")
regions_mapper = region_names["name"].to_dict()
regions_mapper

In [None]:
# Rows with a non-zero province/HUC segment but zero municipality/city and
# barangay segments.
province_or_huc_filter = (
    df["province_or_huc_code"].ne("000")
    & df["municipal_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)

# Lookup: province/HUC prefix -> name of the matching row.
province_rows = df.loc[province_or_huc_filter, ["province_or_huc_mapper", "name"]]
province_names = province_rows.sort_values("province_or_huc_mapper").set_index(
    "province_or_huc_mapper"
)
province_or_huc_mapper = province_names["name"].to_dict()
province_or_huc_mapper

In [None]:
# Rows with non-zero province/HUC and municipality/city segments and a zero
# barangay segment.
municipal_or_city_filter = (
    df["province_or_huc_code"].ne("000")
    & df["municipal_or_city_code"].ne("00")
    & df["barangay_code"].eq("000")
)

# Lookup: municipality/city prefix -> name of the matching row.
municipal_rows = df.loc[municipal_or_city_filter, ["municipal_or_city_mapper", "name"]]
municipal_names = municipal_rows.sort_values("municipal_or_city_mapper").set_index(
    "municipal_or_city_mapper"
)
municipal_or_city_mapper = municipal_names["name"].to_dict()
municipal_or_city_mapper

In [None]:
# Attach the name of each ancestor level to every row by translating the row's
# prefix columns through the lookups; prefixes without an entry become NaN.
for name_column, prefix_column, lookup in (
    ("region", "region_mapper", regions_mapper),
    ("province_or_huc", "province_or_huc_mapper", province_or_huc_mapper),
    ("municipality_or_city", "municipal_or_city_mapper", municipal_or_city_mapper),
):
    df[name_column] = df[prefix_column].map(lookup)

In [None]:
# Keep only barangay-level rows and renumber them from zero.
is_barangay = df["geographic_level"].eq("barangay")
barangay_df = df.loc[is_barangay].reset_index(drop=True)

In [None]:
# Building the dictionary: first flag barangays whose municipality/city or
# province/HUC name could not be resolved (the mapped value is missing).
empty_province_or_huc = barangay_df["province_or_huc"].isnull()
empty_municipality = barangay_df["municipality_or_city"].isnull()

In [None]:
# Columns carried into every subset, and the keys each subset is ordered by.
hierarchy_columns = [
    "region",
    "region_mapper",
    "province_or_huc",
    "province_or_huc_mapper",
    "municipality_or_city",
    "municipal_or_city_mapper",
    "name",
]
hierarchy_sort_keys = ["region", "province_or_huc", "municipality_or_city"]

has_municipality = ~empty_municipality
has_province_or_huc = ~empty_province_or_huc

# Barangays with both a province/HUC name and a municipality/city name.
mdf = barangay_df.loc[
    has_municipality & has_province_or_huc, hierarchy_columns
].sort_values(hierarchy_sort_keys)

# Barangays with a province/HUC name but no municipality/city name.
empty_municipality_df = barangay_df.loc[
    empty_municipality & has_province_or_huc, hierarchy_columns
].sort_values(hierarchy_sort_keys)

# Barangays with a municipality/city name but no province/HUC name.
# Rows missing both names fall into none of the three subsets.
empty_province_df = barangay_df.loc[
    has_municipality & empty_province_or_huc, hierarchy_columns
].sort_values(hierarchy_sort_keys)

In [None]:
# Name-keyed hierarchy: region -> province/HUC -> municipality/city -> set of
# barangay names. Where one intermediate level is missing, the barangay set is
# stored directly under the level that is present.
root_dict: Dict[str, Dict[str, Set[str] | Dict[str, Set]]] = {}

# Full path: region / province-or-HUC / municipality-or-city / barangay.
full_path_rows = mdf[["region", "province_or_huc", "municipality_or_city", "name"]]
for region_name, province_name, city_name, barangay_name in full_path_rows.itertuples(
    index=False, name=None
):
    cities_in_province = root_dict.setdefault(region_name, {}).setdefault(
        province_name, {}
    )
    cities_in_province.setdefault(city_name, set()).add(barangay_name)

# No municipality/city: barangays sit directly under the province/HUC.
no_city_rows = empty_municipality_df[["region", "province_or_huc", "name"]]
for region_name, province_name, barangay_name in no_city_rows.itertuples(
    index=False, name=None
):
    root_dict.setdefault(region_name, {}).setdefault(province_name, set()).add(
        barangay_name
    )

# No province/HUC: barangays sit under the municipality/city, which hangs
# directly off the region.
no_province_rows = empty_province_df[["region", "municipality_or_city", "name"]]
for region_name, city_name, barangay_name in no_province_rows.itertuples(
    index=False, name=None
):
    root_dict.setdefault(region_name, {}).setdefault(city_name, set()).add(
        barangay_name
    )

In [None]:
def _set_to_sorted_list(obj):
    """Fallback encoder for `json.dumps`: serialise a set as a sorted list.

    Iteration order of a set of strings changes between interpreter runs
    (string hashing is randomised), so emitting `list(obj)` made the exported
    files differ on every execution. Sorting gives a stable order; `key=str`
    keeps the sort from failing if a non-string value is present in the set.

    Raises:
        TypeError: for any non-set object, which is how `json.dumps` expects an
            unsupported type to be reported. Returning the object unchanged
            instead surfaces as a misleading circular-reference error.
    """
    if isinstance(obj, set):
        return sorted(obj, key=str)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


json_str = json.dumps(root_dict, default=_set_to_sorted_list, indent=4)
# Round-trip through JSON so the YAML dump receives plain dicts and lists
# rather than Python sets.
json_dict = json.loads(json_str)
yaml_str = yaml.safe_dump(json_dict)

In [None]:
# Write the JSON rendering of the hierarchy to disk.
with open("../barangay/data/barangay.json", mode="w", encoding="utf8") as json_file:
    json_file.write(json_str)

In [None]:
# Write the YAML rendering of the hierarchy to disk.
with open("../barangay/data/barangay.yaml", mode="w", encoding="utf8") as yaml_file:
    yaml_file.write(yaml_str)