This notebook tries to find commune codes and names of CFs as they are represented in DB23 (`ncdd_admin_database_25provinces__2023.xlsx` from Lok on 20231212).

The focus is on the 36 CFs that were matched neither programmatically in `explore_DB23.ipynb` nor manually. A GIS-assisted matching is used for them.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely import wkt
from pathlib import Path
from itables import show

In [2]:
# Reload imported modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

In [3]:
# NOTE(review): wildcard import; in the cells visible in this notebook the only
# name that appears to come from this module is `match_commune_by_codes` —
# confirm, then import it explicitly.
from utils.match_commune_CF_vs_DB23 import *

In [4]:
# Input / intermediate-output folders, resolved relative to the working directory
code_path = Path().absolute()
_base_dir = code_path.parent.parent  # two levels above the working directory
datafd_path = _base_dir / 'data'
intmd_outfd_path = _base_dir / 'output_intmd' / 'match_CF_DB23'

## Read in data

### CF data: 74 CFs whose previously matched CDB communes (if any) could not be matched to a DB23 commune [for ChatGPT, aborted]
CF communes and districts names matched to CDB ones

a list of CDB villages in the commune each CF belongs to

and more

In [None]:
# Only referenced by the aborted ChatGPT section at the end of this notebook
CF_vlst_df = pd.read_csv(intmd_outfd_path / 'CF_w_match_helper_vill_tomatch.csv')

### CF data: all CFs, incl. the 36 (of 74) CFs whose communes are yet to be found in DB23 after auto and manual matching

to be matched in GIS-assisted matching section

In [5]:
# All CFs; rows whose `Code_Comm_DB23` is missing are the ones handled in the
# GIS-assisted matching section below
CF_df = pd.read_csv(intmd_outfd_path / 'CF_w_match_helper_vill_DB23_comm_autoJoined_matched.csv')

#### df to gdf

In [6]:
# Build the GeoDataFrame from a derived frame instead of overwriting
# `CF_df['geometry']`: `CF_df` keeps its WKT strings, so this cell can be
# re-run without `wkt.loads` being fed already-parsed geometries.
CF_gdf = gpd.GeoDataFrame(
    CF_df.assign(geometry=CF_df['geometry'].apply(wkt.loads)),  # WKT text -> shapely geometries
    geometry='geometry',
    crs='EPSG:3148',  # declares the CRS of the stored coordinates (no reprojection)
)

### Admin shapefiles associated with CDB socio-economic data, Lok sent 20230407

In [7]:
# Folder holding the admin shapefiles read in the cells below
CDB_shp_fd_path = datafd_path / 'boundaries' / 'Cambodia_Admin-2015'

#### village points

In [None]:
# v_pt_gdf = gpd.read_file(CDB_shp_fd_path / 'Villages.shp')  # EPSG:32648

#### commune boundaries

In [8]:
# Commune polygons; reprojected to the CF CRS in the next cell
c_bnd_gdf = gpd.read_file(CDB_shp_fd_path / 'Commune Boundary.shp')  # EPSG:32648

In [9]:
# Reproject the commune boundaries into the CRS of the CF polygons so that the
# two layers can be spatially joined
c_gdf = c_bnd_gdf.to_crs(CF_gdf.crs)

#### district boundaries

In [None]:
# d_bnd_gdf = gpd.read_file(CDB_shp_fd_path / 'District Boundary.shp')  # EPSG:32648

#### province boundaries

In [None]:
# p_bnd_gdf = gpd.read_file(CDB_shp_fd_path / 'Province Boundary.shp')  # EPSG:32648

## Initial processing of DB23

### Read in DB23 and stack all sheets vertically into `db_df`

In [10]:
# Read every sheet of the DB23 workbook into a dict keyed by sheet name;
# the first 2 rows are skipped and row 3 supplies the column headers.
DB23_path = datafd_path / 'other' / 'ncdd_admin_database_25provinces__2023.xlsx'
DB23_df_dic = pd.read_excel(
    DB23_path,
    sheet_name=None,
    header=2,
)

In [11]:
# Sheet names and their DataFrames, both in workbook order
sht_name_lst = [sht for sht in DB23_df_dic]
db_prov_df_lst = list(DB23_df_dic.values())

In [12]:
# Stack the per-sheet frames vertically; each frame keeps its own row labels,
# so index values may repeat across sheets (no `ignore_index`).
db_df = pd.concat(db_prov_df_lst)

## GIS-assisted matching

### Extract 36 CFs whose communes to be matched

In [13]:
# CFs still lacking a DB23 commune code (36 to be matched)
CF36_gdf = CF_gdf.loc[CF_gdf['Code_Comm_DB23'].isna()]

### Find their communes based on spatial join with commune boundaries

In [14]:
# Attach every commune polygon that intersects a CF; a CF overlapping several
# communes produces one output row per commune.
_commune_cols = c_gdf[['COMM_CODE', 'COMM_NAME', 'geometry']]
CF36_c_gdf = gpd.sjoin(CF36_gdf, _commune_cols, how="left", predicate='intersects')

In [15]:
# NOTE(review): the mapping is passed positionally without `columns=`, so
# pandas applies it to the row index labels, not to the column names. The
# columns therefore keep their original names (`index_right`, `COMM_CODE`,
# `COMM_NAME`), which is what the following cells actually use. Either drop
# this call or switch to `columns=` and update the downstream references
# together.
CF36_c_gdf.rename({
    'index_right': 'index_shp', 
    'COMM_CODE': 'COMM_CODE_shp',
    'COMM_NAME': 'COMM_NAME_shp'
}, inplace=True)

In [16]:
CF36_c_gdf.shape  # more rows than CFs: some CFs intersect with multiple communes

(72, 58)

### Get DB23 communes from commune boundaries spatially joined (Join codes from commune boundaries to codes in DB23)

In [17]:
# Cast the shapefile commune code to int before merging it against `Code` in DB23
CF36_c_gdf['COMM_CODE'] = CF36_c_gdf['COMM_CODE'].astype(int)

In [18]:
# Look up each spatially joined commune code in DB23. `validate` raises if a
# DB23 code is duplicated; `indicator` adds `_merge` for the check that follows.
_db_code_name = db_df[['Code', 'Name (Latin)']]
CF36_w_DB23 = CF36_c_gdf.merge(
    _db_code_name,
    left_on='COMM_CODE',
    right_on='Code',
    how='left',
    validate='many_to_one',
    indicator=True,
)

In [19]:
# Rows counted as `left_only` would be shapefile commune codes absent from DB23
CF36_w_DB23._merge.value_counts()

both          72
left_only      0
right_only     0
Name: _merge, dtype: int64

In [20]:
# List the rows where the shapefile name and the DB23 name are not identical,
# to confirm by eye that they are only spelling variants
_name_differs = CF36_w_DB23['COMM_NAME'].ne(CF36_w_DB23['Name (Latin)'])
CF36_w_DB23.loc[_name_differs, ['COMM_NAME', 'Name (Latin)']]

Unnamed: 0,COMM_NAME,Name (Latin)
14,Srae Sangkom,Srae Sangkum
16,Bu Chri,Pu Chrey
18,Ou Buon Leu,A Buon Leu
19,Bu Chri,Pu Chrey
20,Srae Sangkom,Srae Sangkum
22,Nang Khi Loek,Nang Khi Lik
24,Srae Sangkom,Srae Sangkum
33,Pate,Pa Te


In [21]:
# Peek at the joined result: one row per (CF, intersecting commune) pair
CF36_w_DB23.head(2)

Unnamed: 0.1,Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,...,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note,Commune_DB23_cln,index_right,COMM_CODE,COMM_NAME,Code,Name (Latin),_merge
0,14,0,0,0,0,Phnom Phneas,PñMPñas,,,Steung,...,,,"not matched based on CommGis, not matched by h...",,398,20107,Snoeng,20107,Snoeng,both
1,14,0,0,0,0,Phnom Phneas,PñMPñas,,,Steung,...,,,"not matched based on CommGis, not matched by h...",,371,20701,Sdau,20701,Sdau,both


### Fit CFs that intersect with multiple communes into one row

In [22]:
# Collapse the per-commune rows into one row per CF, joining the DB23 codes
# and names into comma-separated strings
def _join_as_text(values):
    """Return the values of a group as one ', '-separated string."""
    return ', '.join(values.astype(str))


CF36ID_w_DB23CommLst = CF36_w_DB23.groupby(['UniqueID']).agg(
    {'Code': _join_as_text, 'Name (Latin)': _join_as_text}
)
CF36ID_w_DB23CommLst.head(2)

Unnamed: 0_level_0,Code,Name (Latin)
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1
262,"160607, 160602, 160104, 160801","L'ak, Pouy, Ta Lav, Ta Veaeng Leu"
269,"160605, 160604, 160602, 160904, 160906","Ou Chum, Kalai, Pouy, Ka Choun, Kaoh Peak"


In [23]:
# Attach the concatenated commune codes and names to the 36 CFs
# (one aggregated row per `UniqueID`, enforced by `validate`)
CF36_w_DB23CommLst = CF36_gdf.merge(
    CF36ID_w_DB23CommLst, on='UniqueID', how='inner', validate='1:1'
)

In [24]:
# Inspect the 36 CFs with their joined commune code/name lists (last two columns)
CF36_w_DB23CommLst

Unnamed: 0.1,Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,...,Commune_cln,Village,VillGis,CommGis,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note,Commune_DB23_cln,Code,Name (Latin)
0,14,0,0,0,0,Phnom Phneas,PñMPñas,,,Steung,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"20107, 20701","Snoeng, Sdau"
1,207,0,0,0,0,Phnom Reang,PñMraMg,,,,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,60701,Boeng Lvea
2,233,0,0,0,0,Phnom Konsat,PñMkUnstV,,,Konsat,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,70708,Koun Satv
3,234,0,0,0,0,Phnom Toteung,PñMTTWg,,,Steung Keo,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,70715,Stueng Kaev
4,269,0,0,0,0,Chang Krang,c®gÁg,,,Chang Krang,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"100602, 100607","Changkrang, Sambok"
5,312,0,0,0,0,Svay Chras 1,sVayRCH 1,,,Svay Chras,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,100505,Svay Chreah
6,313,0,0,0,0,Svay Chras 2,sVayRCH 2,,,Svay Chras,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"100104, 100505","Kampong Damrei, Svay Chreah"
7,316,0,0,0,0,O Da,GUrda,,,Chang Krang,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"100603, 100602","Dar, Changkrang"
8,332,0,0,0,0,CBPF,,,,,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"110104, 110105","Srae Khtum, Srae Preah"
9,333,0,0,0,0,Ou Nglav,guRGØav,,,,...,,,,-9999,,,"not matched based on CommGis, not matched by h...",,"110206, 110205","Srae Sangkum, Srae Huy"


### Put the 36 CFs back to the 603 already matched

In [25]:
# CFs that already carry a DB23 commune code, without the helper column
_already_matched = CF_gdf['Code_Comm_DB23'].notna()
CF603_gdf = CF_gdf[_already_matched].drop(columns='Commune_DB23_cln')

#### Align column names with the 603 CFs

In [26]:
# Trailing columns of the already-matched frame: the target layout
CF603_gdf.columns[-4:]

Index(['CommGis', 'Code_Comm_DB23', 'Commun_DB23', 'Commun_DB23_Note'], dtype='object')

In [27]:
# Trailing columns of the 36-CF frame, for comparison with the 603-CF frame
CF36_w_DB23CommLst.columns[-6:]

Index(['Code_Comm_DB23', 'Commun_DB23', 'Commun_DB23_Note', 'Commune_DB23_cln',
       'Code', 'Name (Latin)'],
      dtype='object')

In [28]:
# Copy the joined code / name lists into the columns shared with the 603 CFs
for target, source in (('Code_Comm_DB23', 'Code'), ('Commun_DB23', 'Name (Latin)')):
    CF36_w_DB23CommLst[target] = CF36_w_DB23CommLst[source]

note how the 36 are matched

In [29]:
# Record how these CFs obtained their DB23 commune(s)
CF36_w_DB23CommLst['Commun_DB23_Note'] = (
    'not matched based on CommGis, '
    'matched through codes of commune boundaries intersecting with the CF'
)

In [30]:
# Remove the helper columns so the layout matches the 603-CF frame
_helper_cols = ['Code', 'Name (Latin)', 'Commune_DB23_cln']
CF36_w_DB23CommLst = CF36_w_DB23CommLst.drop(columns=_helper_cols)

#### Append 36 & 603

In [31]:
# Append the 36 newly matched CFs below the 603 already matched ones and
# renumber the rows; the shape is shown as a row-count check.
all_CF_gdf = pd.concat([CF603_gdf, CF36_w_DB23CommLst], ignore_index=True)
all_CF_gdf.shape

(639, 54)

### Export

In [32]:
# Discard the stale index column inherited from the CSV, then display the result
all_CF_gdf = all_CF_gdf.drop(columns='Unnamed: 0')
all_CF_gdf

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,...,Commune_matched,District_matched,District_cln,Commune_cln,Village,VillGis,CommGis,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note
0,10301,103,10302,3,Kon Khleaeng,kUnExøg,,,Poy Char,e)a:ycar,...,Poy Char,,na,poy char,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...,10302,10302,Poy Char,matched based on CommGis
1,10405,104,10402,2,Phnom Chuncheang,PMñCBa¢aMg,,,Chub Veary,Cb;varI,...,Chob Veari,Preah Netr Preah,preah netr preah,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis
2,10403,104,10402,1,Trapeang Russei Lech,RtBaMgb¤sISlic,,,Chub Veary,Cb;varI,...,Chob Veari,Preah Netr Preah,preah netr preah,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis
3,10402,104,10402,1,Kiri Kamnobtrorb Sinjkeut Chrorbthmei,KIrIkMNb;RTBüsIujkWtRCab;fµI,,,Chub Veary,Cb;varI,...,Chob Veari,Preah Netr Preah,preah netr preah,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis
4,10404,104,10402,1,Trapeang Russei Kaeut,RtBaMgb¤sISekIt,,,Chub Veary,Cb;varI,...,Chob Veari,Preah Netr Preah,preah netr preah,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,0,0,0,0,Chros Phaaok,eRCaHep¥ak,,,,,...,,,,,,,-9999,"210411, 210405","Saom, Kiri Chong Kaoh","not matched based on CommGis, matched through ..."
635,0,0,0,0,Payong Kao,)ay:g;ekar,,,,,...,,,,,,,-9999,"210405, 210408, 210402","Kiri Chong Kaoh, Prey Ampok, Preah Bat Choan Chum","not matched based on CommGis, matched through ..."
636,0,0,0,0,Samraong,sMerag,,,,,...,,,,,,,-9999,"210903, 210910","Kus, Samraong","not matched based on CommGis, matched through ..."
637,0,0,0,0,Kous,KUs,,,,,...,,,,,,,-9999,"210913, 210903","Tram Kak, Kus","not matched based on CommGis, matched through ..."


In [33]:
# all_CF_gdf.to_csv(
#     intmd_outfd_path / 'CF_w_match_helper_vill_DB23_comm_autoJoined_matchedAll.csv',
#     index=False
# )

## ChatGPT [aborted]

#### Step 1: Attempting to match using Code_Srok and Code_Khum (of CF) against the commune codes in the reference dataset (DB23)


In [None]:
# Keep only the DB23 rows whose `Type` equals 'ឃុំ'
# NOTE(review): presumably this is the commune (khum) type label, so rows with
# any other type label are excluded from the reference data — confirm this is
# intended.
db_comm_df = db_df[db_df.Type == 'ឃុំ']

In [None]:
# Run the code-based matcher on every CF row, then split the returned pairs
# into a name column and a code column
_matched_pairs = CF_vlst_df.apply(
    lambda row: match_commune_by_codes(row, ref_data=db_comm_df), axis=1
)
_matched_names, _matched_codes = zip(*_matched_pairs)
CF_vlst_df['Matched_Commune_Name'] = _matched_names
CF_vlst_df['Matched_Commune_Code'] = _matched_codes

In [None]:
# Display the results of the matching: CF codes and commune name next to the
# matched commune code and name, in an interactive table
show(CF_vlst_df[
    ['Code_Srok', 'Code_Khum', 'Matched_Commune_Code', 'Commune', 'Matched_Commune_Name', 'UniqueID']
])

CF UniqueID 170 matched with uncertainty, CF 31, 41, 14, 359, 295, 300, 301 partially matched.