This notebook enriches each CF with the list of DB23 villages in the CF's commune, and identifies the village in that list that best matches the CF name (and the one that best matches the CF village, if the village is known).

The notebook exports a CSV containing the existing information in `CF_df`, the added lists of DB23 village names and codes (`Villages_DB23`, `Village_Codes_DB23`), the CF village names known for the focal provinces from CBNRM (`Village_na_CBNRM`), the CDB villages matched to those CF village names (`Village_na_CDB*`), and the best matches identified (`Village_na_CBNRM_fuzz_DB23` and `CF_Name_En_fuzz_DB23`).
Note that several columns are renamed for clarity.

DB23: `ncdd_admin_database_25provinces__2023.xlsx` from Lok on 20231212

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itables import show

In [2]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from utils.fuzzy_match import *

In [4]:
# Project locations, all derived from the current working directory.
# The project root is taken to be two levels above the directory the
# notebook is run from.
code_path = Path().absolute()
project_root = code_path.parent.parent

# Input data folder
datafd_path = project_root / 'data'

# Intermediate outputs of the CF <-> DB23 matching step
intmd_outfd_path = project_root / 'output_intmd' / 'match_CF_DB23'

## Read in data

### CF data

with all ~~but 36 CF~~ communes matched to DB23 communes (see 'Code_Comm_DB23', 'Commun_DB23', 'Commun_DB23_Note')

In [5]:
# CF table with communes already matched to DB23 communes.
# Superseded input, kept for reference:
#   'CF_w_match_helper_vill_DB23_comm_autoJoined_matched.csv'
cf_matched_csv = intmd_outfd_path / 'CF_w_match_helper_vill_DB23_comm_autoJoined_matchedAll.csv'
CF_df = pd.read_csv(cf_matched_csv)

### CF data in focal provinces

with some village names and a subset of them matched to CDB data

In [6]:
# CF table for the focal provinces, produced by the CF <-> CDB matching step
cf_4prov_csv = (
    code_path.parent.parent
    / 'output_intmd'
    / 'match_CF_CDB'
    / 'CF230306_w_CDB_village.csv'
)
CF_4prov_df = pd.read_csv(cf_4prov_csv)

### DB23

cleaned

In [7]:
# Cleaned DB23 administrative database
db23_csv = intmd_outfd_path / 'ncdd_admin_database_25provinces__2023_wide_en.csv'
db_df = pd.read_csv(db23_csv)

## Enrich CF data with a list of DB23 villages in the CF commune

### Preprocess

In [8]:
# Flag the CFs whose commune-code cell holds several comma-separated codes.
# Missing codes count as "not multiple" (na=False).
multiple_communes = CF_df['Code_Comm_DB23'].str.contains(',', na=False)

# Show the affected codes
CF_df.loc[multiple_communes, 'Code_Comm_DB23']

13                               20105, 20107
32                               20904, 20905
33                               20904, 20906
344                        240203, 240204\r\n
345                        240203, 240204\r\n
346                        240203, 240204\r\n
594                            200203, 200207
596                            210904, 210911
603                              20107, 20701
607                            100602, 100607
609                            100104, 100505
610                            100603, 100602
611                            110104, 110105
612                            110206, 110205
617                    110206, 110205, 110201
618                            110204, 110206
620                            110503, 110403
621                            150405, 150406
624                            160701, 160704
625            160607, 160602, 160104, 160801
626    160605, 160604, 160602, 160904, 160906
627            210411, 210405, 210

In [9]:
# Create a separate row for each commune code of the CFs that list several codes.

# Work on a copy of just those CFs so that CF_df itself is left untouched
to_split_df = CF_df.loc[multiple_communes].copy()

# Split on the comma alone and trim every piece afterwards. This makes the
# result independent of the spacing around the separator ('a, b', 'a,b',
# 'a ,b') and removes stray line endings such as the trailing '\r\n' present
# in some cells, instead of leaving them inside the code.
to_split_df['Code_Comm_DB23'] = to_split_df['Code_Comm_DB23'].str.split(',')
splitted_df = to_split_df.explode('Code_Comm_DB23')
splitted_df['Code_Comm_DB23'] = splitted_df['Code_Comm_DB23'].str.strip()

# Drop empty pieces left behind by a trailing or doubled comma
splitted_df = splitted_df[splitted_df['Code_Comm_DB23'] != '']
splitted_df = splitted_df.reset_index(drop=True)

# CFs that do not list several codes are kept as they are
clean_CF_df = CF_df[~multiple_communes]

# Stack the untouched CFs and the exploded rows into one long table
# (one row per CF x commune code)
long_CF_df = pd.concat([clean_CF_df, splitted_df], ignore_index=True)
long_CF_df.shape

(683, 53)

In [10]:
# Bring the commune codes on both sides to plain strings so that they can be
# used as the join key: the CF codes go through nullable integers first, the
# DB23 codes are cast directly.
long_CF_df['Code_Comm_DB23'] = (
    long_CF_df['Code_Comm_DB23']
    .astype('Int64')
    .astype(str)
)
db_df['Commune Code'] = db_df['Commune Code'].astype(str)

### Join DB23 villages to CF based on commune code

In [11]:
# Attach every DB23 row sharing the CF's commune code.
# Left join: all CF rows are kept; the `_merge` indicator column records
# whether a DB23 row was found. Overlapping DB23 columns get a '_DB23' suffix.
CF_villInComm_df = pd.merge(
    long_CF_df,
    db_df,
    left_on='Code_Comm_DB23',
    right_on='Commune Code',
    how='left',
    indicator=True,
    suffixes=(None, '_DB23'),
)

In [12]:
# Join diagnostics: number of rows per merge outcome
CF_villInComm_df['_merge'].value_counts()

both          6040
left_only        0
right_only       0
Name: _merge, dtype: int64

### Aggregate villages to lists: one per CF

In [13]:
# Store village codes as nullable integers
village_codes = CF_villInComm_df['Village Code']
CF_villInComm_df['Village Code'] = village_codes.astype('Int64')

In [14]:
# Collapse the joined villages to one row per CF
def _join_with_newlines(values):
    """Return the values of a Series as text, one value per line."""
    return '\n'.join(values.astype(str))


# Village names and village codes are each concatenated with '\n' as delimiter
CFID_villInCommLst_df = CF_villInComm_df.groupby(['UniqueID']).agg(
    {
        'Village Name': _join_with_newlines,
        'Village Code': _join_with_newlines,
    }
)
CFID_villInCommLst_df.head(n=2)

Unnamed: 0_level_0,Village Name,Village Code
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Paoy Kdoeang\nCheung Voat\nKandal\nPost Chas\n...,1040501\n1040502\n1040503\n1040504\n1040505\n1...
2,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...


In [15]:
# Add the per-CF village name / code lists to the original CF table.
# validate='1:1' makes pandas raise if UniqueID is duplicated on either side.
CF_villInCommLst_df = pd.merge(
    CF_df,
    CFID_villInCommLst_df,
    on='UniqueID',
    how='inner',
    validate='1:1',
)

In [16]:
# Preview the first two enriched CFs
CF_villInCommLst_df.head(n=2)

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,...,District_cln,Commune_cln,Village,VillGis,CommGis,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note,Village Name,Village Code
0,10301,103,10302,3,Kon Khleaeng,kUnExøg,,,Poy Char,e)a:ycar,...,na,poy char,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...,10302,10302,Poy Char,matched based on CommGis,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...
1,10405,104,10402,2,Phnom Chuncheang,PMñCBa¢aMg,,,Chub Veary,Cb;varI,...,preah netr preah,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...


## Identify the DB23 village that best matches the CF name

(and the DB23 village that best matches the CF village, if the village is known)

In [17]:
# Enrich the CF data with known CF villages
CF_vills_df = CF_villInCommLst_df.merge(
    CF_4prov_df[['UniqueID', 'Village_na', 'Village_na_CDB', 'Village_na_CDB_note']],
    how='left',
    on='UniqueID',
    validate='1:1'
)

In [18]:
# Rename columns
CF_vills_df.rename(columns={
    'Village': 'Villages_CDB',
    'VillGis': 'Village_Codes_CDB',
    'Village Name': 'Villages_DB23', 
    'Village Code': 'Village_Codes_DB23',
    'Village_na': 'Village_na_CBNRM'
}, inplace=True)

In [19]:
# Preview the first two CFs after the rename
CF_vills_df.head(n=2)

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,...,Village_Codes_CDB,CommGis,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note,Villages_DB23,Village_Codes_DB23,Village_na_CBNRM,Village_na_CDB,Village_na_CDB_note
0,10301,103,10302,3,Kon Khleaeng,kUnExøg,,,Poy Char,e)a:ycar,...,1030201\n1030202\n1030203\n1030204\n1030205\n1...,10302,10302,Poy Char,matched based on CommGis,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...,,,
1,10405,104,10402,2,Phnom Chuncheang,PMñCBa¢aMg,,,Chub Veary,Cb;varI,...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,,,


### Find best match with CF village from DB23 village list


In [20]:
# Row-wise fuzzy match of the known CF village name(s) against the DB23
# village list of the CF's commune.
# NOTE(review): append_fuzz_match_result and list_Village_na_CBNRM come from
# utils.fuzzy_match; presumably the helper returns the row with `result_col`
# added and `list_string` turns the cell into a list of candidate strings --
# confirm against that module.
village_match_kwargs = dict(
    result_col='Village_na_CBNRM_fuzz_DB23',
    string='Village_na_CBNRM',
    pool='Villages_DB23',
    list_string=list_Village_na_CBNRM,
)
CF_vills_df = CF_vills_df.apply(append_fuzz_match_result, axis=1, **village_match_kwargs)

### Find best match with CF name from DB23 village list

In [21]:
# Row-wise fuzzy match of the CF name against the DB23 village list of the
# CF's commune.
# NOTE(review): append_fuzz_match_result comes from utils.fuzzy_match;
# presumably it returns the row with `result_col` added -- confirm there.
def _as_single_item_list(value):
    """Wrap a value in a one-element list."""
    return [value]


name_match_kwargs = dict(
    result_col='CF_Name_En_fuzz_DB23',
    string='CF_Name_En',
    pool='Villages_DB23',
    list_string=_as_single_item_list,
)
CF_vills_df = CF_vills_df.apply(append_fuzz_match_result, axis=1, **name_match_kwargs)

### Consolidate both matches

In [22]:
# Build the column that will later be updated with visually checked village
# names. The match on the CF village takes precedence; the match on the CF
# name is used only where the village match is missing.
village_based_match = CF_vills_df['Village_na_CBNRM_fuzz_DB23']
name_based_match = CF_vills_df['CF_Name_En_fuzz_DB23']
CF_vills_df['Village_DB23 (to update)'] = village_based_match.combine_first(name_based_match)

### Export results

In [23]:
# The export is currently disabled: the line below is commented out, so running
# this cell writes nothing. Uncomment it to save CF_vills_df (without the index)
# as a CSV in intmd_outfd_path.
# CF_vills_df.to_csv(intmd_outfd_path / 'CF_w_match_helper_vill_DB23_comm_autoJoined_matchedAll_fuzz.csv', index=False)