This notebook enriches each CF with the list of DB23 villages located in the CF's commune, and identifies the village in that list that best matches the CF name (or the CF village, if the village is known).

DB23: `ncdd_admin_database_25provinces__2023.xlsx` from Lok on 20231212

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itables import show
from fuzzywuzzy import process

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# NOTE(review): commented-out wildcard import left over from development; none of
# the cells visible here use anything from `utils.explore_DB23` — confirm against
# the rest of the notebook, then delete this cell or replace it with explicit imports.
# from utils.explore_DB23 import *

In [4]:
# Folder layout, resolved from the directory the notebook is run from:
#   <project root>/data                          -> datafd_path
#   <project root>/output_intmd/match_CF_DB23    -> intmd_outfd_path
# where <project root> is two levels above the working directory.
code_path = Path().absolute()
project_root = code_path.parent.parent
datafd_path = project_root / 'data'
intmd_outfd_path = project_root / 'output_intmd' / 'match_CF_DB23'

## Read in data

### CF data

CF data in which all but 36 CF communes have already been matched to DB23 communes (see the columns 'Code_Comm_DB23', 'Commun_DB23' and 'Commun_DB23_Note').

In [5]:
# Load the CF table from the intermediate output folder
CF_csv_path = intmd_outfd_path / 'CF_w_match_helper_vill_DB23_comm_autoJoined_matched.csv'
CF_df = pd.read_csv(CF_csv_path)

### DB23

cleaned

In [6]:
# Load the DB23 table from the intermediate output folder
db_csv_path = intmd_outfd_path / 'ncdd_admin_database_25provinces__2023_wide_en.csv'
db_df = pd.read_csv(db_csv_path)

## Enrich CF data with a list of DB23 villages in the CF commune

### Preprocess

In [7]:
# A few CFs list several commune codes in one comma-separated cell.
# Boolean mask of those rows (missing codes count as "not multiple").
multiple_communes = CF_df['Code_Comm_DB23'].str.contains(',', regex=False, na=False)
CF_df.loc[multiple_communes, 'Code_Comm_DB23']

13           20105, 20107
33           20904, 20905
34           20904, 20906
362    240203, 240204\r\n
363    240203, 240204\r\n
364    240203, 240204\r\n
618        200203, 200207
620        210904, 210911
Name: Code_Comm_DB23, dtype: object

In [8]:
# Create a separate row for each of those commune codes, so that every row of
# `long_CF_df` carries exactly one commune code (a CF spanning several communes
# therefore appears once per commune, sharing the same remaining columns).

# df with only the CFs that list several commune codes
to_split_df = CF_df.loc[multiple_communes].copy()

# Split on the bare comma and strip every piece afterwards. Splitting on the
# literal ', ' would leave artifacts such as the trailing '\r\n' seen in the raw
# values inside the code, and would miss codes separated by ',' without a space.
to_split_df['Code_Comm_DB23'] = to_split_df['Code_Comm_DB23'].str.split(',')
splitted_df = to_split_df.explode('Code_Comm_DB23').reset_index(drop=True)
splitted_df['Code_Comm_DB23'] = splitted_df['Code_Comm_DB23'].str.strip()

# The CFs with at most one commune code are kept unchanged
clean_CF_df = CF_df[~multiple_communes]

# Stack the untouched rows and the exploded rows into one long table
long_CF_df = pd.concat([clean_CF_df, splitted_df], ignore_index=True)
long_CF_df.shape

(647, 55)

In [9]:
# Bring both join keys to the same representation: plain digit strings.
# CF side: going through nullable Int64 first normalises the text codes to
# integers; the final cast to str renders missing codes as the text '<NA>',
# which has no counterpart among the DB23 commune codes.
long_CF_df['Code_Comm_DB23'] = (
    long_CF_df['Code_Comm_DB23']
    .astype('Int64')
    .astype(str)
)
# DB23 side: cast the commune code to str as well
db_df['Commune Code'] = db_df['Commune Code'].astype(str)

### Join DB23 villages to CF based on commune code

In [10]:
# Left join: every CF row is kept and repeated once per DB23 row that shares
# its commune code. `_merge` records whether a CF row found a match, and DB23
# columns whose names clash with CF columns receive the '_DB23' suffix.
CF_villInComm_df = pd.merge(
    long_CF_df,
    db_df,
    left_on='Code_Comm_DB23',
    right_on='Commune Code',
    how='left',
    suffixes=(None, '_DB23'),
    indicator=True,
)

In [11]:
# Join diagnostics: 'left_only' counts CF rows whose commune code found no DB23 row
CF_villInComm_df['_merge'].value_counts()

both          5422
left_only       36
right_only       0
Name: _merge, dtype: int64

### Aggregate villages to lists: one per CF

In [12]:
# Store village codes as nullable integers, so that they later print as whole
# numbers while unmatched rows keep a missing value
CF_villInComm_df = CF_villInComm_df.astype({'Village Code': 'Int64'})

In [13]:
# Aggregate villages per CF
def _join_present(values):
    """Concatenate the non-missing entries of a column into one string.

    Parameters
    ----------
    values : pd.Series
        The values of one column for a single CF (one entry per joined DB23 row).

    Returns
    -------
    str
        The entries cast to str and joined with '\\n', in their original order.
        Missing entries are skipped, so a CF without any matched DB23 village
        yields '' instead of a fake 'nan' / '<NA>' village entry.
    """
    return '\n'.join(values.dropna().astype(str))


# One row per CF (index: UniqueID). Rows that share a UniqueID — including the
# rows created for CFs with several commune codes — are pooled, so such a CF
# receives the villages of all its communes.
CFID_villInCommLst_df = CF_villInComm_df.groupby(['UniqueID']).agg(
    {col: _join_present for col in ['Village Name', 'Village Code']}
)
CFID_villInCommLst_df.head(2)

Unnamed: 0_level_0,Village Name,Village Code
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Paoy Kdoeang\nCheung Voat\nKandal\nPost Chas\n...,1040501\n1040502\n1040503\n1040504\n1040505\n1...
2,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...


In [14]:
# Attach the per-CF village name/code strings to the original (un-exploded) CF
# table. `validate='1:1'` makes pandas raise if UniqueID is duplicated on either
# side; the inner join keeps only the CFs present in both tables.
CF_villInCommLst_df = pd.merge(
    CF_df,
    CFID_villInCommLst_df,
    on='UniqueID',
    how='inner',
    validate='1:1',
)

In [15]:
# Inspect the enriched CF table; 'Village Name' and 'Village Code' are the
# newline-delimited columns added by the aggregation step
CF_villInCommLst_df

Unnamed: 0.1,Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,...,Commune_cln,Village,VillGis,CommGis,Code_Comm_DB23,Commun_DB23,Commun_DB23_Note,Commune_DB23_cln,Village Name,Village Code
0,0,10301,103,10302,3,Kon Khleaeng,kUnExøg,,,Poy Char,...,poy char,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...,10302,10302,Poy Char,matched based on CommGis,poy char,Paoy Snuol\nPaoy Char\nTrapeang Thma Tboung\nT...,1030201\n1030202\n1030203\n1030204\n1030205\n1...
1,1,10405,104,10402,2,Phnom Chuncheang,PMñCBa¢aMg,,,Chub Veary,...,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,chob vari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...
2,2,10403,104,10402,1,Trapeang Russei Lech,RtBaMgb¤sISlic,,,Chub Veary,...,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,chob vari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...
3,3,10402,104,10402,1,Kiri Kamnobtrorb Sinjkeut Chrorbthmei,KIrIkMNb;RTBüsIujkWtRCab;fµI,,,Chub Veary,...,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,chob vari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...
4,4,10404,104,10402,1,Trapeang Russei Kaeut,RtBaMgb¤sISekIt,,,Chub Veary,...,chob veari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...,10402,10402,Chob Vari,matched based on CommGis,chob vari,Chob\nRoul Chruk\nPrasat\nKrasang Thmei\nPrada...,1040201\n1040202\n1040203\n1040204\n1040205\n1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,634,30405,304,30407,1,Banghoeu Khleaeng,begIExøg,,,Trapeang Pring,...,trapeang pring,Banghaeur Khlaeng\nSrae Kak\nChambak\nPralaoh\...,25010601\n25010602\n25010603\n25010604\n250106...,250106,250106,Trapeang Pring,matched based on CommGis,trapeang pring,Banghaeu Khlaeng\nSrae Kak\nChambak\nPralaoh\n...,25010601\n25010602\n25010603\n25010604\n250106...
635,635,30403,304,30407,1,Chambak,ERskk;,,,Trapeang Pring,...,trapeang pring,Banghaeur Khlaeng\nSrae Kak\nChambak\nPralaoh\...,25010601\n25010602\n25010603\n25010604\n250106...,250106,250106,Trapeang Pring,matched based on CommGis,trapeang pring,Banghaeu Khlaeng\nSrae Kak\nChambak\nPralaoh\n...,25010601\n25010602\n25010603\n25010604\n250106...
636,636,30404,304,30407,1,Srae Kak,ERskk;,,,Trapeang Pring,...,trapeang pring,Banghaeur Khlaeng\nSrae Kak\nChambak\nPralaoh\...,25010601\n25010602\n25010603\n25010604\n250106...,250106,250106,Trapeang Pring,matched based on CommGis,trapeang pring,Banghaeu Khlaeng\nSrae Kak\nChambak\nPralaoh\n...,25010601\n25010602\n25010603\n25010604\n250106...
637,637,30402,304,30407,1,Trapeang Pring,RtBaMgRBIg,,,Trapeang Pring,...,trapeang pring,Banghaeur Khlaeng\nSrae Kak\nChambak\nPralaoh\...,25010601\n25010602\n25010603\n25010604\n250106...,250106,250106,Trapeang Pring,matched based on CommGis,trapeang pring,Banghaeu Khlaeng\nSrae Kak\nChambak\nPralaoh\n...,25010601\n25010602\n25010603\n25010604\n250106...
