EDA of DB23 (`ncdd_admin_database_25provinces__2023.xlsx` from Lok on 20231212)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itables import show

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from utils.explore_DB23 import *

In [4]:
# Folder locations, all derived from the current working directory.
code_path = Path.cwd()
# Input data folder, two levels above the working directory.
datafd_path = code_path.parent.parent.joinpath('data')
# Folder holding intermediate outputs of the CF-to-CDB matching step.
intmd_outfd_path = code_path.parent.parent.joinpath('output_intmd', 'match_CF_CDB')

## Read in data

### CDB socio-economic data, Reem updated 20230207

In [None]:
# The CDB workbook contains these sheets:
# ['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)']
CDB_path = datafd_path.joinpath('other', 'CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx')
# Only the two 'C_' sheets are loaded; the result is a dict keyed by sheet name.
# (Passing sheet_name=None instead would load every sheet.)
CDB_df_dic = pd.read_excel(
    CDB_path,
    sheet_name=['C_2016_E(1)', 'C_2016_E(2)'],
)
sht_name_lst = [*CDB_df_dic]

#### commune-level CDB data

In [None]:
# Names of the loaded sheets that start with 'C'.
comm_sht_name_lst = list(filter(lambda name: name.startswith('C'), sht_name_lst))
# Fix headers: for each of those sheets, use the row at position 3 as the
# column names and drop the first four rows.
CDB_c_df_lst = [
    CDB_df_dic[name].rename(columns=CDB_df_dic[name].iloc[3]).tail(-4)
    for name in comm_sht_name_lst
]

### CF data with match helper 
CF commune and district names matched to the CDB ones

a list of CDB villages in the commune each CF belongs to

and more

In [5]:
# Load the CF table with match-helper columns from the intermediate output folder.
CF_vlst_df = pd.read_csv(intmd_outfd_path.joinpath('CF_w_match_helper_vill.csv'))

## Initial processing of DB23

### Read in DB23 and stack all sheets vertically into `db_df`

In [6]:
# Read every sheet of the DB23 workbook into a dict keyed by sheet name.
# header=2 uses the third row of each sheet as the column header, so the two
# rows above it are skipped.
DB23_path = datafd_path.joinpath('other', 'ncdd_admin_database_25provinces__2023.xlsx')
DB23_df_dic = pd.read_excel(DB23_path, header=2, sheet_name=None)

In [7]:
# Sheet names and their DataFrames, both in workbook order.
# NOTE: `sht_name_lst` is rebound here to the DB23 sheet names.
sht_name_lst = [*DB23_df_dic]
db_prov_df_lst = list(DB23_df_dic.values())

In [None]:
# Disabled sanity check, kept for reference: asserts that every sheet has the
# same columns as the first one, prints each sheet's shape, and tallies the
# total number of rows across all sheets.
# prov1_df = db_prov_df_lst[0]
# prov1_columns = prov1_df.columns
# nrow = prov1_df.shape[0]
# for df in db_prov_df_lst[1:]:
#     assert (df.columns == prov1_columns).all()
#     print(df.shape)
#     nrow = nrow + df.shape[0]
# nrow

In [8]:
# Stack all sheets row-wise. Each sheet keeps its own row labels, so index
# values repeat across sheets in the combined frame.
db_df = pd.concat(db_prov_df_lst, axis=0)

## CDB communes matched to CF communes: are they in DB23? 

### Explore data

#### CF data with match helper

In [9]:
# Show the CF rows whose VillGis value (as a string) fails the
# `commun_code_is_unique` check.
# 2 CFs have multiple commune codes in the list of village codes (VillGis)
CF_vlst_df[
    ~CF_vlst_df['VillGis'].astype(str).apply(commun_code_is_unique)
]

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,...,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched,District_cln,Commune_cln,Village,VillGis
270,0,0,0,0,Romeas Pun Mchul,rmasb:unmúl,,,Kan Tout,knÞÜt,...,0,False,False,False,Kantuot,Chetr Borei,chetr borei,kantuot,A Loch\r\nAntong Vien\r\nChrava\r\nKantuot\r\n...,10060705\r\n10060706\r\n10060707\r\n10060708\r...
353,220412,2204,220404,16,Ratanak Rukha,rtn³rukça,,,Samrorng,sMerag,...,0,False,False,False,Samraong,Samraong,samraong,samraong,Kansaom Ak\nDei Kraham\nPrey Totueng\nSamraong...,21070801\n21070802\n21070803\n21070804\n210708...


### Preprocess data

#### commune-level CDB data

##### first sheet

In [None]:
# First commune-level sheet. This is the same object as the list entry, not a
# copy, so changes made through `CDB_c_df0` also show up in `CDB_c_df_lst[0]`.
CDB_c_df0 = CDB_c_df_lst[0]

In [None]:
# Cast the commune code column (CommGis) to int, overwriting it in place.
CDB_c_df0['CommGis'] = CDB_c_df0['CommGis'].astype(int)

#### CF data with match helper

In [10]:
# Derive a commune code for each CF by applying `extract_first_commun_code`
# to the stringified VillGis value. Per the original note, the result is an
# int, with -9999 when no code can be extracted.
CF_vlst_df['CommGis'] = (
    CF_vlst_df['VillGis']
    .astype(str)
    .apply(extract_first_commun_code)
)

In [11]:
# Sorted distinct `Commune_cln` values among CFs whose CommGis is not the
# -9999 sentinel.
# All CFs with valid commune codes have commune names in `Commune_cln` column
CF_vlst_df[CF_vlst_df['CommGis'].ne(-9999)]['Commune_cln'].sort_values().unique()

array(['2 thnu', 'aekakpheap', 'amleang', 'ampil pram daeum',
       'anhchanh rung', 'anlong chrey', 'anlong phe', 'anlong tnaot',
       'anlong veaeng', 'anlong vil', 'ba tang', 'bak sna', 'bakong',
       'ballangk', 'bansay reak', 'banteay preal', 'bar kham',
       'bar yakha', 'bati', 'beng', 'boeng char', 'boeng kantuot',
       'boeng mealea', 'bu sra', 'cha ung', 'chambak', 'chamkar leu',
       'chamraeun phal', 'chan sa', 'cheung kreav', 'chey', 'chhaeb muoy',
       'chhaeb pir', 'chhean mukh', 'chheu teal', 'chheu tom', 'chhuk',
       'choam sangkae', 'chob veari', 'chranouk', 'chres', 'chroab',
       'chrouy neang nguon', 'chumnoab', 'daeum doung', 'dak dam',
       'dam daek', 'dambouk khpos', 'damrei phong', 'damrei slab',
       'dang kambet', 'dang peaeng', 'dar', 'doun kaev', 'haong samnam',
       'ka choun', 'ka laeng', 'kak', 'kakaoh', 'kalai', 'kamphun',
       'kampong cham', 'kampong damrei', 'kampong kor', 'kampong pou',
       'kampong seila', 'kampong sra

#### DB23

In [12]:
# Preview the stacked DB23 table (the notebook display truncates long frames).
db_df  # from section "Initial processing of DB23"

Unnamed: 0,Type,Code,Name (Khmer),Name (Latin),Reference,Official Note,Note (by Checker)
0,ស្រុក,102,មង្គលបូរី,Mongkol Borei,ប្រកាសលេខ ៤៩៣ប្រ.ក,,
1,ឃុំ,10201,បន្ទាយនាង,Banteay Neang,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,
2,ភូមិ,1020101,អូរធំ,Ou Thum,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,
3,ភូមិ,1020102,ភ្នំ,Phnum,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,
4,ភូមិ,1020103,បន្ទាយនាង,Banteay Neang,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,
...,...,...,...,...,...,...,...
941,ភូមិ,25071411,ទន្លេបិទក្រោម,Tonle Bet Kraom,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,
942,ភូមិ,25071412,ទន្លេបិទលើ,Tonle Bet Leu,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,
943,ភូមិ,25071413,យាយស,Yeay Sar,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,
944,ភូមិ,25071414,ទន្លេបិទ,Tonle Bet,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,Royal Degree 1445 on 31 Dec 2013 Move to Tboun...,


### Join DB23 to CF based on code

In [13]:
# Left-join the DB23 code and Latin name onto the CF table by commune code.
CF_w_DB23 = pd.merge(
    CF_vlst_df,
    db_df.loc[:, ['Code', 'Name (Latin)']],
    left_on='CommGis',
    right_on='Code',
    how='left',
    validate='many_to_one',  # raises if a Code occurs more than once in DB23
    indicator=True,  # adds the `_merge` column recording where each row matched
)

In [14]:
# Tally of join outcomes: 'both' = matched in DB23, 'left_only' = no match.
CF_w_DB23['_merge'].value_counts()

both          565
left_only      74
right_only      0
Name: _merge, dtype: int64

### Postprocess joined df

In [15]:
# Normalised DB23 name: surrounding whitespace stripped, then lower-cased, so
# it can be compared with `Commune_cln`.
CF_w_DB23['Name_cln'] = (
    CF_w_DB23['Name (Latin)']
    .str.strip()
    .str.lower()
)

### Successful joins: Are CDB communes matched to CF also found in DB23? Yes!

In [16]:
# Keep only the CF rows that found a DB23 record.
CF_w_joined_DB23 = CF_w_DB23.loc[CF_w_DB23['_merge'].eq('both')]

In [17]:
# (rows, columns) of the successfully joined subset.
CF_w_joined_DB23.shape

(565, 54)

In [18]:
# Side-by-side view of the cleaned CF commune name and the cleaned DB23 name.
compare_df = CF_w_joined_DB23.loc[:, ['Commune_cln', 'Name_cln']]

In [19]:
# Share of joined rows where the two cleaned names are exactly equal.
# almost all found
compare_df['Commune_cln'].eq(compare_df['Name_cln']).mean()

0.9805309734513274

In [20]:
# Joined rows where the two cleaned names differ.
# ALL found, accounting for spelling variations (at least for joined records)
compare_df.loc[compare_df['Commune_cln'].ne(compare_df['Name_cln'])]

Unnamed: 0,Commune_cln,Name_cln
1,chob veari,chob vari
2,chob veari,chob vari
3,chob veari,chob vari
4,chob veari,chob vari
5,preah netr preah,preak netr preah
275,ta mau,ta mao
276,chambak,chambâk
277,chambak,chambâk
278,ruessei kaev,russey keo
305,2 thnu,pir thnu


### Failed joins

In [22]:
# CF rows for which the join did not find a DB23 record.
CF_wo_DB23 = CF_w_DB23.loc[CF_w_DB23['_merge'].ne('both')]

In [24]:
# Render the unmatched CFs as an interactive itables table.
# NOTE(review): column_filters='footer' presumably adds per-column filter
# boxes in the table footer; confirm against the itables documentation.
show(CF_wo_DB23, column_filters='footer')

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched,District_cln,Commune_cln,Village,VillGis,CommGis,Code,Name (Latin),_merge,Name_cln
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:
# Same interactive table, limited to the district and commune columns.
show(
    CF_wo_DB23.loc[
        :,
        ['District', 'Distr_Note', 'District_cln', 'Commune', 'Commu_Note', 'Commune_cln', 'CommGis'],
    ],
    column_filters='footer',
)

Unnamed: 0,District,Distr_Note,District_cln,Commune,Commu_Note,Commune_cln,CommGis
Loading... (need help?),,,,,,,


In [27]:
# Number of unmatched CFs per province.
CF_wo_DB23['Province'].value_counts()

Kratie           24
Takeo            13
Mundulkiri       10
Stung Treng       6
Pursat            5
Battambong        4
Kampot            3
Pailin            3
Ratanaki Kiri     3
Kampong Thom      1
Koh Kong          1
Svay Rieng        1
Name: Province, dtype: int64

In [28]:
# Number of unmatched CFs with a non-null Province.
CF_wo_DB23['Province'].count()

74

In [30]:
# Number of unmatched CFs with a non-null District.
CF_wo_DB23['District'].count()

44

In [32]:
# Breakdown of unmatched CFs by district note.
CF_wo_DB23['Distr_Note'].value_counts()

updated to match with CDB                                                               29
missing                                                                                 26
matches with CDB                                                                        10
NA because commune name uniquely identifies the commune in both this dataset and CDB     4
updated to match with CDB, inferred                                                      4
no match in CDB                                                                          1
Name: Distr_Note, dtype: int64

In [35]:
# Number of unmatched CF records (rows, not distinct districts) whose cleaned
# district name is present and is not the 'na' label.
# errors='ignore' keeps this cell from raising KeyError when no row carries
# the 'na' label (e.g. after the input data is updated).
# Expected: 43 = 29 + 10 + 4, i.e. the Distr_Note categories "updated to match
# with CDB", "matches with CDB" and "updated to match with CDB, inferred".
CF_wo_DB23['District_cln'].value_counts().drop('na', errors='ignore').sum()

43

In [37]:
# Number of unmatched CFs with a non-null Commune.
CF_wo_DB23['Commune'].count()

47

In [39]:
# Number of unmatched CF rows with a non-null Commune_cln,
# i.e. 36 records whose commune was matched to a CDB commune.
CF_wo_DB23['Commune_cln'].count()

36