Goal: compare the village information in CF boundary data `All_CF_Cambodia_July_2016_DISES_v1.shp` and `CDB Nat Data 2016 En_Received Dec2017.xlsx`, and try to match / cross-reference the two.

In [1]:
# %% import modules

import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path

In [49]:
# Interactive-table support for DataFrame display (itables).
# NOTE(review): this import lives outside the main import cell and the cell's
# execution count is out of order relative to its neighbours; consider moving
# it into the first cell so a fresh Restart & Run All reproduces the outputs.
from itables import init_notebook_mode

# NOTE(review): with all_interactive=False, DataFrames are presumably not
# rendered as interactive tables automatically - confirm against itables docs.
init_notebook_mode(all_interactive=False)

<IPython.core.display.Javascript object>

In [3]:
# %% read in data

# Current working directory of the notebook; the data folder sits two
# directory levels above it.
code_path = Path.cwd()
datafd_path = code_path.parent.parent.joinpath('data')
# result_path = code_path / 'results'

In [4]:
# Community-forest (CF) boundary shapefile, as updated in the 20220526 email
CF_path = datafd_path.joinpath(
    'CF',
    'Cambodia',
    'All_CF_Cambodia_July_2016_DISES_v1',
    'All_CF_Cambodia_July_2016_DISES_v1.shp',
)
CF_gdf = gpd.read_file(CF_path)  # EPSG:3148

In [5]:
# CDB socio-economic workbook, shared by Lok on 20221228
CDB_path = datafd_path.joinpath('other', 'CDB Nat Data 2016 En_Received Dec2017.xlsx')
# sheet_name=None loads every sheet into a dict keyed by sheet name
CDB_df_dic = pd.read_excel(CDB_path, sheet_name=None)
sht_name_lst = [*CDB_df_dic]
# sheet names: ['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)']

#### explore CF boundary data

In [5]:
# Display the whole CF GeoDataFrame (no .head() limit is applied here).
CF_gdf

CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Commune,Khum,District,Srok,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# List the attribute columns available in the CF boundary data.
CF_gdf.columns

Index(['CF_Code', 'Code_Srok', 'Code_Khum', 'No_Village', 'CF_Name_En',
       'CF_Name_Kh', 'Commune', 'Khum', 'District', 'Srok', 'Province',
       'Division', 'Date_Praka', 'No_Prakas', 'Date_Agree', 'Remarks',
       'Code_CF', 'HECTARES', 'Yea_Agreem', 'Agreement', 'NCF_statis',
       'Year_CFMP', 'FA_Can', 'UniqueID', 'Why_Remove', 'Overlap_Pa',
       'geometry'],
      dtype='object')

In [7]:
# Number of distinct values in each column of the CF data.
CF_gdf.nunique()

Unnamed: 0,0
Loading... (need help?),


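Commune, district, and province information is available. The only village-related column is `No_Village`, which holds numbers (see below), so it looks like a count of villages rather than village names.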

In [8]:
# Distinct values of No_Village: they are numbers, not village names.
CF_gdf.No_Village.unique()  # all numeric

array([ 1.,  2.,  3., 15.,  0.,  5.,  6.,  4.,  7.,  8., 16., 10.])

In [10]:
# How many CFs carry each No_Village value.
CF_gdf.No_Village.value_counts()

Unnamed: 0,No_Village
Loading... (need help?),


In [9]:
# Share of CFs whose No_Village is 0, computed from the data instead of from
# hand-copied counts (previously typed in as 360/639), so it stays correct if
# the shapefile changes. Rows with a missing No_Village count as "not 0".
(CF_gdf.No_Village == 0).mean()  # more than half 0's

0.5633802816901409

#### explore CDB socio-economic data

In [6]:
# Sheet names of the CDB workbook (the keys of the dict of DataFrames).
CDB_df_dic.keys()

dict_keys(['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)'])

#### match village info

##### Community forest names in CF data useful?

In [86]:
# # CF names in CF data: no leading/trailing spaces
# all(CF_gdf.CF_Name_En.apply(lambda s: s.strip()) == CF_gdf.CF_Name_En) 

##### village-level CDB data

In [7]:
# keep only the sheets whose name begins with 'V' (the village-level sheets)
vill_sht_name_lst = list(filter(lambda name: name.startswith('V'), sht_name_lst))

In [8]:
# village-level DataFrames, in the same order as vill_sht_name_lst
CDB_v_df_lst = [CDB_df_dic[sheet_name] for sheet_name in vill_sht_name_lst]

###### village names in CDB data: same across the 3 sheets

In [87]:
# One column per village sheet (labelled 0, 1, 2, ...), each holding that
# sheet's 5th column with surrounding whitespace stripped.
# Findings from exploring the raw sheets: all have the same number of rows,
# and the unstripped column is not clean (vill_s[13400] has a trailing space).
vill_name_df = pd.DataFrame(
    {
        sheet_pos: sheet_df.iloc[:, 4].str.strip()
        for sheet_pos, sheet_df in enumerate(CDB_v_df_lst)
    }
)

In [88]:
# Preview the first rows of the per-sheet village-name columns.
vill_name_df.head()

In [89]:
# every pair of village sheets must carry identical (stripped) village names
for left, right in [(0, 1), (0, 2), (2, 1)]:
    assert vill_name_df[left].equals(vill_name_df[right])

##### village names in CDB data

In [78]:
# Village names from the first village sheet (5th column), cleaned for matching.
# Whitespace is stripped first: the raw column contains at least one name with
# a trailing space, which would otherwise never equal a CF name. Missing cells
# are then dropped.
# NOTE(review): the sheet's own header label may still be present as a value in
# this series (the commune sheets show such a label row) - confirm and drop it.
vill_name_s = CDB_v_df_lst[0].iloc[:, 4].str.strip().dropna()

##### Do any words in CF names also appear in village names?

In [63]:
# words in CF names (lower-cased)
# - dropna(): a missing CF name would make ' '.join fail on a non-string value
# - split() with no argument splits on any run of whitespace, so repeated or
#   leading/trailing spaces do not produce empty-string "words"
CF_word_lst = ' '.join(CF_gdf.CF_Name_En.dropna()).lower().split()

In [79]:
# words in village names (lower-cased)
# split() with no argument splits on any run of whitespace, so names with
# repeated or trailing spaces do not contribute empty-string "words"
vill_word_lst = ' '.join(vill_name_s).lower().split()

In [80]:
# words that occur in both CF names and village names;
# there are too many of them to check manually
common_word_set = set(CF_word_lst) & set(vill_word_lst)
print(len(common_word_set))

388


##### Does any CF name exactly match a village name?

In [85]:
# lower-cased names that appear both as a CF name and as a village name
common_name_set = set(CF_gdf.CF_Name_En.str.lower()) & set(vill_name_s.str.lower())
common_name_set

{'andoung pring',
 'anh chanh',
 'anlong mean',
 'anlong svay',
 'anlong vil',
 'ansa kdam',
 'araen',
 'bos thom',
 'chambak',
 'chbar leu',
 'cheung phleung',
 'chey chumneah',
 'chhaeb lech',
 'chhouk',
 'chor',
 'chrab',
 'chrava',
 'chreaeng',
 'damnak ampil',
 'kak',
 'kampeaeng svay',
 'kampong krabei',
 'kampong krasang',
 'kbal khla',
 'kbal romeas',
 'khla krapeu',
 'kol totueng',
 'kouk khpos',
 'kouk srok',
 'kralanh',
 'krang doung',
 'krang serei',
 'kraom',
 'krasang',
 'kravan',
 'krong',
 'monorom',
 'narong',
 'ou ba krang',
 'ou krieng',
 'ou pou',
 'ou saom',
 'ou srav',
 'ou thkov',
 'peam lvea',
 'phlov krabei',
 'phum thmei',
 'ponsay cheung',
 'popel',
 'popel lech',
 'pralay',
 'prama',
 'prasat char',
 'preaek chheu trav',
 'preaek muoy',
 'preah lean',
 'prey andoung',
 'prey banteay',
 'prey chor',
 'prey chres',
 'prey preah',
 'prey roung',
 'prey snuol',
 'prey tralach',
 'prey veang',
 'prongil',
 'roluos',
 'roluos kandal',
 'roluos khang kaeut',
 'romc

In [84]:
# Number of distinct names shared by CFs and villages.
len(common_name_set)

103

In [90]:
# Share of CFs whose lower-cased name equals some village name.
# Computed per CF row rather than as len(common_name_set) / 639: the set holds
# unique names, so CFs that share a name would be undercounted, and the
# hand-typed 103 and 639 go stale whenever the inputs change.
CF_gdf.CF_Name_En.str.lower().isin(common_name_set).mean()

0.16118935837245696

#### matching commune info

##### commune-level CDB data

In [94]:
# keep only the sheets whose name begins with 'C' (the commune-level sheets)
comm_sht_name_lst = list(filter(lambda name: name.startswith('C'), sht_name_lst))
comm_sht_name_lst

['C_2016_E(1)', 'C_2016_E(2)']

In [97]:
# commune-level DataFrames, in the same order as comm_sht_name_lst
CDB_c_df_lst = [CDB_df_dic[sheet_name] for sheet_name in comm_sht_name_lst]

###### commune names in CDB data: same across the 2 commune sheets?

In [108]:
# One column per commune sheet (labelled 0, 1, ...), each holding that sheet's
# 4th column with surrounding whitespace stripped.
# Findings from exploring the raw sheets: both have the same number of rows,
# and the unstripped column differs from its stripped version.
name_df = pd.DataFrame(
    {
        sheet_pos: sheet_df.iloc[:, 3].str.strip()
        for sheet_pos, sheet_df in enumerate(CDB_c_df_lst)
    }
)

In [109]:
# Preview the first rows of the per-sheet commune-name columns.
name_df.head()

Unnamed: 0,0,1
0,,
1,,
2,,
3,Commune,Commune
4,Paoy Paet,Paoy Paet


In [110]:
# commune names (stripped) are the same across the two commune sheets
assert name_df[0].equals(name_df[1])

##### commune names in CDB commune data

In [111]:
# Commune names from the first commune sheet (4th column), cleaned for matching:
#  - strip surrounding whitespace (the raw column differs from its stripped
#    version, which is why the cross-sheet check strips before comparing)
#  - drop missing cells
#  - drop the 'Commune' header label, which is read in as an ordinary data row
#    (it sits at index 3 of this column, directly above the first real name)
comm_name_s = CDB_c_df_lst[0].iloc[:, 3].str.strip().dropna()
comm_name_s = comm_name_s[comm_name_s != 'Commune']