This notebook tries to match / cross-reference the CF villages in the CF boundary data `All_CF_Cambodia_July_2016_DISES_Attribute_Update_Dec-2022-599cf_v2-4_Province_90_village_subset_3-6-23_v2.shp` to the village names in the CDB data `CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx`.

This notebook enriches the CF data with information helpful for matching.

## setup

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
from itables import init_notebook_mode

init_notebook_mode(all_interactive=True)

<IPython.core.display.Javascript object>

In [3]:
# Project folders, resolved relative to the directory the notebook is run from.
code_path = Path.cwd()
datafd_path = code_path.parent.parent.joinpath('data')  # input data
intmd_outfd_path = code_path.parent.parent.joinpath('output_intmd', 'match_CF_CDB')  # intermediate outputs

## read in data

### CDB socio-economic data, Reem updated 20230207


In [None]:
# Only the three village-level sheets are loaded. The workbook's full sheet list is
# ['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)'];
# pd.read_excel(CDB_path, sheet_name=None) would load all of them (key = sheet name).
CDB_path = datafd_path.joinpath('other', 'CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx')
CDB_df_dic = pd.read_excel(
    CDB_path,
    sheet_name=[
        'V_2016_E(1)',
        'V_2016_E(2)',
        'V_2016_E(3)',
    ],
)  # dict: sheet name -> DataFrame
sht_name_lst = [*CDB_df_dic]

##### village-level CDB data

In [None]:
# Village-level sheets are the ones whose name starts with 'V'.
vill_sht_name_lst = list(filter(lambda name: name.startswith('V'), sht_name_lst))
# One DataFrame per village sheet, in the same order as the sheet names.
CDB_v_df_lst = [CDB_df_dic[name] for name in vill_sht_name_lst]

### CF data with match helper

In [None]:
# CF attribute table with the match-helper columns, read from the intermediate output folder.
CF_path = intmd_outfd_path.joinpath('CF_w_match_helper.csv')
CF_df = pd.read_csv(filepath_or_buffer=CF_path)

### CF boundary data, 90 CFs with CBNRM PaFF3 connected, Alex emailed 20230306

In [None]:
# Shapefile location: <data>/CF/Cambodia/<dataset folder>/<dataset>.shp
CF_CBNRM_matched_path = datafd_path.joinpath(
    'CF',
    'Cambodia',
    'All_CF_Cambodia_July_2016_DISES_Attribute_Update_Dec-2022-599cf_v2-4_Province_90_village_subset_3-6-23_v2',
    'All_CF_Cambodia_July_2016_DISES_Attribute_Update_Dec-2022-599cf_v2-4_Province_90_village_subset_3-6-23_v2.shp',
)

In [None]:
# Load the CF boundary shapefile into a GeoDataFrame.
# NOTE(review): `CF_excel_gdf` is not referenced by any later cell visible in this
# notebook; the comparison cells below use `CF_gdf` instead — confirm which is intended.
CF_excel_gdf = gpd.read_file(CF_CBNRM_matched_path)  # EPSG:3148

## explore data

### CF

#### confirm csv has all info in shp

In [61]:
# NOTE(review): `CF_gdf` is not defined by any cell visible in this notebook (the
# shapefile above is loaded as `CF_excel_gdf`), so this cell fails on a fresh
# kernel unless `CF_gdf` is loaded elsewhere — confirm its source.
# Drops the last column; assumes geometry is the last column of `CF_gdf` — TODO confirm.
CF_collst = CF_gdf.columns[:-1]  # non-geometry columns

In [62]:
# Put the shapefile attributes and the csv in the same row order (by UniqueID)
# and restrict both to the same columns so they can be compared.
# NOTE(review): relies on `CF_gdf`, which no visible cell defines — confirm its source.
CF_gdf_df = pd.DataFrame(CF_gdf.sort_values('UniqueID'))[CF_collst]
CF_df2 = CF_df.sort_values('UniqueID')[CF_collst]

In [63]:
CF_gdf_df.shape == CF_df2.shape

True

In [47]:
CF_gdf_df.compare(CF_df2)  # just some rounding issues?

Unnamed: 0_level_0,Date_Agree,Date_Agree,HECTARES,HECTARES
Unnamed: 0_level_1,self,other,self,other
Loading... (need help?),,,,


#### explore key columns for joining

In [50]:
CF_df.columns

Index(['CF_Code', 'Code_Srok', 'Code_Khum', 'No_Village', 'CF_Name_En',
       'CF_Name_Kh', 'Villag_CDB', 'CF_Name_Note', 'Commune', 'Khum',
       'Commun_CDB', 'Commu_Note', 'District', 'Srok', 'Distri_CDB',
       'Distr_Note', 'Province', 'Division', 'Date_Praka', 'No_Prakas',
       'Date_Agree', 'Remarks', 'Code_CF', 'HECTARES', 'Yea_Agreem',
       'Agreement', 'NCF_statis', 'Year_CFMP', 'FA_Can', 'UniqueID',
       'Why_Remove', 'Overlap_Pa', 'geometry', 'Comm_Miss', 'Dist_Miss',
       'Comm_Match_CDB', 'Dist_Match_CDB', 'Comm_Uniq_CF', 'Comm_Uniq_CDB',
       'Dist_Uniq_CF', 'CF_Match', 'CommORDist', 'CoORDiOR2'],
      dtype='object')

In [52]:
CF_df.isnull().sum()  # no missing Commu_Note / Distr_Note 

Unnamed: 0,0
Loading... (need help?),


In [53]:
CF_df.Commu_Note.value_counts()

Unnamed: 0,Commu_Note
Loading... (need help?),


In [54]:
CF_df.Distr_Note.value_counts()

Unnamed: 0,Distr_Note
Loading... (need help?),


In [163]:
CF_df[['Commu_Note', 'Distr_Note']].value_counts()#.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Commu_Note,Distr_Note,Unnamed: 2_level_1
Loading... (need help?),,


## preprocess data

### CF data with match helper

#### consolidate commune and district name columns for joining, using match helper info

In [148]:
def append_CDB_comm_dist(helper_s):
    '''Appends the commune and district names to use when joining with CDB.

    Reads the CF and CDB spellings (`Commune` / `Commun_CDB`, `District` /
    `Distri_CDB`) together with the match-helper notes (`Commu_Note`,
    `Distr_Note`) and picks one name per administrative level:

    * note 'matches with CDB'                    -> the CF name
    * note starting 'updated to match with CDB'  -> the CDB name
    * commune note 'no match in CDB' / 'missing' -> NaN commune; a district
      note of 'missing' / 'no match in CDB' is then accepted too -> NaN
    * district note 'NA because commune name uniquely identifies ...'
      (only together with a commune that matches CDB) -> the string 'NA'

    Any other note (including a non-string value such as NaN) is reported
    with `print` and the affected name falls back to NaN, so one unexpected
    note no longer aborts the whole `DataFrame.apply` with an
    UnboundLocalError / AttributeError. When the commune note is not handled
    the district note is not evaluated and both names are NaN.

    Parameters
    ----------
    helper_s : pandas.Series
        One CF row holding the six columns named above.

    Returns
    -------
    pandas.Series
        A copy of `helper_s` with `Commune_matched` and `District_matched`
        appended; the input row itself is left unmodified.
    '''
    comm_CF, dist_CF, comm_note, dist_note, comm_CDB, dist_CDB = helper_s[
        ['Commune', 'District', 'Commu_Note', 'Distr_Note', 'Commun_CDB', 'Distri_CDB']
    ]

    matches = 'matches with CDB'
    updated = 'updated to match with CDB'  # prefix: the note may carry extra text after it
    unmatched = ('missing', 'no match in CDB')
    comm_is_unique = 'NA because commune name uniquely identifies the commune in both this dataset and CDB'

    # Fall-backs, so an unhandled note yields NaN instead of an unbound local.
    comm = np.nan
    dist = np.nan

    # --- commune ---
    comm_known = isinstance(comm_note, str)
    if comm_known and comm_note == matches:
        comm = comm_CF
    elif comm_known and comm_note.startswith(updated):
        comm = comm_CDB
    elif comm_known and comm_note in unmatched:
        pass  # no usable commune name: stays NaN
    else:
        comm_known = False
        print(f'Commun_Note not handled: {comm_note}')

    # --- district (which notes are acceptable depends on the commune note) ---
    if comm_known:
        dist_is_str = isinstance(dist_note, str)
        if dist_is_str and dist_note == matches:
            dist = dist_CF
        elif dist_is_str and dist_note.startswith(updated):
            dist = dist_CDB
        elif dist_is_str and dist_note == comm_is_unique and comm_note == matches:
            # the commune alone identifies the record; 'NA' is a sentinel, not a name
            dist = 'NA'
        elif dist_is_str and dist_note in unmatched and comm_note in unmatched:
            pass  # no usable district name: stays NaN
        else:
            print(f'Distr_Note not handled: {dist_note}')

    # Work on a copy so the row handed in by the caller is not mutated.
    out_s = helper_s.copy()
    out_s['Commune_matched'] = comm
    out_s['District_matched'] = dist
    return out_s

In [149]:
# Row-wise: each CF record gets its consolidated commune / district name appended.
CF_matched_df = CF_df.apply(append_CDB_comm_dist, axis='columns')

#### explore consolidated data

In [150]:
(CF_matched_df['Commune_matched'] == CF_matched_df['Commune']).sum()

228

In [160]:
CF_matched_df[
    (CF_matched_df['Commune_matched'] == CF_matched_df['Commune']) & \
    (CF_matched_df['Commu_Note'] != 'matches with CDB')
]
# Commune matches with CDB, but noted as updated to match with CDB
# ok since "updated" name is in Commun_CDB

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [162]:
(CF_matched_df['Commune_matched'] == CF_matched_df['Commun_CDB']).sum()

374

In [164]:
CF_matched_df[
    (CF_matched_df['Commune_matched'] == CF_matched_df['Commun_CDB']) & \
    (CF_matched_df['Commu_Note'] != 'updated to match with CDB')
]
# updated to match with CDB, INFERRED

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [168]:
(CF_matched_df['District_matched'] == CF_matched_df['District']).sum()

282

In [170]:
(CF_matched_df['District_matched'] == CF_matched_df['Distri_CDB']).sum()

279

#### strip leading/trailing spaces and convert to lower case

In [171]:
# Build the join keys `District_cln` / `Commune_cln`: cast to str, trim, lower-case.
# Because of the str cast, a NaN name becomes the text 'nan' and the 'NA' sentinel becomes 'na'.
for col in ('District', 'Commune'):
    matched_names = CF_matched_df[f'{col}_matched'].astype(str)
    CF_matched_df[f'{col}_cln'] = matched_names.str.strip().str.lower()

#### explore cleaned data

In [184]:
CF_matched_df.loc[:, CF_matched_df.columns.str.startswith('Commune')]

Commune,Commune_matched,Commune_cln
Loading... (need help?),,


In [183]:
CF_matched_df.loc[:, CF_matched_df.columns.str.startswith('District')]

District,District_matched,District_cln
Loading... (need help?),,


In [211]:
CF_matched_df[CF_matched_df.District_cln == 'na']  # Distr_Note == 'NA because commune name uniquely identifies the commune in both this dataset and CDB'

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched,District_cln,Commune_cln
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### CDB

#### fix headers of village-level CDB data

In [193]:
# Row 3 of each raw sheet carries the real column names: use it as the header,
# then discard the first four rows. Rebinds `CDB_v_df_lst`, so run this cell only once.
CDB_v_df_lst = [
    sheet_df.rename(columns=sheet_df.iloc[3]).iloc[4:]
    for sheet_df in CDB_v_df_lst
]

#### join 3 village sheets

In [194]:
# Place the village sheets side by side (columns are appended, rows aligned on the index).
CDB_v_df = pd.concat(CDB_v_df_lst, axis='columns')

#### select columns in CDB

In [195]:
# Keep only the first five columns (presumably VillGis, Province, District,
# Commune, Village — verify against the sheet header).
# `.copy()` makes this an independent frame, so adding the `*_cln` columns
# further down does not trigger pandas' SettingWithCopyWarning on a slice.
CDB_v_df = CDB_v_df.iloc[:, :5].copy()

#### explore commune and district name columns for joining

In [197]:
CDB_v_df.isnull().sum()  # no missing

Unnamed: 0,0
Loading... (need help?),


#### strip leading/trailing spaces and convert to lower case

In [199]:
# Build the `<name>_cln` join keys the same way as for the CF data: cast to str, trim, lower-case.
for col in ('Province', 'District', 'Commune', 'Village'):
    raw_names = CDB_v_df[col].astype(str)
    CDB_v_df[f'{col}_cln'] = raw_names.str.strip().str.lower()

#### explore commune and district name columns for joining

In [176]:
CDB_v_df.isnull().sum()  # no missing

Unnamed: 0,0
Loading... (need help?),


## enrich the CF data with the CDB villages in the commune each CF belongs to

### join

In [212]:
# Split the CFs by whether the commune name alone identifies a record in both
# datasets. Those rows carry the sentinel 'na' in District_cln, i.e.
# Distr_Note == 'NA because commune name uniquely identifies the commune in both this dataset and CDB'.
CF_dup_comm_df = CF_matched_df.loc[CF_matched_df['District_cln'].ne('na')]
CF_uniq_comm_df = CF_matched_df.loc[CF_matched_df['District_cln'].eq('na')]

In [216]:
# Left-join the CDB villages onto the CFs; `_merge` flags CFs that found a
# CDB commune ('both') versus none ('left_only'). Overlapping CDB columns get
# the suffix '_CDB'.

# Commune name not unique: commune + district form the key.
CF_villInComm_df1 = pd.merge(
    CF_dup_comm_df,
    CDB_v_df,
    on=['District_cln', 'Commune_cln'],
    how='left',
    suffixes=(None, '_CDB'),
    indicator=True,
)
# Commune name unique: the commune alone is the key.
CF_villInComm_df2 = pd.merge(
    CF_uniq_comm_df,
    CDB_v_df,
    on=['Commune_cln'],
    how='left',
    suffixes=(None, '_CDB'),
    indicator=True,
)

In [224]:
# District_cln was not a join key for df2, so the CDB copy came through as
# 'District_cln_CDB'; remove it so df2 has the same columns as df1.
CF_villInComm_df2 = CF_villInComm_df2.drop(columns='District_cln_CDB')

In [229]:
# Stack the two join results row-wise (the original row labels are kept, not renumbered).
CF_villInComm_df = pd.concat([CF_villInComm_df1, CF_villInComm_df2], axis='index')

#### explore joined data

In [218]:
CF_villInComm_df1._merge.value_counts()  # 47 could not join

Unnamed: 0,_merge
Loading... (need help?),


In [220]:
CF_villInComm_df2._merge.value_counts()  # all joined!

Unnamed: 0,_merge
Loading... (need help?),


In [221]:
CF_villInComm_df1[CF_villInComm_df1._merge == 'left_only'].Province.value_counts()
# 7/47 in focal provinces

Unnamed: 0,Province
Loading... (need help?),


In [230]:
CF_villInComm_df.shape

(5070, 55)

In [231]:
CF_villInComm_df

Unnamed: 0,CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched,District_matched,District_cln,Commune_cln,VillGis,Province_CDB,District_CDB,Commune_CDB,Village,Province_cln,Village_cln,_merge
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### aggregate

In [236]:
CF_villInComm_df.groupby(['UniqueID'])['Commune_CDB'].nunique().value_counts()
# 592 CFs have 1 unique commune associated with them

Unnamed: 0,Commune_CDB
Loading... (need help?),


In [246]:
# Collapse the village-level rows to one row per CF (UniqueID): all village
# names of the commune are concatenated with '\n' as delimiter, and likewise
# the village IDs. Values are cast to str first, so a CF without any joined
# village ends up with the text 'nan'.
CFID_villInCommLst_df = CF_villInComm_df.groupby(['UniqueID']).agg(
    dict.fromkeys(
        ['Village', 'VillGis'],
        lambda values: '\n'.join(values.astype(str)),
    )
)

### enrich

In [250]:
# Attach the concatenated village names and IDs to the CF data.
# The validation makes the merge raise if UniqueID is duplicated on either side.
CF_villInCommLst_df = pd.merge(
    CF_matched_df,
    CFID_villInCommLst_df,
    on='UniqueID',
    how='inner',
    validate='one_to_one',
)

## export data

In [252]:
# CF_villInCommLst_df.to_csv(intmd_outfd_path / 'CF_w_match_helper_vill.csv', index=False)

In [253]:
CF_villInCommLst_df.columns

Index(['CF_Code', 'Code_Srok', 'Code_Khum', 'No_Village', 'CF_Name_En',
       'CF_Name_Kh', 'Villag_CDB', 'CF_Name_Note', 'Commune', 'Khum',
       'Commun_CDB', 'Commu_Note', 'District', 'Srok', 'Distri_CDB',
       'Distr_Note', 'Province', 'Division', 'Date_Praka', 'No_Prakas',
       'Date_Agree', 'Remarks', 'Code_CF', 'HECTARES', 'Yea_Agreem',
       'Agreement', 'NCF_statis', 'Year_CFMP', 'FA_Can', 'UniqueID',
       'Why_Remove', 'Overlap_Pa', 'geometry', 'Comm_Miss', 'Dist_Miss',
       'Comm_Match_CDB', 'Dist_Match_CDB', 'Comm_Uniq_CF', 'Comm_Uniq_CDB',
       'Dist_Uniq_CF', 'CF_Match', 'CommORDist', 'CoORDiOR2',
       'Commune_matched', 'District_matched', 'District_cln', 'Commune_cln',
       'Village', 'VillGis'],
      dtype='object')