Goal: try to match / cross-reference the CF names (`CF_Name_En`) in CF boundary data `All_CF_Cambodia_July_2016_DISES_v1.shp` and the village names in `CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx`.

## setup

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
from itables import init_notebook_mode

init_notebook_mode(all_interactive=True)

<IPython.core.display.Javascript object>

In [3]:
code_path = Path().absolute()
datafd_path = code_path.parent.parent / 'data'
intmd_outfd_path = code_path.parent.parent / 'output_intmd' / 'match_CF_CDB'

## read in data

### CDB socio-economic data, Reem updated 20230207


In [5]:
CDB_path = datafd_path / 'other' / 'CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx'
# CDB_df_dic = pd.read_excel(CDB_path, sheet_name=None)  # all sheets, key=sheet name
CDB_df_dic = pd.read_excel(CDB_path, sheet_name=['V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)'])
sht_name_lst = list(CDB_df_dic.keys())
# sheet names: ['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)']

##### village-level CDB data

In [6]:
vill_sht_name_lst = [sht for sht in sht_name_lst if sht.startswith('V') ]
CDB_v_df_lst = [CDB_df_dic[sht] for sht in vill_sht_name_lst]

### CF data with match helper

In [48]:
CF_path = intmd_outfd_path / 'CF_w_match_helper.csv'
CF_df = pd.read_csv(CF_path)

### CF boundary data, updated in 20220526 email

In [8]:
CF_path = datafd_path / 'CF' / 'Cambodia' / \
    'All_CF_Cambodia_July_2016_DISES_v1' / \
    'All_CF_Cambodia_July_2016_DISES_v1.shp'
CF_gdf = gpd.read_file(CF_path)  # EPSG:3148

## explore data

### CF

#### confirm csv has all info in shp

In [61]:
CF_collst = CF_gdf.columns[:-1]  # non-geometry columns

In [62]:
CF_gdf_df = pd.DataFrame(CF_gdf.sort_values('UniqueID'))[CF_collst]
CF_df2 = CF_df.sort_values('UniqueID')[CF_collst]

In [63]:
CF_gdf_df.shape == CF_df2.shape

True

In [47]:
CF_gdf_df.compare(CF_df2)  # just some rounding issues?

Unnamed: 0_level_0,Date_Agree,Date_Agree,HECTARES,HECTARES
Unnamed: 0_level_1,self,other,self,other
Loading... (need help?),,,,


#### explore key columns for joining

In [50]:
CF_df.columns

Index(['CF_Code', 'Code_Srok', 'Code_Khum', 'No_Village', 'CF_Name_En',
       'CF_Name_Kh', 'Villag_CDB', 'CF_Name_Note', 'Commune', 'Khum',
       'Commun_CDB', 'Commu_Note', 'District', 'Srok', 'Distri_CDB',
       'Distr_Note', 'Province', 'Division', 'Date_Praka', 'No_Prakas',
       'Date_Agree', 'Remarks', 'Code_CF', 'HECTARES', 'Yea_Agreem',
       'Agreement', 'NCF_statis', 'Year_CFMP', 'FA_Can', 'UniqueID',
       'Why_Remove', 'Overlap_Pa', 'geometry', 'Comm_Miss', 'Dist_Miss',
       'Comm_Match_CDB', 'Dist_Match_CDB', 'Comm_Uniq_CF', 'Comm_Uniq_CDB',
       'Dist_Uniq_CF', 'CF_Match', 'CommORDist', 'CoORDiOR2'],
      dtype='object')

In [52]:
CF_df.isnull().sum()  # no missing Commu_Note / Distr_Note 

Unnamed: 0,0
Loading... (need help?),


In [53]:
CF_df.Commu_Note.value_counts()

Unnamed: 0,Commu_Note
Loading... (need help?),


In [54]:
CF_df.Distr_Note.value_counts()

Unnamed: 0,Distr_Note
Loading... (need help?),


In [59]:
CF_df[['Commu_Note', 'Distr_Note']].value_counts().sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Commu_Note,Distr_Note,Unnamed: 2_level_1
Loading... (need help?),,


## preprocess data

### CF data with match helper

In [91]:
def append_CDB_comm_dist(helper_s):
    comm_CF, dist_CF, comm_note, dist_note, comm_CDB, dist_CDB = helper_s[
        ['Commune', 'District', 'Commu_Note', 'Distr_Note', 'Commun_CDB', 'Distri_CDB']
    ]
    if comm_note == 'matches with CDB':
        comm = comm_CF
        if dist_note == 'NA because commune name uniquely identifies the commune in both this dataset and CDB':
            dist = np.nan
        elif dist_note == 'matches with CDB':
            dist = dist_CF
        elif dist_note.startswith('updated to match with CDB'):
            dist = dist_CDB
        else:
            print(f'Distr_Note not handled: {dist_note}')
    elif comm_note.startswith('updated to match with CDB'):
        comm = comm_CDB
        if dist_note == 'matches with CDB':
            dist = dist_CF
        elif dist_note.startswith('updated to match with CDB'):
            dist = dist_CDB
        else:
            print(f'Distr_Note not handled: {dist_note}')
    elif comm_note == 'no match in CDB':
        comm_note = np.nan
        if dist_note == 'matches with CDB':
            dist = dist_CF
        elif dist_note.startswith('updated to match with CDB'):
            dist = dist_CDB
        elif dist_note in ['missing', 'no match in CDB']:
            dist = np.nan
        else:
            print(f'Distr_Note not handled: {dist_note}')
    elif comm_note == 'missing':
    else:
        print(f'Commun_Note not handled: {comm_note}')
        
    helper_s['Commune_matched'] = comm
    helper_s['District_matched'] = dist
    return helper_s

In [92]:
CF_df.apply(
    append_CDB_comm_dist,
    axis=1
)

CF_Code,Code_Srok,Code_Khum,No_Village,CF_Name_En,CF_Name_Kh,Villag_CDB,CF_Name_Note,Commune,Khum,Commun_CDB,Commu_Note,District,Srok,Distri_CDB,Distr_Note,Province,Division,Date_Praka,No_Prakas,Date_Agree,Remarks,Code_CF,HECTARES,Yea_Agreem,Agreement,NCF_statis,Year_CFMP,FA_Can,UniqueID,Why_Remove,Overlap_Pa,geometry,Comm_Miss,Dist_Miss,Comm_Match_CDB,Dist_Match_CDB,Comm_Uniq_CF,Comm_Uniq_CDB,Dist_Uniq_CF,CF_Match,CommORDist,CoORDiOR2,Commune_matched
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [93]:
CF_df.columns

Index(['CF_Code', 'Code_Srok', 'Code_Khum', 'No_Village', 'CF_Name_En',
       'CF_Name_Kh', 'Villag_CDB', 'CF_Name_Note', 'Commune', 'Khum',
       'Commun_CDB', 'Commu_Note', 'District', 'Srok', 'Distri_CDB',
       'Distr_Note', 'Province', 'Division', 'Date_Praka', 'No_Prakas',
       'Date_Agree', 'Remarks', 'Code_CF', 'HECTARES', 'Yea_Agreem',
       'Agreement', 'NCF_statis', 'Year_CFMP', 'FA_Can', 'UniqueID',
       'Why_Remove', 'Overlap_Pa', 'geometry', 'Comm_Miss', 'Dist_Miss',
       'Comm_Match_CDB', 'Dist_Match_CDB', 'Comm_Uniq_CF', 'Comm_Uniq_CDB',
       'Dist_Uniq_CF', 'CF_Match', 'CommORDist', 'CoORDiOR2'],
      dtype='object')

### CDB

##### fix headers of village-level CDB data

In [9]:
CDB_v_df_lst = [df.rename(columns=df.iloc[3]).tail(-4) for df in CDB_v_df_lst]

##### join 3 village sheets

In [10]:
CDB_v_df = pd.concat(CDB_v_df_lst, axis=1)

##### select columns in CDB

In [11]:
CDB_v_df = CDB_v_df.iloc[:, :5]

##### strip leading/trailing spaces and convert to lower case

In [12]:
for col in ['Province', 'District', 'Commune', 'Village']:
    CDB_v_df[f'{col}_cln'] = CDB_v_df[col].str.strip().str.lower()

## enrich the CF data with the CDB villages in the commune each CF belongs to

### join

In [None]:
CDB_v_df

### aggregate

## export data