This notebook explores the CPA datasets: `Shapefile 181CPA` & `CPA DATA Eng and KH__191_up.xlsx.xlsx`. [data source](https://drive.google.com/drive/folders/1U-tEQPPZlceu0THC2tOrl6P9nexpt7eQ?usp=share_link)

The end goal is to connect them to the CDB data `CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx` at the village level.

In [4]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path

In [5]:
from itables import init_notebook_mode

init_notebook_mode(all_interactive=True)

<IPython.core.display.Javascript object>

In [6]:
code_path = Path().absolute()
datafd_path = code_path.parent.parent / 'data'

## read in data

### CDB socio-economic data, Reem updated 20230207


In [None]:
CDB_path = datafd_path / 'other' / 'CDB Nat Data 2016 En_Received Dec2017_230207_Reem.xlsx'
# CDB_df_dic = pd.read_excel(CDB_path, sheet_name=None)  # all sheets, key=sheet name
CDB_df_dic = pd.read_excel(CDB_path, sheet_name=['V_2016_E(1)'])
sht_name_lst = list(CDB_df_dic.keys())
# sheet names: ['Q_2016_E', 'V_2016_E(1)', 'V_2016_E(2)', 'V_2016_E(3)', 'C_2016_E(1)', 'C_2016_E(2)', 'D_2016_E(1)']

##### village-level CDB data

In [None]:
vill_sht_name_lst = [sht for sht in sht_name_lst if sht.startswith('V') ]
CDB_v_df_lst = [CDB_df_dic[sht] for sht in vill_sht_name_lst]

In [None]:
CDB_v_df1 = CDB_v_df_lst[0]

### CPA boundary data

In [7]:
CPA_shp_path = datafd_path / 'CPA' / 'Cambodia' / 'Shapefile 181CPA' / 'CPA_Shape_31_Aug_2022.shp'
CPA_gdf = gpd.read_file(CPA_shp_path)  # EPSG:32648

### CPA spreadsheet data

In [10]:
CPA_excel_path = datafd_path / 'CPA' / 'Cambodia' / 'CPA DATA Eng and KH__191_up.xlsx.xlsx'
CPA_df = pd.read_excel(CPA_excel_path, sheet_name='CPA Data Eng', skiprows=1, skipfooter=1)

## explore data

### CPA shp

In [16]:
CPA_gdf

CPAName_Eg,Houholed,No_Village,Commune,Khum,District,Srok,Province,Kheth,PA_Name_E,PA_Name_K,Legal_Stat,CPAName_K,Hectares,Perimeter,Date_MOE_K,Date_Agree,Date_CFMP,geometry
Loading... (need help?),,,,,,,,,,,,,,,,,,


In [11]:
CPA_gdf.dtypes  # Houholed & No_Village should not be object

Unnamed: 0,0
Loading... (need help?),


In [12]:
CPA_gdf.isnull().sum()  
# no missing CPA name, Commune, District, Province, PA name, Hectares, might be useful

Unnamed: 0,0
Loading... (need help?),


In [17]:
def find_duplicate_geom(gdf, id_colname, decimal=1, geom_colname='geometry'):
    '''
    Parameters
    ----------
    gdf : geodataframe
        Dataset.
    geom_colname : string
        Name of column containing geometries.
    decimal : int
        Decimal points to consider when deciding coincidence of coordinates.
    id_colname : string
        Name of column containing geometry identifiers.
    Returns
    -------
    List of identifiers of duplicated geometries.
    '''

    geom_ser = gdf.loc[:, geom_colname]
    uniqgeoms = []
    dupids = []
    for i, geom in enumerate(geom_ser):
        if any(g.almost_equals(geom, decimal=decimal) for g in uniqgeoms):
            dupids.append(gdf.iloc[i, ][id_colname])
        else:
            uniqgeoms.append(geom)
    return dupids

In [18]:
find_duplicate_geom(CPA_gdf, id_colname='CPAName_Eg')
# 'Sre Y Noun Thoun' & 'Lav Ka' have similar/same boundaries

  if any(g.almost_equals(geom, decimal=decimal) for g in uniqgeoms):


['Sre Y Noun Thoun']

### CPA excel

In [15]:
CPA_df

NO,CPA NAME,Village,Commune,District,Province,Protected Areas,Year Estalishment,Number  Family/ Househole,CPA Area,Total CPA Committee,Women,Date Prakas,Date Agreement sighed,Date Management Signed,Other,Inactive,Unnamed: 17
Loading... (need help?),,,,,,,,,,,,,,,,,


In [13]:
CPA_df.dtypes

Unnamed: 0,0
Loading... (need help?),


In [14]:
CPA_df.isnull().sum()
# no missing CPA name, Commune, District, Province, PA name, might be useful
# few missing CPA area, # family/hh, might also be useful

Unnamed: 0,0
Loading... (need help?),


## join CPA shp and excel based on CPA name

In [19]:
CPA_gdf.merge(CPA_df)  # TODO

Unnamed: 0,CPAName_Eg,Houholed,No_Village,Commune,Khum,District,Srok,Province,Kheth,PA_Name_E,PA_Name_K,Legal_Stat,CPAName_K,Hectares,Perimeter,Date_MOE_K,Date_Agree,Date_CFMP,geometry,NO,CPA NAME,Village,Protected Areas,Year Estalishment,Number  Family/ Househole,CPA Area,Total CPA Committee,Women,Date Prakas,Date Agreement sighed,Date Management Signed,Other,Inactive,Unnamed: 17
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# old code from before

## preprocess data

### CBNRM

In [None]:
# delete extra spaces in strings
CBNRM_df = CBNRM_df.apply(lambda s: s.astype(str).str.strip() if s.dtype == 'object' else s)

In [None]:
# convert to lower case for merging with other datasets 
for col in ['CBNRM name']:
    CBNRM_df[f'{col}_lower'] = CBNRM_df[col].str.lower()

### CF

In [None]:
# delete extra spaces in strings
CF_gdf = CF_gdf.apply(lambda s: s.astype(str).str.strip() if s.dtype == 'object' else s)

In [None]:
# convert to lower case for merging with other datasets 
for col in ['CF_Name_En']:
    CF_gdf[f'{col}_lower'] = CF_gdf[col].str.lower()

## explore preprocessed data

In [None]:
CBNRM_df.isnull().sum()

In [None]:
CBNRM_df['CBNRM name_lower'].value_counts()

In [None]:
CBNRM_df

In [None]:
CF_gdf.isnull().sum()

In [None]:
CF_gdf  # has CF_Name_En Kralapeas in district Thala Borivath, not district Borey O'svaysenchey

In [None]:
CF_gdf['CF_Name_En_lower'].value_counts()

In [None]:
CBNRM_df  # no "potential area" or "samky" or "phnom srang" or "phnom roy"
# those duplicated CF names in CF data shouldn't be an issue

## join CBNRM and CF based on names

In [None]:
CBNRM_CF_common_name_df = CBNRM_df.merge(
    CF_gdf,
    how='inner',
    left_on='CBNRM name_lower',
    right_on='CF_Name_En_lower',
    suffixes=(None, '_CF'),
)

## explore joined data

In [None]:
CBNRM_CF_common_name_df.shape  # matched

In [None]:
CBNRM_df.shape  # total

### village names