# Find control villages 

In [4]:
import os 
import geopandas as gpd
import pandas as pd

In [6]:
# Matching configuration
# 2005 is the base year for the matching process
year = 2005
# Target CRS that the forest and CPA layers are reprojected to
crs = 'EPSG:3148'

In [None]:
# Set root path (data folder, relative to the working directory).
# The components are passed separately: the previous single literal 'DISES\Proximity\data'
# embedded invalid backslash escapes and was a single opaque name on non-Windows systems.
root_path = os.path.join('DISES', 'Proximity', 'data') #https://drive.google.com/open?id=1QHF6wbAcBsGpZYumx63Ub7m48eO3Ue4k&usp=drive_fs

In [16]:
# Folder that receives the exported shapefiles.
# NOTE(review): the leading backslash makes this drive-rooted on Windows and a literal
# '\outputs' name elsewhere; confirm whether a folder under root_path was intended.
outputs_path = r'\outputs'

In [18]:
# Forested areas (FA) layer, read from the data folder and reprojected to the working CRS
fa_shape_path = os.path.join(root_path, 'fa_70pc_129has_4conn.shp')
fas = gpd.read_file(fa_shape_path).to_crs(crs)

In [20]:
# Community Protected Areas (CPA) layer
cpa_path = os.path.join(root_path, 'Community_Protected_Areas_Cambodia/CPA_Shape_31_Aug_2022.shp')
cpa = gpd.read_file(cpa_path)

# Give every CPA a sequential id (1 .. number of rows), then reproject to the working CRS
cpa = cpa.assign(CPA_id=range(1, len(cpa) + 1)).to_crs(crs)

In [22]:
# Community forests (CF) layer: reproject, expose ObjectID as CF_CODE,
# order the rows by that code and discard the previous index
cfs_path = os.path.join(root_path, 'Camboda CF updated data - Feb-2025/CF_KH_Updated_2.5.2025_EPSG32648.shp')
cfs = (
    gpd.read_file(cfs_path)
    .to_crs(crs)
    .rename(columns={'ObjectID': 'CF_CODE'})
    .sort_values('CF_CODE')
    .reset_index(drop=True)
)

# Store the code as text, left-padded with zeros to at least 3 characters, to identify each CF
cfs['CF_CODE'] = cfs['CF_CODE'].astype(str).str.zfill(3)

### Define forests

In [25]:
from shapely.validation import make_valid

# Repair invalid geometries in both forest layers to avoid topology errors later on
for layer in (cfs, fas):
    layer['geometry'] = layer['geometry'].apply(make_valid)

In [26]:
# CF code and geometry only, with the code column named FID (the id column name used on the FA layer)
gdf1 = cfs[['CF_CODE', 'geometry']].rename(columns={'CF_CODE': 'FID'})

In [27]:
# Independent (deep) copy of the FA layer; deep copying is the default of .copy()
gdf2 = fas.copy()

In [28]:
# All forests (CFs followed by FAs) in one frame with a fresh 0..n-1 index
forests = pd.concat([gdf1, gdf2], ignore_index=True)

# Find control Villages

### Import Treatment villages

(These are the treatment villages already created in step 1, "Find the nearest villages".)

In [35]:
# Load the treatment villages
treatment_path = os.path.join(root_path, 'treatment_villages.shp')

# Treatment = 1 marks these rows so they stay identifiable in later merge/concat steps
treatment_villages_gdf = gpd.read_file(treatment_path).assign(Treatment=1)

In [38]:
# Preview the first five treatment villages
treatment_villages_gdf.head(5)

Unnamed: 0,CF_CODE,PHUMCODE,VILLAGE,Criterion,geometry,Treatment
0,1,1040507.0,paoy ta paen,Matching,POINT (300892.976 1504313.876),1
1,2,1040210.0,phnum chonhcheang,Matching,POINT (301955.968 1507006.854),1
2,5,1080106.0,prasat vien,Matching,POINT (307392.925 1533013.65),1
3,6,1030209.0,koun klaeng,Matching,POINT (316052.857 1538343.608),1
4,11,1040206.0,chroab thmei,Matching,POINT (304592.948 1512313.813),1


In [40]:
# cf_id: CF_CODE rendered as text with any leading zeros stripped
cf_code_text = treatment_villages_gdf["CF_CODE"].astype(str)
treatment_villages_gdf["cf_id"] = cf_code_text.str.lstrip("0")
treatment_villages_gdf.head()

Unnamed: 0,CF_CODE,PHUMCODE,VILLAGE,Criterion,geometry,Treatment,cf_id
0,1,1040507.0,paoy ta paen,Matching,POINT (300892.976 1504313.876),1,1
1,2,1040210.0,phnum chonhcheang,Matching,POINT (301955.968 1507006.854),1,2
2,5,1080106.0,prasat vien,Matching,POINT (307392.925 1533013.65),1,5
3,6,1030209.0,koun klaeng,Matching,POINT (316052.857 1538343.608),1,6
4,11,1040206.0,chroab thmei,Matching,POINT (304592.948 1512313.813),1,11


### Original Approach
Find the villages closest to either a CF or an FA, and consider them control. 

In [46]:
# 1. Find control villages: one find_nearest lookup in vlgs2 per forest geometry (CF or FA)
control_villages = [
    find_nearest(vlgs2, forest_geom) for forest_geom in forests.geometry
]

In [49]:
# Collect the per-forest results into a GeoDataFrame in the working CRS
control_villages_gdf = gpd.GeoDataFrame(data=control_villages, crs=crs)

In [None]:
# Export the control villages of the original approach.
# NOTE(review): absolute, user-specific folder; consider deriving it from root_path.
dises_folder_path = '/Users/Daniel/Library/CloudStorage/GoogleDrive-dwiesner@sig-gis.com/My Drive/DISES/' #https://drive.google.com/open?id=1-FgWgnPlmFZhazQBmGNSZxt3Iz948OUr&usp=drive_fs
# The folder variable was previously referenced as `dises_folde_path`, which raised NameError
control_path = os.path.join(dises_folder_path, 'batched-predictions-branch/data/KHM/treatment_contor_proximity/control_vlgs.shp') #https://drive.google.com/open?id=1zPglJZi1yu5N3HsYTpTptz9-U0nnUqlE&usp=drive_fs
control_villages_gdf.to_file(control_path)

In [75]:
# Stack treatment (Treatment = 1, set at load time) and control (Treatment = 0) villages
control_villages_gdf['Treatment'] = 0

vlgs3_columns = ['NUMBER', 'PHUMCODE', 'VILLAGE', 'geometry', 'Treatment']
vlgs3 = pd.concat([treatment_villages_gdf, control_villages_gdf], ignore_index=True)[vlgs3_columns]

#### Calculate distances from treatment and control villages

In [77]:
# Village-to-CF distances, stored under 'dist_cf_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)
vlgs3 = calculate_distances(vlgs3, cfs, 'dist_cf_m')

In [79]:
# Village-to-FA distances, stored under 'dist_fa_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)
vlgs3 = calculate_distances(vlgs3, fas, 'dist_fa_m')

In [80]:
# Village-to-forest distances (CFs and FAs combined), stored under 'dist_forest_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)
vlgs3 = calculate_distances(vlgs3, forests, 'dist_forest_m')

In [89]:
# Summary statistics per group (0 = control, 1 = treatment), identifier columns excluded
vlgs3.drop(columns=['NUMBER', 'PHUMCODE']).groupby('Treatment').describe().round(2).T

Unnamed: 0,Treatment,0,1
dist_cf_m,count,982.0,598.0
dist_cf_m,mean,9852.79,1601.89
dist_cf_m,std,17790.72,1364.02
dist_cf_m,min,0.0,0.0
dist_cf_m,25%,1838.67,667.67
dist_cf_m,50%,3015.86,1268.91
dist_cf_m,75%,9182.92,2169.25
dist_cf_m,max,102656.87,9613.98
dist_fa_m,count,982.0,598.0
dist_fa_m,mean,16360.28,23963.89


## New Approaches

### Approach A

Pick as control the villages that lie within 10 km of a CF but not within 5 km of any CF.

In [40]:
%%time
# Step 1: Copies of the CF layer whose geometry is replaced by a 5 km / 10 km buffer
# (buffer distances are expressed in CRS units)
cfs_buffer_5km = cfs.copy()
cfs_buffer_5km['geometry'] = cfs.geometry.buffer(5000)

cfs_buffer_10km = cfs.copy()
cfs_buffer_10km['geometry'] = cfs.geometry.buffer(10000)

CPU times: user 382 ms, sys: 33 ms, total: 415 ms
Wall time: 439 ms


In [41]:
# Step 2: Village/CF pairs where the village lies within the CF's 10 km buffer
villages_within_10km = vlgs2.sjoin(cfs_buffer_10km, predicate='within')

In [42]:
# Step 3: Village/CF pairs where the village lies within the CF's 5 km buffer
villages_within_5km = vlgs2.sjoin(cfs_buffer_5km, predicate='within')

In [43]:
# Step 4: From the 10 km matches, drop every village that also appears in the 5 km matches
inside_5km = villages_within_10km['PHUMCODE'].isin(villages_within_5km['PHUMCODE'])
control_villages_appch2 = villages_within_10km.loc[~inside_5km].reset_index(drop=True)

In [44]:
# Step 5: Distance from each village to every community forest whose buffer contains it,
# so that the closest CF can be selected afterwards

# Attach the CF geometry to each village/CF pair (both geometry columns get a suffix)
cf_geometries = cfs[['cf_id', 'geometry']]
control_villages_appch2 = control_villages_appch2.merge(cf_geometries, on="cf_id", suffixes=("_village", "_forest"))

# Rebuild as a GeoDataFrame with the village geometry as the active geometry
control_villages_appch2 = gpd.GeoDataFrame(control_villages_appch2, geometry="geometry_village")

# Distance between the village geometry and the forest geometry of the same row
forest_geoms = gpd.GeoSeries(control_villages_appch2["geometry_forest"], crs=control_villages_appch2.crs)
control_villages_appch2["distance_to_forest"] = control_villages_appch2["geometry_village"].distance(forest_geoms)

In [45]:
# Step 6: For each village keep only the pairing with the closest CF.
# Rows must be ordered by distance before de-duplicating on PHUMCODE: ordering by cf_id first
# (as this cell did before) kept the pairing with the lowest cf_id instead of the nearest CF.
control_villages_appch2 = (
    control_villages_appch2
    .sort_values(by=["distance_to_forest", "cf_id"])   # cf_id only breaks exact distance ties
    .drop_duplicates(subset='PHUMCODE', keep='first')
    .sort_values(by=["cf_id", "distance_to_forest"])   # same row ordering as the previous version
    .reset_index(drop=True)
)

In [46]:
# Step 7: Arrange dataset

# Add the treatment village id; the left join keeps every control village
control_villages_appch2 = control_villages_appch2.merge(cf_id_dict, how='left', on='cf_id')

# Keep only the necessary variables and standardize their names
appch2_keep = ['PHUMCODE', 'VILLAGE', 'cf_id', 'distance_to_forest', 'treatm_id', 'geometry_village']
control_villages_appch2 = control_villages_appch2[appch2_keep].rename(
    columns={'geometry_village': 'geometry', 'distance_to_forest': 'dist_forst'}
)

# Rebuild as a GeoDataFrame on the renamed geometry column, using the CRS of vlgs
control_villages_appch2 = gpd.GeoDataFrame(control_villages_appch2, geometry="geometry", crs=vlgs.crs)

#### Calculate distances from treatment and control villages

In [47]:
# Stack treatment (Treatment = 1) and Approach A control (Treatment = 0) villages
control_villages_appch2['Treatment'] = 0

appch2_columns = ['PHUMCODE', 'VILLAGE','Treatment','treatm_id', 'cf_id','geometry']
vlgs_appch2 = pd.concat([treatment_villages_gdf, control_villages_appch2], ignore_index=True)[appch2_columns]

In [48]:
%%time
# Distance columns, added one layer at a time in this order
distance_targets = (
    (cfs, 'dist_cf_m'),          # village to community forest
    (fas, 'dist_fa_m'),          # village to FAS
    (forests, 'dist_forest_m'),  # village to any type of forest
)
for target_layer, distance_column in distance_targets:
    vlgs_appch2 = calculate_distances(vlgs_appch2, target_layer, distance_column)

CPU times: user 1min 13s, sys: 429 ms, total: 1min 14s
Wall time: 1min 15s


In [49]:
# Summary statistics per group (0 = control, 1 = treatment), identifier columns excluded
vlgs_appch2.drop(columns=['PHUMCODE', 'cf_id', 'treatm_id']).groupby('Treatment').describe().round(2).T

Unnamed: 0,Treatment,0,1
dist_cf_m,count,2486.0,598.0
dist_cf_m,mean,7288.56,1601.89
dist_cf_m,std,1429.25,1364.02
dist_cf_m,min,4995.66,0.0
dist_cf_m,25%,6039.79,667.67
dist_cf_m,50%,7158.69,1268.91
dist_cf_m,75%,8488.22,2169.25
dist_cf_m,max,9992.29,9613.98
dist_fa_m,count,2486.0,598.0
dist_fa_m,mean,41097.08,23963.89


#### CPA
Tag the villages that fall inside a CPA (dummy variable) and compute the distance from treatment and control villages to the CPAs.

In [45]:
# Step 1: Spatial join keeping the villages that fall within a CPA polygon
cpa_columns = ['CPA_id', 'CPAName_Eg', 'geometry']
cpa_apprch2 = vlgs_appch2.sjoin(cpa[cpa_columns], predicate='within')

In [46]:
# Step 2: Flag the villages that are within a CPA (1 = inside, 0 = not inside).
# isin(...).astype(int) gives the same 0/1 values as np.where(..., 1, 0) without depending on
# numpy, which is not imported in the notebook's import cell.
vlgs_appch2['CPA_dummy'] = vlgs_appch2['PHUMCODE'].isin(cpa_apprch2['PHUMCODE']).astype(int)

In [47]:
# Step 3: Calculate distances from every village to the CPA layer, stored under 'dist_cpa_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)

vlgs_appch2 = calculate_distances(vlgs_appch2, cpa, 'dist_cpa_m')

In [48]:
# Rows without a treatm_id fall back to their own PHUMCODE
appch2_treatm_ids = vlgs_appch2['treatm_id']
vlgs_appch2['treatm_id'] = appch2_treatm_ids.fillna(vlgs_appch2['PHUMCODE'])

#### Export files apprch 2

In [247]:
#cfs_buffer_10km.to_file(outputs_path + '/cfs_buffer_10km/cfs_buffer_10km.shp')

#cfs_buffer_5km.to_file(outputs_path + '/cfs_buffer_5km/cfs_buffer_5km.shp')

#control_villages_appch2.to_file(outputs_path + '/control_villages_appch2/control_villages_appch2.shp')

# Write the Approach A villages (treatment + control) to a shapefile
appch_a_shp = outputs_path + '/control_villages_appchA/vlgs_appchA.shp'
vlgs_appch2.to_file(appch_a_shp)


  vlgs_appch2.to_file(outputs_path + '/control_villages_appchA/vlgs_appchA.shp')


### Approach B

Pool all villages within 5km of CFs as control (CFs)

In [49]:
# Step 1: Village/CF pairs where the village lies within the CF's 5 km buffer
control_villages_appch3 = vlgs2.sjoin(cfs_buffer_5km, predicate='within')

In [50]:
# Step 2: Distance from each village to every community forest whose buffer contains it

# Attach the CF geometry to each village/CF pair (both geometry columns get a suffix)
cf_geometries = cfs[['cf_id', 'geometry']]
control_villages_appch3 = control_villages_appch3.merge(cf_geometries, on="cf_id", suffixes=("_village", "_forest"))

# Rebuild as a GeoDataFrame with the village geometry as the active geometry
control_villages_appch3 = gpd.GeoDataFrame(control_villages_appch3, geometry="geometry_village")

# Distance between the village geometry and the forest geometry of the same row
forest_geoms = gpd.GeoSeries(control_villages_appch3["geometry_forest"], crs=control_villages_appch3.crs)
control_villages_appch3["distance_to_forest"] = control_villages_appch3["geometry_village"].distance(forest_geoms)

In [51]:
# Step 3: Drop duplicated villages, keeping for each village the pairing with the closest CF.
# Rows must be ordered by distance before de-duplicating on PHUMCODE: ordering by cf_id first
# (as this cell did before) kept the pairing with the lowest cf_id instead of the nearest CF.
control_villages_appch3 = (
    control_villages_appch3
    .sort_values(by=["distance_to_forest", "cf_id"])   # cf_id only breaks exact distance ties
    .drop_duplicates(subset='PHUMCODE', keep='first')
    .sort_values(by=["cf_id", "distance_to_forest"])   # same row ordering as the previous version
    .reset_index(drop=True)
)

In [52]:
# Add the treatment village id; the left join keeps every control village
control_villages_appch3 = control_villages_appch3.merge(cf_id_dict, how='left', on='cf_id')

# Keep only the necessary variables and standardize their names
appch3_keep = ['PHUMCODE', 'VILLAGE', 'cf_id', 'distance_to_forest', 'treatm_id', 'geometry_village']
control_villages_appch3 = control_villages_appch3[appch3_keep].rename(
    columns={'geometry_village': 'geometry', 'distance_to_forest': 'dist_forst'}
)

In [53]:
    # Convert to GeoDataFrame 
# Active geometry is the renamed 'geometry' column; the CRS is taken from vlgs
control_villages_appch3 = gpd.GeoDataFrame(data=control_villages_appch3, crs=vlgs.crs, geometry="geometry")

#### Calculate distances from treatment and control villages

In [54]:
# Stack treatment (Treatment = 1) and Approach B control (Treatment = 0) villages
control_villages_appch3['Treatment'] = 0

appch3_columns = ['PHUMCODE', 'VILLAGE','Treatment','treatm_id', 'cf_id','geometry']
vlgs_appch3 = pd.concat([treatment_villages_gdf, control_villages_appch3], ignore_index=True)[appch3_columns]

In [55]:
%%time
# Distance columns, added one layer at a time in this order
distance_targets = (
    (cfs, 'dist_cf_m'),          # village to community forest
    (fas, 'dist_fa_m'),          # village to FAS
    (forests, 'dist_forest_m'),  # village to any type of forest
)
for target_layer, distance_column in distance_targets:
    vlgs_appch3 = calculate_distances(vlgs_appch3, target_layer, distance_column)

CPU times: user 1min 4s, sys: 497 ms, total: 1min 4s
Wall time: 1min 6s


In [57]:
# Summary statistics per group (0 = control, 1 = treatment), identifier columns excluded
vlgs_appch3.drop(columns=['PHUMCODE', 'cf_id', 'treatm_id']).groupby('Treatment').describe().round(2).T

Unnamed: 0,Treatment,0,1
dist_cf_m,count,2045.0,598.0
dist_cf_m,mean,3080.04,1601.89
dist_cf_m,std,1224.48,1364.02
dist_cf_m,min,0.0,0.0
dist_cf_m,25%,2191.48,667.67
dist_cf_m,50%,3157.62,1268.91
dist_cf_m,75%,4133.99,2169.25
dist_cf_m,max,4998.12,9613.98
dist_fa_m,count,2045.0,598.0
dist_fa_m,mean,35880.99,23963.89


#### CPA

In [58]:
# Step 1: Spatial join keeping the villages that fall within a CPA polygon
cpa_columns = ['CPA_id', 'CPAName_Eg', 'geometry']
cpa_apprch3 = vlgs_appch3.sjoin(cpa[cpa_columns], predicate='within')

In [59]:
# Step 2: Flag the villages that are within a CPA (1 = inside, 0 = not inside).
# isin(...).astype(int) gives the same 0/1 values as np.where(..., 1, 0) without depending on
# numpy, which is not imported in the notebook's import cell.
vlgs_appch3['CPA_dummy'] = vlgs_appch3['PHUMCODE'].isin(cpa_apprch3['PHUMCODE']).astype(int)

In [60]:
# Step 3: Calculate distances from every village to the CPA layer, stored under 'dist_cpa_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)

vlgs_appch3 = calculate_distances(vlgs_appch3, cpa, 'dist_cpa_m')

In [61]:
# Rows without a treatm_id fall back to their own PHUMCODE
appch3_treatm_ids = vlgs_appch3['treatm_id']
vlgs_appch3['treatm_id'] = appch3_treatm_ids.fillna(vlgs_appch3['PHUMCODE'])

#### Export files apprch 3

In [None]:
#control_villages_appch3.to_file(outputs_path + '/control_villages_appch3/control_villages_appch3.shp')

# Write the Approach B villages (treatment + control) to a shapefile.
# The components carry no leading slash: os.path.join discards outputs_path when a later
# component is rooted, so the previous call targeted '/control_villages_appchB/...' instead.
vlgs_appch3.to_file(os.path.join(outputs_path, 'control_villages_appchB', 'vlgs_appchB.shp'))

  vlgs_appch3.to_file(outputs_path + '/control_villages_appchB/vlgs_appchB.shp')


### Approach C

All villages that are within 5 km of an FA and not within 10 km of any CF.

In [62]:
%%time
# Step 1: 5 km buffer around the forested areas
# 1.1 Simplify the geometries first (tolerance 50, topology preserved) to reduce the calculations
simplified_fas = fas['geometry'].simplify(tolerance=50, preserve_topology=True)
# 1.2 Buffer the simplified shapes and store them on a copy of the FA layer
fas_buffer_5km = fas.copy()
fas_buffer_5km['geometry'] = simplified_fas.buffer(5000)

CPU times: user 1min 17s, sys: 50.3 s, total: 2min 7s
Wall time: 4min 28s


In [63]:
# Store FID as text on both the buffered and the original FA layers so the later merge on FID matches
for fa_layer in (fas_buffer_5km, fas):
    fa_layer['FID'] = fa_layer['FID'].astype(str)

In [64]:
# Step 2: Village/FA pairs where the village lies within the FA's 5 km buffer
villages_within_fas = vlgs2.sjoin(fas_buffer_5km, predicate='within')

In [65]:
# Step 3: Villages that fall in no CF 10 km buffer
# NOTE(review): how='right' also keeps buffers without any village; only PHUMCODE is used below
villages_near_community = vlgs2.sjoin(cfs_buffer_10km, predicate='within', how='right')
near_cf_codes = villages_near_community['PHUMCODE'].tolist()
villages_outside_community_buffer = vlgs2.loc[~vlgs2['PHUMCODE'].isin(near_cf_codes)]

In [66]:
# Step 4: Keep the FA-buffer matches whose village is also outside every CF 10 km buffer
outside_cf_buffer = villages_within_fas['PHUMCODE'].isin(villages_outside_community_buffer['PHUMCODE'])
control_villages_appch4 = villages_within_fas.loc[outside_cf_buffer].reset_index(drop=True)

In [67]:
# Step 5: Distance from each village to every FA whose buffer contains it

# Attach the original (unbuffered) FA rows by FID; overlapping columns get the suffixes below
control_villages_appch4 = control_villages_appch4.merge(fas, on="FID", suffixes=("_village", "_fas"))

# Rebuild as a GeoDataFrame with the village geometry as the active geometry
control_villages_appch4 = gpd.GeoDataFrame(control_villages_appch4, geometry="geometry_village")

# Distance between the village geometry and the FA geometry of the same row
fas_geoms = gpd.GeoSeries(control_villages_appch4["geometry_fas"], crs=control_villages_appch4.crs)
control_villages_appch4["distance_to_forest"] = control_villages_appch4["geometry_village"].distance(fas_geoms)

In [68]:
# Step 6: Drop duplicated villages, keeping for each village the pairing with the closest FA.
# Rows must be ordered by distance before de-duplicating on PHUMCODE: ordering by FID first
# (as this cell did before) kept the pairing with the lowest FID instead of the nearest FA.
control_villages_appch4 = (
    control_villages_appch4
    .sort_values(by=["distance_to_forest", "FID"])   # FID only breaks exact distance ties
    .drop_duplicates(subset='PHUMCODE', keep='first')
    .sort_values(by=["FID", "distance_to_forest"])   # same row ordering as the previous version
    .reset_index(drop=True)
)

In [69]:
# Keep only the necessary variables and standardize their names
appch4_keep = ['PHUMCODE', 'VILLAGE', 'FID', 'distance_to_forest', 'geometry_village']
control_villages_appch4 = control_villages_appch4[appch4_keep].rename(
    columns={'geometry_village': 'geometry', 'distance_to_forest': 'dist_fas'}
)

In [70]:
    # Convert to GeoDataFrame 
# Active geometry is the renamed 'geometry' column; the CRS is taken from vlgs
control_villages_appch4 = gpd.GeoDataFrame(data=control_villages_appch4, crs=vlgs.crs, geometry="geometry")

#### Calculate distances from treatment and control villages

In [71]:
# Stack treatment (Treatment = 1) and Approach C control (Treatment = 0) villages
control_villages_appch4['Treatment'] = 0

appch4_columns = ['PHUMCODE', 'VILLAGE','Treatment', 'FID', 'dist_fas', 'geometry']
vlgs_appch4 = pd.concat([treatment_villages_gdf, control_villages_appch4], ignore_index=True)[appch4_columns]

In [72]:
%%time
# Distance columns, added one layer at a time in this order
distance_targets = (
    (cfs, 'dist_cf_m'),          # village to community forest
    (fas, 'dist_fa_m'),          # village to FAS
    (forests, 'dist_forest_m'),  # village to any type of forest
)
for target_layer, distance_column in distance_targets:
    vlgs_appch4 = calculate_distances(vlgs_appch4, target_layer, distance_column)

CPU times: user 19.2 s, sys: 184 ms, total: 19.4 s
Wall time: 19.8 s


In [73]:
# Summary statistics per group (0 = control, 1 = treatment), identifier columns excluded
vlgs_appch4.drop(columns=['PHUMCODE', 'FID','dist_fas']).groupby('Treatment').describe().round(2).T

Unnamed: 0,Treatment,0,1
dist_cf_m,count,185.0,598.0
dist_cf_m,mean,41093.71,1601.89
dist_cf_m,std,26257.7,1364.02
dist_cf_m,min,10154.26,0.0
dist_cf_m,25%,18376.33,667.67
dist_cf_m,50%,34486.94,1268.91
dist_cf_m,75%,54050.3,2169.25
dist_cf_m,max,102656.87,9613.98
dist_fa_m,count,185.0,598.0
dist_fa_m,mean,2903.38,23963.89


#### CPA

In [74]:
# Step 1: Spatial join keeping the villages that fall within a CPA polygon
cpa_columns = ['CPA_id', 'CPAName_Eg', 'geometry']
cpa_apprch4 = vlgs_appch4.sjoin(cpa[cpa_columns], predicate='within')

In [75]:
# Step 2: Flag the villages that are within a CPA (1 = inside, 0 = not inside).
# isin(...).astype(int) gives the same 0/1 values as np.where(..., 1, 0) without depending on
# numpy, which is not imported in the notebook's import cell.
vlgs_appch4['CPA_dummy'] = vlgs_appch4['PHUMCODE'].isin(cpa_apprch4['PHUMCODE']).astype(int)

In [76]:
# Step 3: Calculate distances from every village to the CPA layer, stored under 'dist_cpa_m'
# (calculate_distances is defined outside this excerpt; presumably metres in the working CRS, to be confirmed)

vlgs_appch4 = calculate_distances(vlgs_appch4, cpa, 'dist_cpa_m')

In [77]:
# Make explicit that FID identifies the forested area (FAS) matched to the village
vlgs_appch4 = vlgs_appch4.rename(columns={'FID': 'FID_fas'})

#### Export files apprch 4

In [None]:
#control_villages_appch4.to_file(outputs_path + '/control_villages_appch4/control_villages_appch4.shp')

# Write the Approach C villages (treatment + control) to a shapefile.
# The components carry no leading slash: os.path.join discards outputs_path when a later
# component is rooted, so the previous call targeted '/control_villages_appchC/...' instead.
vlgs_appch4.to_file(os.path.join(outputs_path, 'control_villages_appchC', 'vlgs_appchC.shp'))

#fas_buffer_5km.to_file(outputs_path + '/fas_buffer_5km/fas_buffer_5km.shp')


  vlgs_appch4.to_file(outputs_path + '/control_villages_appchC/vlgs_appchC.shp')
