# EBSA Processing

# Set up

Load modules

In [2]:
from cartoframes.auth import set_default_credentials
from cartoframes import read_carto, to_carto
import geopandas as gpd
import pandas as pd
import os
import logging 
from shapely import geometry
import requests
import re
from bs4 import BeautifulSoup
import glob
from zipfile import ZipFile
import shutil

NumExpr defaulting to 8 threads.


Set up logging

In [2]:
# get top-level logger object
logger = logging.getLogger()
for handler in logger.handlers: logger.removeHandler(handler)
# manually set level 
logger.setLevel(logging.INFO)
# print to console
console = logging.StreamHandler()
logger.addHandler(console)
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

Set data directory

In [9]:
data_dir = "data"

## Download EBSAs using url list from OneDrive

Copy csv of the urls to each EBSA page in the CBD clearing house to data directory

In [10]:
logger.info('Pulling EBSAs from OneDrive')
# copy the csv with the urls for the EBSA jsons
raw_data_file = os.path.join(os.getenv("OCEANWATCH_DATA_DIR"),'EBSA_url_list.csv')
dest_dir = os.path.join(data_dir, os.path.basename(raw_data_file))
shutil.copy(raw_data_file, dest_dir)

Pull EBSAs from OneDrive


'data/EBSA_url_list.csv'

Read the csv as a pandas dataframe

In [11]:
# read in the csv with the urls for the EBSA jsons
url_df = pd.read_csv(raw_data_file,encoding='latin-1')
url_list = url_df['CHM Url']

Scrape the webpage for each EBSA for the url to download the geojson and download it

In [13]:
# regex pattern for the finding a geojson 
match_st = re.compile(r'geojson') 
for url in url_list:
    # scrape the page for the geojson
    r = requests.get(url)  
    c = r.content 
    soup = BeautifulSoup(c)
    for link in soup.findAll('a', attrs={'href': re.compile("geojson$")}):
        href  = link.get('href')
        url = 'https://chm.cbd.int' + href
        # path to the raw data 
        raw_data_file = os.path.join(data_dir, os.path.basename(url))
        # download raw data
        r = requests.get(url)

NameError: name 'url_list' is not defined

## Merge the EBSA polygons into one shapefile

Create list of the geojsons

In [18]:
ebsa_files = glob.glob(os.path.join(data_dir, '*geojson'))
gdf_list = []

Iterate through geojson files to read each as a geodataframe 

In [20]:
for file in ebsa_files:
    try:
        gdf = gpd.read_file(file)
        gdf_list.append(gdf)
    except Exception:
        print("Could not read" + file)
     

`data/SIO_3_EBSA.geojson' not recognized as a supported file format.


Could not readdata/SIO_3_EBSA.geojson


Merge the geodataframes

In [21]:
# merge geopandas dataframes 
gdf_ebsa = gpd.GeoDataFrame(pd.concat(gdf_list))
print ("Length EBSA = " + str(len(gdf_ebsa.index)))

Length EBSA = 479
                                         NAME  \
0    Wrangel-Herald Shallow and Ratmanov Gyre   
0                 The Small Phyllophora Field   
0                            Oman Arabian Sea   
0                         Northeastern Honshu   
0  Sandspit/Hawks Bay and Adjacent Backwaters   

                                          Workshop  EBSA_ID GLOBAL_ID  \
0                                           Arctic       10    ARC_10   
0                        Black Sea and Caspian Sea        6    BSCS_6   
0  North-West Indian Ocean and Adjacent Gulf Areas       29   NWIO_29   
0                                Seas of East Asia       35     EA_35   
0  North-West Indian Ocean and Adjacent Gulf Areas       16   NWIO_16   

                                            geometry  
0  MULTIPOLYGON (((180.00000 70.14896, 179.99272 ...  
0  POLYGON ((33.16825 45.80095, 33.15300 45.90800...  
0  POLYGON ((59.45319 22.62132, 59.47155 22.63917...  
0  MULTIPOLYGON (((141.73138

In [25]:
# save processed dataset to shapefile
gdf_ebsa.to_file('merged_ebsa.shp',driver='ESRI Shapefile')
gdf_ebsa = '/home/rthoms/Github/resource-watch/wri-projects/ocean-watch/processing-scripts/biodiversity-protection/EBSA/merged_ebsa.shp'




## Dissolve the EBSAs

In [4]:
ebsa = '/home/rthoms/Github/resource-watch/wri-projects/ocean-watch/processing-scripts/biodiversity-protection/EBSA/merged_ebsa.shp'
gdf_ebsa = gpd.read_file(ebsa)

Create a field that is the same across all EBSAs to use in the dissolve

In [11]:
print("Length merged EBSA gdf = " + len(gdf_ebsa.index))
gdf_ebsa['dissolve']= 1

271


Use the dissolve field to flatten the entire dataset, avoiding any double counting

In [12]:
# dissolve
gdf_ebsa = gdf_ebsa.dissolve('dissolve')
print ("Length dissolved EBSA gdf = " + str(len(gdf_ebsa.index)))




Length EBSA = 1


In [14]:
# save processed dataset to shapefile
gdf_ebsa['NAME'] = 'EBSA'
gdf_ebsa['Workshop'] = 'EBSA'
gdf_ebsa['EBSA_ID'] = 0
gdf_ebsa['GLOBAL_ID'] = 0
gdf_ebsa.to_file('dissolved_ebsa.shp',driver='ESRI Shapefile')