In [None]:
import os
import sys

import fiona
import geopandas as gpd
import pandas as pd
import shapefile
import yaml
from shapely.geometry import shape, mapping
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections import raster

# v1: pyshp + Fiona version (one shapefile per group of country codes)

In [2]:
# Input shapefiles, ordered so that the list position matches the numeric
# suffix of the file name (position 0 -> gadm36_0, and so on).
shps = [
    '../Shapefiles/GADM/gadm36_0.shp',
    '../Shapefiles/GADM/gadm36_1.shp',
    '../Shapefiles/GADM/gadm36_2.shp',
]

In [2]:
def get_shape_tuple(shape_record_tuple, i):
    """Flatten a (shape, record) pair into a 4-tuple.

    Args:
        shape_record_tuple: two-item iterable holding a shape and its record.
        i: level of the file the pair was read from; it decides which record
            fields are extracted.

    Returns:
        ``(shape, record[0], adm1, adm2)`` where ``adm1`` is ``record[2]``
        when ``i >= 1`` and ``adm2`` is ``record[5]`` when ``i == 2``;
        otherwise the respective slot is ``None``.
        (Presumably these fields are the GADM GID_0 / GID_1 / GID_2 codes --
        TODO confirm against the shapefile field layout.)
    """
    geometry, fields = shape_record_tuple
    top_level = fields[0]
    first_level = fields[2] if i >= 1 else None
    second_level = fields[5] if i == 2 else None
    return geometry, top_level, first_level, second_level

# Load the country definitions; every entry carries a 'codes' list.
with open('../Data/Countries_edited.yml') as f:
    countries = yaml.safe_load(f)

# Map each code to the set of all codes listed alongside it, so that a shape
# tagged with one code is also filed under its aliases.
codes = {}
for country in countries:
    for code in country['codes']:
        codes.setdefault(code, set()).update(country['codes'])

# For every code keep the shapes of the LAST file in `shps` that mentions it:
# each round builds its own dict and `update` replaces the lists gathered in
# earlier rounds for the same code.
shapes_by_country = {}
for i, shp_path in enumerate(shps):
    # The context manager closes the underlying .shp/.dbf/.shx handles;
    # shapes() and records() return fully materialised lists, so they remain
    # usable after the reader is closed.
    with shapefile.Reader(shp_path) as reader:
        shapes = reader.shapes()
        records = reader.records()

    round_shapes_by_country = {}
    # `shp_shape` (not `shape`) so the `shape` imported from shapely.geometry
    # is not overwritten at notebook scope.
    for shp_shape, record in tqdm(zip(shapes, records), total=len(shapes), desc=str(i)):
        r = get_shape_tuple((shp_shape, record), i)
        adm0_code = r[1]

        # Codes without an entry in `codes` are filed under themselves only.
        for code in codes.get(adm0_code, [adm0_code]):
            round_shapes_by_country.setdefault(code, []).append(r)
    shapes_by_country.update(round_shapes_by_country)
    # Release the per-file lists before the next (larger) file is read.
    del shapes, records

assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

0: 100%|██████████| 256/256 [00:00<00:00, 228309.98it/s]
1: 100%|██████████| 3610/3610 [00:00<00:00, 4120.06it/s]
2: 100%|██████████| 45962/45962 [00:00<00:00, 359162.36it/s]


In [3]:
# Group the codes that point at the very same list of shapes, so that every
# distinct shape list is kept once together with all codes that share it.
# `names[k]` holds the codes belonging to `records[k]`.
names = []
records = []

for code, code_shapes in shapes_by_country.items():
    for group_codes, group_shapes in zip(names, records):
        if group_shapes == code_shapes:
            # Shape list already seen: only remember the additional code.
            group_codes.append(code)
            break
    else:
        # First time this shape list shows up: open a new group.
        names.append([code])
        records.append(code_shapes)

print(len(records))

276


In [4]:
# Layout shared by all output files: polygon geometry plus three string
# attributes, one per entry of the shape tuples after the geometry.
property_names = ('adm0', 'adm1', 'adm2')
schema = {
    'geometry': 'Polygon',
    'properties': dict.fromkeys(property_names, 'str'),
}

# One shapefile per group; the file name is the group's codes joined by '_'.
for group_codes, group_shapes in tqdm(zip(names, records), total=len(names)):
    out_path = '../Shapefiles/preprocessed/' + '_'.join(group_codes) + '.shp'
    with fiona.open(out_path, 'w', driver='ESRI Shapefile', schema=schema) as c:
        for sh, adm0, adm1, adm2 in group_shapes:
            feature = {
                'geometry': mapping(sh),
                'properties': dict(zip(property_names, (adm0, adm1, adm2))),
            }
            c.write(feature)

100%|██████████| 276/276 [06:36<00:00,  1.44s/it]


# GeoPandas (SHP) version

In [None]:
# Also save a shp version: a single GeoDataFrame that starts from the last
# file in `shps` and is back-filled with the countries (GID_0 values) that
# only appear in the earlier files.
adms = [gpd.read_file(shp) for shp in tqdm(shps, desc='reading shps')]

complete = adms[-1]
for adm in tqdm(reversed(adms[:-1]), desc='finding shapes'):
    new_gid = set(adm['GID_0']).difference(complete['GID_0'])
    missing_gid = adm[adm['GID_0'].isin(new_gid)]
    if not missing_gid.empty:
        # DataFrame.append no longer exists in pandas >= 2.0, hence pd.concat.
        # ignore_index=True is essential: every file was read with its own
        # 0..n-1 index, so keeping the labels would create duplicates and the
        # label-based drop further down would delete unrelated rows.
        complete = pd.concat([complete, missing_gid], ignore_index=True)

assert complete.index.is_unique, 'row labels must be unique before splitting'
print(complete.shape)

# Split big shapes: every geometry whose bounding-box area exceeds 10 is
# replaced by the pieces returned by raster.quadrat_cut_geometry, numbered in
# the 'portion' column. (Area unit and piece type depend on the project's
# `raster` helpers -- presumably squared degrees; TODO confirm.)
complete['portion'] = None
new_rows = []
split_labels = []
for idx, row in tqdm(complete.iterrows(), total=complete.shape[0], desc='Splitting'):
    area = raster.get_bounding_box_area(row['geometry'])
    if area > 10:
        cutted_geometry = raster.quadrat_cut_geometry(row['geometry'], quadrat_width=1)
        for i, geometry in enumerate(cutted_geometry):
            new_row = row.copy()
            new_row['geometry'] = geometry
            new_row['portion'] = i
            new_rows.append(new_row)
        # Drop all split originals at once after the loop instead of mutating
        # the frame while iterating over it.
        split_labels.append(idx)

if split_labels:
    complete = complete.drop(index=split_labels)

if new_rows:
    pieces = gpd.GeoDataFrame(new_rows, crs=complete.crs)
    complete = pd.concat([complete, pieces], ignore_index=True)

print(complete.shape)
print(complete['portion'].max())

complete.to_file("../Shapefiles/preprocessed/all_countries.shp")

reading shps: 100%|██████████| 3/3 [01:11<00:00, 23.93s/it]
finding shapes: 2it [00:00, 11.35it/s]
Splitting:   0%|          | 0/46782 [00:00<?, ?it/s]

(46782, 19)


Splitting: 100%|██████████| 46782/46782 [33:51<00:00, 23.03it/s]  


In [None]:
# Sanity check of the merged frame: its (rows, columns) shape, followed by
# the largest value found in the 'portion' column.
n_rows, n_cols = complete.shape
print((n_rows, n_cols))
largest_portion = complete['portion'].max()
print(largest_portion)

# With ethnicity

In [2]:
# Combine the administrative shapes with the ethnic-group shapes into one
# shapefile that shares a common 'id' column.
adm = gpd.read_file("../Shapefiles/preprocessed/all_countries.shp")
ethnic = gpd.read_file('../Shapefiles/ethnic_preprocessed/tribe_adm0_s.shp')

# Administrative id: GID_2 where present, else GID_1, else GID_0.
# Plain assignment of the chained fillna result replaces the former
# `adm['id'].fillna(..., inplace=True)` calls, which act on a temporary Series
# and leave the frame untouched under pandas Copy-on-Write.
adm['id'] = adm['GID_2'].fillna(adm['GID_1']).fillna(adm['GID_0'])

# Ethnic id: country code and group name joined by a double underscore.
ethnic['id'] = ethnic['GID_0'] + '__' + ethnic['NAME']

columns = ['id', 'GID_0', 'portion', 'geometry']
# DataFrame.append no longer exists in pandas >= 2.0; pd.concat keeps the
# GeoDataFrame type. Both inputs carry their own 0..n-1 index, so the index is
# rebuilt to avoid duplicate row labels.
all_geo = pd.concat([ethnic[columns], adm[columns]], ignore_index=True)
all_geo.to_file("../Shapefiles/preprocessed/all_countries_with_eth.shp")