# one option is multiprocessing and dealing with each value separately

In [1]:
import pandas as pd
import folium
import geopandas as gpd
from shapely.geometry import Polygon
import rasterio
import numpy as np
from rasterio.warp import calculate_default_transform, reproject, Resampling
import json
from rasterio.features import shapes
import multiprocessing
from multiprocessing import get_context

In [12]:
def test_multiprocess(habitat_number):
    '''
    Write a small marker file, used to confirm that a worker actually ran.

    Creates (or overwrites) ``testfile_<habitat_number>.txt`` in the current
    working directory with the text ``hello world``.
    '''
    marker_path = f"testfile_{habitat_number}.txt"
    with open(marker_path, "w") as marker_file:
        marker_file.write("hello world")

def create_one_habitat_raster_to_poly(habitat_number):
    '''
    Polygonize one habitat class of the statewide raster and save it.

    Reads band 1 of ``processed_data/statewidehabitatmap.wgs84.tif``, traces
    the pixels equal to ``habitat_number`` into polygons, and writes them as
    layer ``habitat_<habitat_number>`` of
    ``processed_data/statewidehabitat.gpkg``.

    Parameters
    ----------
    habitat_number : int
        Raster value of the habitat class to extract.

    Returns
    -------
    None
        The result is written to disk only.
    '''
    print(f"working on habitat_number {habitat_number}")

    # Use a context manager so the dataset handle is released in every worker,
    # even if reading fails. The affine transform is copied out because it is
    # still needed after the file has been closed.
    with rasterio.open('processed_data/statewidehabitatmap.wgs84.tif') as src:
        raster_arr = src.read(1)
        transform = src.transform

    # The raster is cast to int16 before being handed to shapes().
    raster_arr_int = raster_arr.astype('int16')

    # Only pixels belonging to this habitat class are traced.
    mask = raster_arr == habitat_number

    habitat_shapes = shapes(raster_arr_int, mask=mask, transform=transform)

    # Each item is (GeoJSON-like geometry, pixel value). coordinates[0] is the
    # exterior ring; any further rings are holes. Pass the holes through so a
    # polygon does not cover patches of other habitats enclosed inside it.
    habitat_polygon_geo_list = [
        Polygon(geometry['coordinates'][0], geometry['coordinates'][1:])
        for geometry, _value in habitat_shapes
    ]

    # One attribute row per polygon, all carrying the same habitat number.
    habitat_arr = np.ones(len(habitat_polygon_geo_list))*habitat_number
    habitat_gpd = gpd.GeoDataFrame(
        pd.DataFrame(data=habitat_arr, columns=['habitat_number']),
        crs=4326,
        geometry=habitat_polygon_geo_list,
    )

    # NOTE(review): having all workers write different layers into the same
    # GeoPackage did not work when run in parallel -- possibly the file cannot
    # be written while another process has it open; to be confirmed.
    habitat_gpd.to_file("processed_data/statewidehabitat.gpkg",driver='GPKG',layer=f'habitat_{habitat_number}')
    # Alternative that avoids the shared file: one GeoJSON per habitat.
    # habitat_gpd.to_file(f"processed_data/statewidehabitat_{habitat_number}.geojson",driver='GeoJSON')
    

In [3]:
# Habitat class values present in the raster: 1 through 77 inclusive.
habitat_number_list = list(range(1, 78))

In [13]:
# Size the worker pool to the CPU count reported by multiprocessing,
# i.e. one worker process per CPU.
cpu_count = multiprocessing.cpu_count()
pool = multiprocessing.Pool(cpu_count)

In [11]:
# Trial run on a two-element slice of the habitat list (indices 12 and 13)
# rather than on all habitats.
result = pool.map(
    func=create_one_habitat_raster_to_poly,
    iterable=habitat_number_list[12:14],
)

# notes
- think about ordering the list based on "frequency of counts"
- make this a git repo
- lower resolution solution

# test with one habitat

In [58]:
# Polygonize a single habitat class (value 1) as a trial run.
# Based on docs -- https://rasterio.readthedocs.io/en/latest/topics/features.html

habitat_number = 1

# Boolean mask selecting only the pixels of this habitat class.
mask = (raster_arr == habitat_number)

# Iterable of (GeoJSON-like geometry, pixel value) pairs for the masked pixels.
habitat_shapes = shapes(
    raster_arr_int,
    mask=mask,
    transform=src.transform,
)

In [59]:
# Materialize the shapes so they can be indexed and iterated more than once.
habitat_shapes_list = [*habitat_shapes]

In [60]:
# Inspect the first entry: a (geometry dict, pixel value) pair.
habitat_shapes_list[0]

({'type': 'Polygon',
  'coordinates': [[(-121.85162227591015, 45.6886764869563),
    (-121.85162227591015, 45.68833291878586),
    (-121.85093513956929, 45.68833291878586),
    (-121.85093513956929, 45.6886764869563),
    (-121.85162227591015, 45.6886764869563)]]},
 1.0)

In [61]:
# Peek at the coordinate rings of the first two shapes only.
[geometry['coordinates'] for geometry, _value in habitat_shapes_list[:2]]

[[[(-121.85162227591015, 45.6886764869563),
   (-121.85162227591015, 45.68833291878586),
   (-121.85093513956929, 45.68833291878586),
   (-121.85093513956929, 45.6886764869563),
   (-121.85162227591015, 45.6886764869563)]],
 [[(-121.79458995961834, 45.666344555878176),
   (-121.79458995961834, 45.66600098770775),
   (-121.7942463914479, 45.66600098770775),
   (-121.7942463914479, 45.666344555878176),
   (-121.79458995961834, 45.666344555878176)]]]

In [62]:
# Build one shapely Polygon per traced shape. In the GeoJSON-like geometry,
# coordinates[0] is the exterior ring and any further rings are holes; pass
# the holes through so a polygon does not cover patches of other habitats
# that it encloses.
habitat_polygon_geo_list = [
    Polygon(geometry['coordinates'][0], geometry['coordinates'][1:])
    for geometry, _value in habitat_shapes_list
]

In [66]:
# One float entry per polygon, each holding the habitat number.
habitat_arr = np.full(len(habitat_polygon_geo_list), habitat_number, dtype=float)

In [70]:
# Pair every polygon with its habitat number; CRS is given as EPSG code 4326.
habitat_gpd = gpd.GeoDataFrame(
    pd.DataFrame({'habitat_number': habitat_arr}),
    geometry=habitat_polygon_geo_list,
    crs=4326,
)

In [71]:
# Save the polygons as their own named layer inside the GeoPackage.
habitat_gpd.to_file(
    "processed_data/statewidehabitat.gpkg",
    layer=f'habitat_{habitat_number}',
    driver='GPKG',
)

In [73]:
# Loop through all habitat numbers and write each one as its own layer.
# We know there are 77 habitat classes (values 1..77).
# Based on docs -- https://rasterio.readthedocs.io/en/latest/topics/features.html

for habitat_number in range(1,78):
    print(f"working on habitat_number {habitat_number}")
    # Only pixels belonging to the current habitat class are traced.
    mask = raster_arr == habitat_number

    habitat_shapes = shapes(raster_arr_int, mask=mask, transform=src.transform)
    # coordinates[0] is the exterior ring; any further rings are holes. Keep
    # the holes so a polygon does not cover enclosed patches of other habitats.
    habitat_polygon_geo_list = [
        Polygon(geometry['coordinates'][0], geometry['coordinates'][1:])
        for geometry, _value in habitat_shapes
    ]
    # One attribute row per polygon, all carrying the same habitat number.
    habitat_arr = np.ones(len(habitat_polygon_geo_list))*habitat_number
    habitat_gpd = gpd.GeoDataFrame(pd.DataFrame(data=habitat_arr, columns=['habitat_number'])
                                    ,crs=4326
                                    ,geometry=habitat_polygon_geo_list)
    habitat_gpd.to_file("processed_data/statewidehabitat.gpkg",driver='GPKG',layer=f'habitat_{habitat_number}')

working on habitat_number 1
working on habitat_number 2
working on habitat_number 3


KeyboardInterrupt: 

## can we parallelize the above?