# Create vector datasets for cement, steel, and landcover classes

This notebook creates points for chip centers that contain known cement and steel plants. It also
creates landcover classes offset from the plants for use as negative examples for the macro-localization model training.

## Import libraries

In [None]:
import os
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
import numpy as np

## Define input and output files

* Input: Cement and steel plants (v4.1) with exact locations in China (csv)
* Output:
    * Points centered on cement and steel plants, with full attributes
    * Chip centers and extents for cement, steel, and landcover classes for use in macro-localization model building

### Input files

In [None]:
# Source CSVs of plants with exact locations (one file per plant type).
cement_input_csv, steel_input_csv = (
    '../../resources/asset-subsets-v4p1/cement_exact_china_v4.1.csv',
    '../../resources/asset-subsets-v4p1/steel_exact_china_v4.1.csv',
)

### Output files

In [None]:
# Directory that receives every GeoJSON file written by this notebook.
output_path = "../../resources/macro-loc-model-build4"

In [None]:
# Create the output directory, including any missing parent directories.
# exist_ok=True makes the call a no-op when the directory already exists,
# which avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Full-attribute plant points, one GeoJSON per plant type.
cement_output_gjson = output_path+"/cement_exact_china_v4.1_s2.geojson"
steel_output_gjson = output_path+"/steel_exact_china_v4.1_s2.geojson"

In [None]:
# Chip outputs: for each class, one file of chip centers ("cntr") and one
# file of chip extents ("ext").
cement_chip_cntr_gjson = f"{output_path}/cement_chip_cntr_china_v4.1_s2.geojson"
cement_chip_ext_gjson = f"{output_path}/cement_chip_ext_china_v4.1_s2.geojson"

steel_chip_cntr_gjson = f"{output_path}/steel_chip_cntr_china_v4.1_s2.geojson"
steel_chip_ext_gjson = f"{output_path}/steel_chip_ext_china_v4.1_s2.geojson"

landcover_chip_cntr_gjson = f"{output_path}/landcover_chip_cntr_china_v4.1_s2.geojson"
landcover_chip_ext_gjson = f"{output_path}/landcover_chip_ext_china_v4.1_s2.geojson"

## Define parameters

* `buffer_size`: 0.5*length of chips centered on steel, cement, or landcover
* `calc_crs`: coordinate system in m to use for buffer, offset calculations
* `n_chips`: number of chips per plant to create; landcover class creates 8 chips per plant

In [None]:
# CRS used for buffer and offset calculations.
calc_crs = "EPSG:3395"

# Number of chips to create per cement/steel plant.
n_chips = 4

# Half of the chip side length, in m.
buffer_size = 1500

## Create cement plant geojson file

In [None]:
# Load the cement plant table; index_col=False keeps the first CSV column
# as data rather than using it as the index.
cement_pdf = pd.read_csv(filepath_or_buffer=cement_input_csv, index_col=False)

### Statistics and quality checks

In [None]:
# Number of plants
tot_cement_cnt = len(cement_pdf)
print("Count of cement plants: ", tot_cement_cnt)

# Number of plants with null positions: a position is unusable if either
# coordinate is missing, so count rows where latitude OR longitude is null.
null_cnt = int(cement_pdf[['latitude', 'longitude']].isnull().any(axis=1).sum())
print("Count of cement plants with null position values: ", null_cnt)

### Create GeoDataFrame (define geometry and crs)

In [None]:
# Build point geometries from the longitude (x) / latitude (y) columns and
# attach them to the plant table, declared as EPSG:4326.
cement_points = gpd.points_from_xy(cement_pdf['longitude'], cement_pdf['latitude'])
cement_gdf = gpd.GeoDataFrame(cement_pdf, geometry=cement_points, crs="EPSG:4326")

### Write GeoJson file

In [None]:
# Write the cement plant points, with all attributes, as GeoJSON.
cement_gdf.to_file(filename=cement_output_gjson, driver="GeoJSON")

## Create steel plant geojson file

In [None]:
# Load the steel plant table; index_col=False keeps the first CSV column
# as data rather than using it as the index.
steel_pdf = pd.read_csv(filepath_or_buffer=steel_input_csv, index_col=False)

### Statistics and quality checks

In [None]:
# Number of plants
tot_steel_cnt = len(steel_pdf)
print("Count of steel plants: ", tot_steel_cnt)

# Number of plants with null positions: a position is unusable if either
# coordinate is missing, so count rows where latitude OR longitude is null.
null_cnt = int(steel_pdf[['latitude', 'longitude']].isnull().any(axis=1).sum())
print("Count of steel plants with null position values: ", null_cnt)

### Create GeoDataFrame (define geometry and crs)

In [None]:
# Build point geometries from the longitude (x) / latitude (y) columns and
# attach them to the plant table, declared as EPSG:4326.
steel_points = gpd.points_from_xy(steel_pdf['longitude'], steel_pdf['latitude'])
steel_gdf = gpd.GeoDataFrame(steel_pdf, geometry=steel_points, crs="EPSG:4326")

### Write GeoJson file

In [None]:
# Write the steel plant points, with all attributes, as GeoJSON.
steel_gdf.to_file(filename=steel_output_gjson, driver="GeoJSON")

## Create Cement Chip Centroids

### Convert to physical crs

In [None]:
# Reproject the cement plants into the calculation CRS and keep only the
# plant id and geometry columns.
cement_phys_gdf = cement_gdf.to_crs(calc_crs)[['uid', 'geometry']]

### Create chip centroids randomly offset from plant center

In [None]:
# Build chip centers for every cement plant: each plant gets `n_chips`
# centers, each randomly offset from the plant by up to `buffer_size` in
# both x and y (in the units of `calc_crs`).

# (low, high) bounds for the random dx and dy offsets, keyed by n % 4, so
# that consecutive chips rotate through the four quadrants around the plant.
quadrant_bounds = {
    1: ((0, buffer_size), (0, buffer_size)),      # +x, +y
    2: ((-buffer_size, 0), (0, buffer_size)),     # -x, +y
    3: ((-buffer_size, 0), (-buffer_size, 0)),    # -x, -y
    0: ((0, buffer_size), (-buffer_size, 0)),     # +x, -y
}

# Collect the one-row frames in a fresh list and concatenate once at the end.
# Starting from an empty list means re-running this cell rebuilds the result
# instead of appending to whatever `cement_chip_cntr_gdf` held from an earlier
# run, and a single concat avoids re-copying the growing frame on every chip.
cement_chip_frames = []

# Loop over all cement plants
for ind in range(tot_cement_cnt):

    # One-row frame holding this plant's uid and geometry
    cement_plant_row = cement_phys_gdf.iloc[[ind]]

    # Loop over number of chips per plant to create
    for n in range(1, n_chips + 1):

        # Spread chip centers evenly over the four quadrants split by plant center
        (dx_low, dx_high), (dy_low, dy_high) = quadrant_bounds[n % 4]
        dx = np.random.uniform(low=dx_low, high=dx_high)
        dy = np.random.uniform(low=dy_low, high=dy_high)

        # Shift the plant point by the random offset
        shifted_geom = cement_plant_row.geometry.translate(dx, dy)
        shifted_geom_gdf = gpd.GeoDataFrame({'uid': cement_plant_row.uid,
                                             'tile_id': cement_plant_row.uid+'-'+str(n).zfill(2)},
                                            geometry=shifted_geom,
                                            crs=calc_crs)

        # Distance from the shifted chip center back to the plant location
        shifted_geom_gdf['dist_m'] = shifted_geom_gdf.distance(cement_plant_row)

        cement_chip_frames.append(shifted_geom_gdf)

# Fail with a clear message instead of leaving the result undefined or stale.
if not cement_chip_frames:
    raise ValueError("No cement chip centers were created; "
                     "check tot_cement_cnt and n_chips.")

cement_chip_cntr_gdf = pd.concat(cement_chip_frames, ignore_index=True)

### Write chip centroids and extents out to GeoJson

In [None]:
# Chip extent = bounding box (envelope) of the chip center buffered by
# buffer_size, computed in the calculation CRS.
cement_chip_boxes = cement_chip_cntr_gdf.buffer(buffer_size).envelope
cement_chip_attrs = cement_chip_cntr_gdf[['uid', 'tile_id', 'dist_m']]

# Attach the boxes to the chip attributes, reproject to EPSG:4326, and write out.
cement_chip_ext_gdf = gpd.GeoDataFrame(
    cement_chip_attrs,
    geometry=cement_chip_boxes,
    crs=calc_crs,
).to_crs('EPSG:4326')
cement_chip_ext_gdf.to_file(cement_chip_ext_gjson, driver='GeoJSON')

In [None]:
# Reproject the chip centers to EPSG:4326 (rebinding the same name), then
# write them out as GeoJSON.
cement_chip_cntr_gdf = cement_chip_cntr_gdf.to_crs(crs='EPSG:4326')
cement_chip_cntr_gdf.to_file(filename=cement_chip_cntr_gjson, driver='GeoJSON')

## Create Steel Chip Centroids

### Convert to physical crs

In [None]:
# Reproject the steel plants into the calculation CRS and keep only the
# plant id and geometry columns.
steel_phys_gdf = steel_gdf.to_crs(calc_crs)[['uid', 'geometry']]

### Create chip centroids randomly offset from plant center

In [None]:
# Build chip centers for every steel plant: each plant gets `n_chips`
# centers, each randomly offset from the plant by up to `buffer_size` in
# both x and y (in the units of `calc_crs`).

# (low, high) bounds for the random dx and dy offsets, keyed by n % 4, so
# that consecutive chips rotate through the four quadrants around the plant.
quadrant_bounds = {
    1: ((0, buffer_size), (0, buffer_size)),      # +x, +y
    2: ((-buffer_size, 0), (0, buffer_size)),     # -x, +y
    3: ((-buffer_size, 0), (-buffer_size, 0)),    # -x, -y
    0: ((0, buffer_size), (-buffer_size, 0)),     # +x, -y
}

# Collect the one-row frames in a fresh list and concatenate once at the end.
# Starting from an empty list means re-running this cell rebuilds the result
# instead of appending to whatever `steel_chip_cntr_gdf` held from an earlier
# run, and a single concat avoids re-copying the growing frame on every chip.
steel_chip_frames = []

# Loop over all steel plants
for ind in range(tot_steel_cnt):

    # One-row frame holding this plant's uid and geometry
    steel_plant_row = steel_phys_gdf.iloc[[ind]]

    # Loop over number of chips per plant to create
    for n in range(1, n_chips + 1):

        # Spread chip centers evenly over the four quadrants split by plant center
        (dx_low, dx_high), (dy_low, dy_high) = quadrant_bounds[n % 4]
        dx = np.random.uniform(low=dx_low, high=dx_high)
        dy = np.random.uniform(low=dy_low, high=dy_high)

        # Shift the plant point by the random offset
        shifted_geom = steel_plant_row.geometry.translate(dx, dy)
        shifted_geom_gdf = gpd.GeoDataFrame({'uid': steel_plant_row.uid,
                                             'tile_id': steel_plant_row.uid+'-'+str(n).zfill(2)},
                                            geometry=shifted_geom,
                                            crs=calc_crs)

        # Distance from the shifted chip center back to the plant location
        shifted_geom_gdf['dist_m'] = shifted_geom_gdf.distance(steel_plant_row)

        steel_chip_frames.append(shifted_geom_gdf)

# Fail with a clear message instead of leaving the result undefined or stale.
if not steel_chip_frames:
    raise ValueError("No steel chip centers were created; "
                     "check tot_steel_cnt and n_chips.")

steel_chip_cntr_gdf = pd.concat(steel_chip_frames, ignore_index=True)

### Write chip centroids and extents out to GeoJson

In [None]:
# Chip extent = bounding box (envelope) of the chip center buffered by
# buffer_size, computed in the calculation CRS.
steel_chip_boxes = steel_chip_cntr_gdf.buffer(buffer_size).envelope
steel_chip_attrs = steel_chip_cntr_gdf[['uid', 'tile_id', 'dist_m']]

# Attach the boxes to the chip attributes, reproject to EPSG:4326, and write out.
steel_chip_ext_gdf = gpd.GeoDataFrame(
    steel_chip_attrs,
    geometry=steel_chip_boxes,
    crs=calc_crs,
).to_crs('EPSG:4326')
steel_chip_ext_gdf.to_file(steel_chip_ext_gjson, driver='GeoJSON')

In [None]:
# Reproject the chip centers to EPSG:4326 (rebinding the same name), then
# write them out as GeoJSON.
steel_chip_cntr_gdf = steel_chip_cntr_gdf.to_crs(crs='EPSG:4326')
steel_chip_cntr_gdf.to_file(filename=steel_chip_cntr_gjson, driver='GeoJSON')

## Create Landcover Chip Centroids

* Take cement and steel plant locations
* Shift coordinates by large factors of buffer_size, so that we get a ring of landcover chips surrounding, but not including, the plant
* Remove landcover chips that would intersect with other cement and steel plants

### Merge cement and steel plant centers

In [None]:
# Stack the cement and steel plant geometries (both in calc_crs) into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# the old append call, the original row indexes are kept as-is.
plant_gdf = pd.concat([cement_phys_gdf[['geometry']], steel_phys_gdf[['geometry']]])

### Create landcover chip centers

In [None]:
# (x, y) offsets of the eight landcover chips around each plant, expressed as
# multiples of buffer_size.
ring_offsets = [(3, 0), (2, 2), (0, 3), (-2, 2), (-3, 0), (-2, -2), (0, -3), (2, -2)]
dx_fac = [x_fac for x_fac, _ in ring_offsets]
dy_fac = [y_fac for _, y_fac in ring_offsets]

In [None]:
# Build landcover chip centers: one shifted copy of every plant location per
# (dx_fac, dy_fac) pair, offset by that multiple of buffer_size.

# zip() would silently drop offsets if the two factor lists got out of sync.
if len(dx_fac) != len(dy_fac):
    raise ValueError("dx_fac and dy_fac must have the same length")

# Build all shifted frames first and concatenate once. Starting from a fresh
# list means re-running this cell rebuilds the result instead of appending to
# a `landcover_chip_cntr_gdf` left over from an earlier run.
landcover_chip_frames = [
    # Shift plant locations by buffer size factor
    gpd.GeoDataFrame(geometry=plant_gdf.translate(dxf*buffer_size, dyf*buffer_size),
                     crs=calc_crs)
    for dxf, dyf in zip(dx_fac, dy_fac)
]

landcover_chip_cntr_gdf = pd.concat(landcover_chip_frames, ignore_index=True)

In [None]:
# Give every landcover chip a sequential id of the form CHN0001-LC, CHN0002-LC, ...
# (1-based, zero-padded to at least four digits).
landcover_chip_total = len(landcover_chip_cntr_gdf)
landcover_chip_cntr_gdf['tile_id'] = [
    "CHN" + str(seq).zfill(4) + '-LC'
    for seq in range(1, landcover_chip_total + 1)
]

### Compute tile extents and convert to EPSG:4326

In [None]:
# Chip extent = bounding box (envelope) of the chip center buffered by
# buffer_size, computed in the calculation CRS.
landcover_chip_boxes = landcover_chip_cntr_gdf.buffer(buffer_size).envelope
landcover_chip_attrs = landcover_chip_cntr_gdf[['tile_id']]

# Attach the boxes to the tile ids and reproject to EPSG:4326.
landcover_chip_ext_gdf = gpd.GeoDataFrame(
    landcover_chip_attrs,
    geometry=landcover_chip_boxes,
    crs=calc_crs,
).to_crs('EPSG:4326')

In [None]:
# Reproject the landcover chip centers to EPSG:4326 (rebinding the same name).
landcover_chip_cntr_gdf = landcover_chip_cntr_gdf.to_crs(crs="EPSG:4326")

### Remove landcover chips that intersect with known plants

In [None]:
# Find landcover chip extents that contain/touch a known cement plant.
# The spatial predicate is left at sjoin's default ("intersects"): the `op=`
# keyword was deprecated in geopandas 0.10 and removed in 1.0, while its
# replacement `predicate=` does not exist before 0.10, so relying on the
# default works on both old and new geopandas.
lc_intsct_cement_gdf = gpd.sjoin(landcover_chip_ext_gdf, cement_gdf, how='inner')

# Drop those chips from both the extents and the centers (matched on tile_id).
cement_hit_tile_ids = lc_intsct_cement_gdf.tile_id
landcover_chip_ext_gdf = landcover_chip_ext_gdf[~landcover_chip_ext_gdf.tile_id.isin(cement_hit_tile_ids)]
landcover_chip_cntr_gdf = landcover_chip_cntr_gdf[~landcover_chip_cntr_gdf.tile_id.isin(cement_hit_tile_ids)]

In [None]:
# Find landcover chip extents that contain/touch a known steel plant.
# The spatial predicate is left at sjoin's default ("intersects"): the `op=`
# keyword was deprecated in geopandas 0.10 and removed in 1.0, while its
# replacement `predicate=` does not exist before 0.10, so relying on the
# default works on both old and new geopandas.
lc_intsct_steel_gdf = gpd.sjoin(landcover_chip_ext_gdf, steel_gdf, how='inner')

# Drop those chips from both the extents and the centers (matched on tile_id).
steel_hit_tile_ids = lc_intsct_steel_gdf.tile_id
landcover_chip_ext_gdf = landcover_chip_ext_gdf[~landcover_chip_ext_gdf.tile_id.isin(steel_hit_tile_ids)]
landcover_chip_cntr_gdf = landcover_chip_cntr_gdf[~landcover_chip_cntr_gdf.tile_id.isin(steel_hit_tile_ids)]

### Write landcover chips to GeoJSON

In [None]:
# Write the remaining landcover chips as GeoJSON: centers first, then extents.
landcover_outputs = (
    (landcover_chip_cntr_gdf, landcover_chip_cntr_gjson),
    (landcover_chip_ext_gdf, landcover_chip_ext_gjson),
)
for lc_gdf, lc_gjson in landcover_outputs:
    lc_gdf.to_file(lc_gjson, driver='GeoJSON')