# Create vector datasets for cement, steel, and landcover classes

This notebook creates points centered on cement and steel plants. It also
creates landcover classes offset from the cement and steel plants for use as negative examples in macro-localization model training.

## Import libraries

In [None]:
import os
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd

## Define input and output files

* Input: Cement and steel plants (v4.1) with exact locations in China (csv)
* Output: Points centered on cement, steel, and landcover (v4.1) for use in macro-localization model building

In [None]:
# Source CSVs of cement and steel plants (v4.1 asset subsets), given as
# paths relative to this notebook's working directory.
cement_input_csv, steel_input_csv = (
    '../../resources/asset-subsets-v4p1/cement_exact_china_v4.1.csv',
    '../../resources/asset-subsets-v4p1/steel_exact_china_v4.1.csv',
)

In [None]:
# Make sure the output directory exists. `exist_ok=True` makes the call a
# no-op when the directory is already there, which avoids the
# check-then-create race of listing the parent first, and `makedirs` also
# creates any missing parent directories.
os.makedirs("../../resources/macro-loc-model-build", exist_ok=True)

In [None]:
# Destination GeoJSON files for the cement, steel, and landcover points.
cement_output_gjson, steel_output_gjson, lc_output_gjson = (
    "../../resources/macro-loc-model-build/cement_exact_china_v4.1.geojson",
    "../../resources/macro-loc-model-build/steel_exact_china_v4.1.geojson",
    "../../resources/macro-loc-model-build/lc_exact_china_v4.1.geojson",
)

## Define offset and buffer sizes, and CRS for calculations

* `offset_size`: distance from cement or steel plants at which landcover chip centers are created
* `buffer_size`: 0.5*length of chips centered on steel, cement, or landcover
* `calc_crs`: coordinate system in m to use for buffer, offset calculations

Landsat-8 chips in the macro-localization model will be 1.2 km on a side. Sentinel-2 chips will be 3 km on a side. To create landcover chips that don't overlap plant chips, we offset landcover chip centers by `offset_size` in both x and y from the center of each plant. Then we drop any whose chip footprint (a square extending `buffer_size` from its center) would intersect the chip footprint of any plant.

In [None]:
# Projected CRS used for the buffer and offset calculations.
# NOTE(review): EPSG:3395 is World Mercator, whose metre units stretch with
# latitude -- confirm that distortion is acceptable for these distances.
calc_crs = "EPSG:3395"

# Both distances are in metres of `calc_crs`.
offset_size, buffer_size = 5000, 1500

## Create cement plant geojson file

In [None]:
# Load the cement plant table; index_col=False keeps pandas from using the
# first column as the index.
cement_df = pd.read_csv(
    cement_input_csv,
    index_col=False,
)

### Statistics and quality checks

In [None]:
# Number of plants
tot_cnt = len(cement_df)
print("Count of cement plants: ", tot_cnt)

# Number of plants with null positions. A position is unusable when either
# coordinate is missing, so count rows where latitude OR longitude is null.
null_cnt = int(cement_df[['latitude', 'longitude']].isnull().any(axis=1).sum())
print("Count of cement plants with null position values: ", null_cnt)

### Create GeoDataFrame (define geometry and crs)

In [None]:
# Build one point per plant from its longitude (x) and latitude (y), then
# attach the points to the plant uids as a GeoDataFrame in EPSG:4326.
cement_points = gpd.points_from_xy(cement_df["longitude"], cement_df["latitude"])
cement_gdf = gpd.GeoDataFrame(
    cement_df["uid"],
    geometry=cement_points,
    crs="EPSG:4326",
)

### Write GeoJson file

In [None]:
# Persist the cement plant points as GeoJSON.
cement_gdf.to_file(
    cement_output_gjson,
    driver='GeoJSON',
)

## Create steel plant geojson file

In [None]:
# Load the steel plant table; index_col=False keeps pandas from using the
# first column as the index.
steel_df = pd.read_csv(
    steel_input_csv,
    index_col=False,
)

### Statistics and quality checks

In [None]:
# Number of plants
tot_cnt = len(steel_df)
print("Count of steel plants: ", tot_cnt)

# Number of plants with null positions. A position is unusable when either
# coordinate is missing, so count rows where latitude OR longitude is null.
null_cnt = int(steel_df[['latitude', 'longitude']].isnull().any(axis=1).sum())
print("Count of steel plants with null position values: ", null_cnt)

### Create GeoDataFrame (define geometry and crs)

In [None]:
# Build one point per plant from its longitude (x) and latitude (y), then
# attach the points to the plant uids as a GeoDataFrame in EPSG:4326.
steel_points = gpd.points_from_xy(steel_df["longitude"], steel_df["latitude"])
steel_gdf = gpd.GeoDataFrame(
    steel_df["uid"],
    geometry=steel_points,
    crs="EPSG:4326",
)

### Write GeoJson file

In [None]:
# Persist the steel plant points as GeoJSON.
steel_gdf.to_file(
    steel_output_gjson,
    driver='GeoJSON',
)

## Create landcover geojson

* Take cement and steel plant locations, convert to CRS in m
* Create large geometry around plant locations using `offset_size`
* Find the points at the four corners of each boundary to use as landcover centers
* Remove landcover boundaries that would intersect with cement and steel boundaries in chipping

#### Convert cement and steel plant geometries to EPSG:3395

In [None]:
# Stack the cement and steel plant points into a single frame.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`, which
# likewise keeps each input's original index labels.
plant_gdf = pd.concat([cement_gdf[["geometry"]], steel_gdf[["geometry"]]])

# Reproject to the metre-based CRS used for buffer/offset calculations.
plant_gdf = plant_gdf.to_crs(calc_crs)

#### Create large polygons around plants

In [None]:
# Grow every plant point by `offset_size` and take the bounding rectangle
# of the result, giving one axis-aligned box per plant.
plant_offset_boxes = plant_gdf.buffer(offset_size).envelope
plant_offset_gdf = gpd.GeoDataFrame(
    geometry=plant_offset_boxes,
    crs=calc_crs,
)

#### Get coordinates of the outer corners

In [None]:
# Collect the corner points of every offset box, together with the index
# label of the plant each corner came from.
point_list = []
ids = []
for index, row in plant_offset_gdf.iterrows():
    # A polygon's exterior ring is closed: its last coordinate repeats the
    # first. Drop that closing coordinate so each corner is used only once
    # instead of producing a duplicate landcover point per plant.
    corners = list(row['geometry'].exterior.coords)[:-1]
    for corner in corners:
        point_list.append(Point(corner))
        ids.append(index)

#### Create landcover GeoDataFrame in EPSG:3395

In [None]:
# Give every candidate landcover point a sequential uid: "CHN" followed by
# a 1-based counter zero-padded to at least four digits.
lc_uids = ["CHN{:04d}".format(seq) for seq, _ in enumerate(point_list, start=1)]
lc_df = pd.DataFrame({"uid": lc_uids, "geometry": point_list})

# The points were computed in `calc_crs`, so tag the frame with that CRS.
lc_gdf = gpd.GeoDataFrame(lc_df, crs=calc_crs)

#### Exclude landcover points that would intersect with plant chips

In [None]:
# Square footprint around each landcover point: buffer by `buffer_size`
# and take the bounding rectangle. The original point is kept alongside
# in `geom_points`.
lc_footprints = lc_gdf.buffer(buffer_size).envelope
lc_poly_df = pd.DataFrame({
    "uid": lc_gdf.uid,
    "geom_points": lc_gdf.geometry,
    "geometry": lc_footprints,
})
lc_poly_gdf = gpd.GeoDataFrame(lc_poly_df, crs=calc_crs)

In [None]:
# Square footprint around each plant point: buffer by `buffer_size` and
# take the bounding rectangle. The original point is kept alongside in
# `geom_points`.
plant_footprints = plant_gdf.buffer(buffer_size).envelope
plant_poly_df = pd.DataFrame({
    "geom_points": plant_gdf.geometry,
    "geometry": plant_footprints,
})
plant_poly_gdf = gpd.GeoDataFrame(plant_poly_df, crs=calc_crs)

In [None]:
# Find landcover footprints that intersect any plant footprint.
# 'intersects' is sjoin's default spatial predicate, so it is left implicit:
# the old `op=` keyword is deprecated/removed in newer geopandas, while its
# replacement `predicate=` does not exist in older releases.
lc_intrsct_gdf = gpd.sjoin(lc_poly_gdf, plant_poly_gdf, how='inner')

# Keep only the landcover points whose uid did not show up in the join.
lc_gdf = lc_gdf[~lc_gdf.uid.isin(lc_intrsct_gdf.uid)]

#### Convert to EPSG:4326 and write to GeoJSON

In [None]:
# Reproject the remaining landcover points to EPSG:4326 (the CRS the plant
# GeoDataFrames were created in) and persist them as GeoJSON.
lc_gdf = lc_gdf.to_crs(
    "EPSG:4326",
)
lc_gdf.to_file(
    lc_output_gjson,
    driver='GeoJSON',
)