## Import required libraries

In [1]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import glob
from tqdm import tqdm
import fiona

In [2]:
# Enable the KML driver (read + write) in Fiona's driver table.
# The table is reached through the `fiona` module imported above rather than
# through `gpd.io.file.fiona`, which is not part of geopandas' public API.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

## Read in Scored Data

In [3]:
# S3 key prefix and object name of the consolidated chip-score file.
s3_path = "S2-RGB-macro-localization-model-deployment4"
s3_file = "S2-deployment-chip-scores-CHN-10km-nowater-2020-consolidated.geojson"

# GeoJSON inputs for the cement and steel sites.
cement_site_geojson = (
    "../../resources/macro-loc-model-build/"
    "cement_exact_china_v4.1.geojson"
)
steel_site_geojson = (
    "../../resources/macro-loc-model-build/"
    "steel_exact_china_v4.1.geojson"
)

# Local directory that the S3 object is downloaded into.
LOCAL_DIR = "/scratch/"

In [4]:
# Read both site layers; each path is loaded with gpd.read_file in turn.
cement_sites, steel_sites = (
    gpd.read_file(site_path)
    for site_path in (cement_site_geojson, steel_site_geojson)
)

In [5]:
# Create the S3 resource handle, then bind the bucket the score file is stored in.
s3 = boto3.resource("s3")
bucket = s3.Bucket("sfi-shared-assets")

In [6]:
# Copy the chip-score GeoJSON from the bucket into LOCAL_DIR under the same file name.
s3_key = "/".join([s3_path, s3_file])
local_copy = LOCAL_DIR + s3_file
bucket.download_file(s3_key, local_copy)

In [7]:
# Load the downloaded chip-score file.
scores_path = LOCAL_DIR + s3_file
gdf = gpd.read_file(scores_path)

In [8]:
# Keep only the columns the rest of the notebook works with.
keep_cols = [
    'tile_id',
    'tile_cmt_prob',
    'tile_stl_prob',
    'cmtv4p1_uid',
    'stlv4p1_uid',
    'geometry',
]
gdf = gdf[keep_cols]

In [9]:
# Preview the first five rows.
gdf.head(n=5)

Unnamed: 0,tile_id,tile_cmt_prob,tile_stl_prob,cmtv4p1_uid,stlv4p1_uid,geometry
0,MGRS-51UXS-0001-2020-01,0.085863,6.590203e-07,,,"POLYGON ((124.42466 50.98389, 124.42548 51.010..."
1,MGRS-44SNJ-0010-2020-01,0.171622,8.014212e-07,,,"POLYGON ((81.03460 39.37182, 81.03461 39.39885..."
2,MGRS-44SNJ-0009-2020-01,0.143269,0.0,,,"POLYGON ((80.99977 39.37183, 80.99977 39.39886..."
3,MGRS-44SNJ-0008-2020-01,0.149796,0.0179205,,,"POLYGON ((81.03461 39.39885, 81.03462 39.42589..."
4,MGRS-44SNJ-0007-2020-01,0.018871,2.209537e-13,,,"POLYGON ((80.99977 39.39886, 80.99977 39.42589..."


In [10]:
# Row count of the score table.
gdf.shape[0]

415036

## Create Sample Cement Data

In [11]:
# Build the 1000-tile cement sample: tiles that carry a known cement uid (split by
# plant type), topped up with the highest-probability tiles that carry none.
known_cement_gdf = gdf[gdf.cmtv4p1_uid.notnull()]

# Plant type per uid is looked up in the cement asset workbook.
cement_df = pd.read_excel("../../resources/asset-datasets-v4/cement_dataset_v4.xlsx")
cement_df = cement_df[['uid', 'plant_type']]

# Inner join: known tiles whose uid is missing from the workbook are dropped here.
known_cement_gdf = pd.merge(known_cement_gdf, cement_df, left_on='cmtv4p1_uid', right_on='uid')

# Every known tile whose plant type is 'Grinding'.
# NOTE(review): unlike the integrated subset below, no sort or 100-row cap is
# applied here - confirm whether that is intended.
known_grinding_gdf = known_cement_gdf[known_cement_gdf.plant_type == 'Grinding'].drop('plant_type', axis=1)

# The 100 highest-probability known tiles whose plant type is 'Integrated'.
known_integrated_gdf = known_cement_gdf[known_cement_gdf.plant_type == 'Integrated'].sort_values('tile_cmt_prob', ascending=False)[:100].drop('plant_type', axis=1)

# Remaining slots in the 1000-tile budget. Clamped at zero: a negative count
# would turn the `[:n]` slice below into "everything except the last n rows".
num_remaining_plants = max(0, 1000 - len(known_grinding_gdf) - len(known_integrated_gdf))

# Highest-probability tiles with no known cement uid fill the remaining slots.
unknown_cement_gdf = gdf[gdf.cmtv4p1_uid.isnull()].sort_values('tile_cmt_prob', ascending=False)[:num_remaining_plants]

# Combine, drop the steel columns and the merge key, set the CRS, and shuffle the
# rows with a fixed seed so the order is reproducible.
cement_sample_df = pd.concat([known_grinding_gdf, known_integrated_gdf, unknown_cement_gdf]) \
                     .drop(['tile_stl_prob', 'stlv4p1_uid', 'uid'], axis=1) \
                     .set_crs("EPSG:4326") \
                     .sample(frac=1, random_state=123)

In [12]:
# Restrict the site layer to uids present in the sample and give it the sample's
# column layout (tile columns left empty) so the two frames can be concatenated.
sampled_cement_uids = cement_sample_df.cmtv4p1_uid.dropna().unique()
cement_sites = (
    cement_sites[cement_sites.uid.isin(sampled_cement_uids)]
    .rename(columns={'uid': 'cmtv4p1_uid'})
)

# Site rows have no tile id or tile score.
for empty_col in ('tile_id', 'tile_cmt_prob'):
    cement_sites.loc[:, empty_col] = None

cement_site_cols = ['tile_id', 'tile_cmt_prob', 'cmtv4p1_uid', 'geometry']
cement_sites = cement_sites[cement_site_cols]

In [13]:
# Append the site rows after the sampled tiles and renumber the index from zero.
cement_parts = [cement_sample_df, cement_sites]
cement_stacked = pd.concat(cement_parts, ignore_index=True)
cement_sample_df = gpd.GeoDataFrame(cement_stacked)

In [14]:
# Display the combined cement sample: sampled tile rows first, appended site rows last.
cement_sample_df

Unnamed: 0,tile_id,tile_cmt_prob,cmtv4p1_uid,geometry
0,MGRS-46RBT-0028-2020-01,0.974540,,"POLYGON ((90.89223 29.65126, 90.89167 29.67831..."
1,MGRS-49SFU-0304-2020-01,0.939986,,"POLYGON ((112.52052 34.85484, 112.52102 34.881..."
2,MGRS-48RUT-0174-2020-01,0.949506,CHN0643,"POLYGON ((103.46291 29.49668, 103.46251 29.523..."
3,MGRS-48RWT-0620-2020-03,0.884310,,"POLYGON ((105.52268 28.85470, 105.52281 28.881..."
4,MGRS-49SCD-0298-2020-01,0.970183,,"POLYGON ((109.65503 39.28296, 109.65451 39.309..."
...,...,...,...,...
1091,,,CHN0793,POINT (113.18154 30.95448)
1092,,,CHN0830,POINT (106.82658 30.18893)
1093,,,CHN0833,POINT (103.21513 24.99869)
1094,,,CHN0836,POINT (106.89202 26.53952)


In [15]:
# Export the cement sample: a KML of the geometries (tile_id exposed as `name`)
# and a CSV of the non-geometry columns plus empty annotation columns.
cement_df = cement_sample_df.drop(['tile_cmt_prob','cmtv4p1_uid'], axis=1)

# fiona.drivers() is deprecated in Fiona; fiona.Env() is its replacement for
# establishing the GDAL/OGR environment around the write.
with fiona.Env():
    cement_df.to_crs("EPSG:4326").rename(columns={'tile_id':'name'}).to_file("../../resources/macro-loc-model-deployment/chip_predictions_cement.kml", driver='KML')

# Empty annotation columns (presumably filled in during review - confirm).
for annotation_col in ('plant_found', 'class', 'level', 'imagery_date', 'status', 'notes'):
    cement_df[annotation_col] = ""
cement_df.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_cement.csv", index=False)

## Create Sample Steel Data

In [16]:
# Steel sample: the 100 highest-probability tiles that carry a known steel uid,
# padded to 1000 rows with the highest-probability tiles that carry none.
has_steel_uid = gdf.stlv4p1_uid.notnull()
known_steel_gdf = (
    gdf[has_steel_uid]
    .sort_values('tile_stl_prob', ascending=False)
    .head(100)
)

num_remaining_plants = 1000 - len(known_steel_gdf)

unknown_steel_gdf = (
    gdf[~has_steel_uid]
    .sort_values('tile_stl_prob', ascending=False)
    .head(num_remaining_plants)
)

# Stack both parts, drop the cement columns, set the CRS, and shuffle the rows
# with a fixed seed.
steel_stacked = pd.concat([known_steel_gdf, unknown_steel_gdf])
steel_stacked = steel_stacked.drop(columns=['tile_cmt_prob', 'cmtv4p1_uid'])
steel_sample_df = steel_stacked.set_crs("EPSG:4326").sample(frac=1, random_state=123)

In [17]:
# Restrict the site layer to uids present in the sample and give it the sample's
# column layout (tile columns left empty) so the two frames can be concatenated.
sampled_steel_uids = steel_sample_df.stlv4p1_uid.dropna().unique()
steel_sites = (
    steel_sites[steel_sites.uid.isin(sampled_steel_uids)]
    .rename(columns={'uid': 'stlv4p1_uid'})
)

# Site rows have no tile id or tile score.
for empty_col in ('tile_id', 'tile_stl_prob'):
    steel_sites.loc[:, empty_col] = None

steel_site_cols = ['tile_id', 'tile_stl_prob', 'stlv4p1_uid', 'geometry']
steel_sites = steel_sites[steel_site_cols]

In [18]:
# Append the site rows after the sampled tiles and renumber the index from zero.
steel_parts = [steel_sample_df, steel_sites]
steel_combined = pd.concat(steel_parts, ignore_index=True)
steel_sample_df = gpd.GeoDataFrame(steel_combined)

In [19]:
# Display the combined steel sample: sampled tile rows first, appended site rows last.
steel_sample_df

Unnamed: 0,tile_id,tile_stl_prob,stlv4p1_uid,geometry
0,MGRS-45TVJ-0193-2020-01,1.000000,,"POLYGON ((86.49960 44.11727, 86.49937 44.14428..."
1,MGRS-49QDE-0241-2020-01,1.000000,,"POLYGON ((110.20380 22.25276, 110.20365 22.279..."
2,MGRS-49RFK-0006-2020-05,0.121518,CHN0078,"POLYGON ((112.58048 26.86986, 112.58086 26.896..."
3,MGRS-48QZK-0537-2020-01,1.000000,,"POLYGON ((108.71098 21.72522, 108.71167 21.752..."
4,MGRS-43SDB-0003-2020-01,1.000000,,"POLYGON ((74.71787 37.19011, 74.71777 37.21716..."
...,...,...,...,...
1040,,,CHN0239,POINT (114.42143 36.60022)
1041,,,CHN0240,POINT (113.74992 36.60033)
1042,,,CHN0241,POINT (113.74992 36.60033)
1043,,,CHN0242,POINT (114.44198 36.60349)


In [20]:
# Export the steel sample: a KML of the geometries (tile_id exposed as `name`)
# and a CSV of the non-geometry columns plus empty annotation columns.
steel_gdf = steel_sample_df.drop(['tile_stl_prob','stlv4p1_uid'], axis=1)

# fiona.drivers() is deprecated in Fiona; fiona.Env() is its replacement for
# establishing the GDAL/OGR environment around the write.
with fiona.Env():
    steel_gdf.to_crs("EPSG:4326").rename(columns={'tile_id':'name'}).to_file("../../resources/macro-loc-model-deployment/chip_predictions_steel.kml", driver='KML')

# Empty annotation columns (presumably filled in during review - confirm).
for annotation_col in ('plant_found', 'class', 'level', 'imagery_date', 'status', 'notes'):
    steel_gdf[annotation_col] = ""
steel_gdf.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_steel.csv", index=False)

### Create Final Cement Dataset

In [25]:
# Tiles whose tile_cmt_prob lies in the half-open band (0.753, 0.82], reduced to
# distinct tile_id / geometry rows.
cmt_prob = gdf.tile_cmt_prob
in_cement_band = (cmt_prob > 0.753) & (cmt_prob <= 0.82)
cement_gdf = gdf[in_cement_band]

score_and_uid_cols = ['tile_cmt_prob', 'tile_stl_prob', 'cmtv4p1_uid', 'stlv4p1_uid']
cement_df = cement_gdf.drop(columns=score_and_uid_cols).drop_duplicates()

In [27]:
# Preview the first five rows of the final cement selection.
cement_df.head(n=5)

Unnamed: 0,tile_id,geometry
2136,MGRS-50SLH-0991-2020-01,"POLYGON ((115.23728 37.88950, 115.23664 37.916..."
2155,MGRS-50SLH-0610-2020-01,"POLYGON ((115.29610 38.29590, 115.29547 38.322..."
2340,MGRS-50SLH-0744-2020-01,"POLYGON ((114.95775 38.12830, 114.95700 38.155..."
2380,MGRS-50SLH-0652-2020-01,"POLYGON ((114.88622 38.23516, 114.88544 38.262..."
2540,MGRS-50SLH-0203-2020-01,"POLYGON ((115.18443 38.64566, 115.18375 38.672..."


In [28]:
# Number of tiles in the final cement selection.
cement_df.shape[0]

3047

In [29]:
# Export the final cement selection: a KML of the geometries (tile_id exposed as
# `name`) and a CSV of tile ids plus empty annotation columns.
# fiona.drivers() is deprecated in Fiona; fiona.Env() is its replacement for
# establishing the GDAL/OGR environment around the write.
with fiona.Env():
    cement_df.to_crs("EPSG:4326").rename(columns={'tile_id':'name'}).to_file("../../resources/macro-loc-model-deployment/chip_predictions_cement_v2.kml", driver='KML')

# Empty annotation columns (presumably filled in during review - confirm).
for annotation_col in ('plant_found', 'class', 'level', 'imagery_date', 'status', 'notes'):
    cement_df[annotation_col] = ""
cement_df.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_cement_v2.csv", index=False)

### Create Final Steel Dataset

In [55]:
# Tiles whose tile_stl_prob exceeds 0.99988, reduced to distinct
# tile_id / geometry rows.
above_steel_cutoff = gdf.tile_stl_prob > 0.99988
steel_gdf = (
    gdf[above_steel_cutoff]
    .drop(columns=['tile_cmt_prob', 'tile_stl_prob', 'cmtv4p1_uid', 'stlv4p1_uid'])
    .drop_duplicates()
)

In [56]:
# Number of tiles in the final steel selection.
steel_gdf.shape[0]

6026

In [57]:
# Preview the first five rows of the final steel selection.
steel_gdf.head(n=5)

Unnamed: 0,tile_id,geometry
1491,MGRS-48TWM-0053-2020-01,"POLYGON ((105.72414 41.99109, 105.72445 42.018..."
2040,MGRS-50SLH-0025-2020-01,"POLYGON ((115.87118 38.81651, 115.87075 38.843..."
3373,MGRS-44TQS-0032-2020-01,"POLYGON ((83.80085 46.48744, 83.80224 46.51441..."
3378,MGRS-44TQS-0037-2020-01,"POLYGON ((83.76045 46.46143, 83.76181 46.48839..."
3387,MGRS-44TQS-0174-2020-01,"POLYGON ((83.63427 46.27540, 83.63556 46.30237..."


In [58]:
# Export the final steel selection: a KML of the geometries (tile_id exposed as
# `name`) and a CSV of tile ids plus empty annotation columns.
#
# The files are written under *_v2 names, matching the final cement export, so
# they no longer overwrite the steel *sample* KML/CSV that an earlier cell
# writes to chip_predictions_steel.kml / chip_predictions_steel.csv.
# NOTE(review): confirm no downstream consumer expects the final steel data
# under the un-suffixed names.
#
# fiona.drivers() is deprecated in Fiona; fiona.Env() is its replacement for
# establishing the GDAL/OGR environment around the write.
with fiona.Env():
    steel_gdf.to_crs("EPSG:4326").rename(columns={'tile_id':'name'}).to_file("../../resources/macro-loc-model-deployment/chip_predictions_steel_v2.kml", driver='KML')

# Empty annotation columns (presumably filled in during review - confirm).
for annotation_col in ('plant_found', 'class', 'level', 'imagery_date', 'status', 'notes'):
    steel_gdf[annotation_col] = ""
steel_gdf.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_steel_v2.csv", index=False)