## Import required libraries

In [1]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import glob
from tqdm import tqdm

## Parameters

In [2]:
years_sets = zip(['2020', '2020', '2019', '2019', '2018', '2018'],
                 ['1', '2', '1', '2', '1', '2'])
pred_thresh = 0.01

In [3]:
s3_path = 'L8-TIR-macro-localization-model-deployment'
score_tar_prefix = 'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'

cement_site_geojson = "../../resources/macro-loc-model-build/cement_exact_china_v4.1.geojson"
steel_site_geojson = "../../resources/macro-loc-model-build/steel_exact_china_v4.1.geojson"

macro_10km_shp = "../../resources/nt-model/10km_CS_macro/macroloc_cement_steel_CHN_10.shp"

LOCAL_DIR = '/scratch/'

In [4]:
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

## Prepare Input Files

In [5]:
# Loop over all years and sets
for yr, st in years_sets:

    # Download tar file from S3 and untar
    score_tar_file = score_tar_prefix+yr+'_set'+st+'.tar'
    bucket.download_file(s3_path+'/'+score_tar_file, LOCAL_DIR+score_tar_file)
    !tar -xf {LOCAL_DIR+score_tar_file} -C {LOCAL_DIR}
    print("Finished downloading and extracting ", score_tar_file)

    # Read in GeoJSONs of known cement and steel plants
    cement_site_gdf = gpd.read_file(cement_site_geojson)
    steel_site_gdf = gpd.read_file(steel_site_geojson)

    # Read in shapefile of deployment grid, filter by pred_thresh
    macro_10km_gdf = gpd.read_file(macro_10km_shp)
    macro_10km_gdf = macro_10km_gdf[macro_10km_gdf.preds >= pred_thresh]

    # Get list of GeoJSONS
    score_dir = score_tar_file.replace('.tar', '')
    score_gjsons = glob.glob(LOCAL_DIR+score_dir+'/*.geojson')
    score_gjsons.sort()

    # Loop over all GeoJSONS
    for gf in score_gjsons:

        # Read in DataFrame
        score_gdf = gpd.read_file(gf)

        # Join 10km preds; take max if tile intersects two grid sections
        score_gdf = gpd.sjoin(score_gdf, macro_10km_gdf, how='left', 
                              op='intersects')
        score_gdf = score_gdf.drop(['scene_id', 'index_right', 'length', 
                                    'length_w', 'Count_pnt'], axis=1)
        score_gdf['grpid'] = score_gdf['tile_id']
        score_gdf = score_gdf.sort_values('preds', ascending=False) \
                             .groupby(['grpid']).first()

        # Join to known cement plant sites
        score_gdf = gpd.sjoin(score_gdf, cement_site_gdf, how='left', 
                              op='intersects')
        score_gdf['cement_uid'] = score_gdf.uid
        score_gdf = score_gdf.drop(['index_right', 'uid'], axis=1)

        # Join to known steel plant sites
        score_gdf = gpd.sjoin(score_gdf, steel_site_gdf, how='left', 
                              op='intersects')
        score_gdf['steel_uid'] = score_gdf.uid
        score_gdf = score_gdf.drop(['index_right', 'uid'], axis=1)

        print('Done compiling scores for ', len(score_gdf), ' chips in ', gf)

        # Append to compiled scores file, droping geom column
        if 'compiled_scores_pdf' in locals():
            compiled_scores_pdf = pd.concat([compiled_scores_pdf, 
                                             score_gdf], 
                                            ignore_index=True)
        else:
            compiled_scores_pdf = score_gdf

# Save results in geojson
compiled_scores_pdf.to_file("../../resources/macro-loc-model-deployment/scored_output.geojson", driver='GeoJSON')

# # # Upload to S3 (too bit for git)
# # bucket.upload_file(compiled_scores_csv, s3_path+'/'+compiled_scores_csv.split('/')[-1])

Finished downloading and extracting  L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1.tar
Done compiling scores for  2553  chips in  /scratch/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_WRS2-114026.geojson
Done compiling scores for  10223  chips in  /scratch/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_WRS2-114027.geojson
Done compiling scores for  9403  chips in  /scratch/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_WRS2-114028.geojson
Done compiling scores for  3234  chips in  /scratch/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_WRS2-114029.geojson
Done compiling scores for  3347  chips in  /scratch/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_set1/L8-deployment-chip-scores-CHN-10km-pthsh0.002_2020_WRS2-114030.geojson
Done co

### Filter Down Sample Data

In [6]:
# pick 200 cement plants with known locations from chips so they off center - highest probabilities
known_cement_pdf = compiled_scores_pdf[compiled_scores_pdf.cement_uid.notnull()]

cement_df = pd.read_excel("../../resources/asset-datasets-v4/cement_dataset_v4.xlsx")
cement_df = cement_df[['uid', 'plant_type']]

known_cement_pdf = pd.merge(known_cement_pdf, cement_df, left_on='cement_uid', right_on='uid')

# half grinding
known_grinding_pdf = known_cement_pdf[known_cement_pdf.plant_type == 'Grinding'].drop('plant_type', axis=1)

# half integrated
known_integrated_pdf = known_cement_pdf[known_cement_pdf.plant_type == 'Integrated'].sort_values('cement_prob', ascending=False)[:100].drop('plant_type', axis=1)

num_remaining_plants = 1000 - len(known_grinding_pdf) - len(known_integrated_pdf)

unknown_cement_pdf = compiled_scores_pdf[compiled_scores_pdf.cement_uid.isnull()].sort_values('cement_prob', ascending=False)[:num_remaining_plants]

cement_sample_df = pd.concat([known_grinding_pdf, known_integrated_pdf, unknown_cement_pdf]) \
                     .drop(['steel_prob','index','prop_rail','prop_water','preds','steel_uid','uid'], axis=1) \
                     .set_crs("EPSG:4326")

In [7]:
# pick 100 steel plants with known locations from chips so they off center - highest probabilities
known_steel_pdf = compiled_scores_pdf[compiled_scores_pdf.steel_uid.notnull()].sort_values('steel_prob', ascending=False)[:100]

unknown_steel_pdf = compiled_scores_pdf[compiled_scores_pdf.steel_uid.isnull()].sort_values('steel_prob', ascending=False)[:900]

steel_sample_df = pd.concat([known_steel_pdf, unknown_steel_pdf]) \
                    .drop(['cement_prob','index','prop_rail','prop_water','preds','cement_uid'], axis=1) \
                    .set_crs("EPSG:4326")

## Create Output Files

In [8]:
percent_threshold = 0.5

### Cement

In [9]:
cement_gdf = cement_sample_df.copy()
cement_gdf['geometry'] = cement_gdf['geometry'].to_crs("EPSG:3857")

index_checked = []
index_to_drop = []

for idx1, row1 in tqdm(cement_gdf.iterrows(), total=len(cement_gdf)):
    for idx2, row2 in cement_gdf.drop(index_checked).iterrows():
        if row1.tile_id != row2.tile_id and row1.cement_uid != row2.cement_uid:
            area = row1.geometry.intersection(row2.geometry).area / row1.geometry.area
            if area >= percent_threshold:
                index_to_drop.append(idx2)
    index_checked.append(idx1)
    
cement_gdf = cement_gdf.drop(index_to_drop).sample(frac=1)
   
cement_gdf.to_crs("EPSG:4326").to_file("../../resources/macro-loc-model-deployment/chip_predictions_cement.shp", driver='ESRI Shapefile')

cement_gdf['plant_found'] = ""
cement_gdf['status'] = ""
cement_gdf['notes'] = ""
cement_gdf.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_cement.csv", index=False)

100%|██████████| 1000/1000 [01:55<00:00,  8.70it/s]


In [10]:
tmp = cement_sample_df.groupby('tile_id').count()
tmp[tmp.year > 1]

Unnamed: 0_level_0,year,cement_prob,geometry,cement_uid
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [11]:
tmp = cement_gdf.groupby('tile_id').count()
tmp[tmp.year > 1]

Unnamed: 0_level_0,year,cement_prob,geometry,cement_uid,plant_found,status,notes
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [12]:
cement_gdf[cement_gdf.cement_uid.notnull()]

Unnamed: 0,tile_id,year,cement_prob,geometry,cement_uid,plant_found,status,notes
242,WRS2-124034-02989,2018,0.784728,"POLYGON ((12724807.533 4598258.618, 12724768.8...",CHN0308,,,
462,WRS2-120043-06999,2018,0.882097,"POLYGON ((12929142.137 2826127.275, 12929134.9...",CHN0272,,,
11,WRS2-117030-07449,2020,0.816240,"POLYGON ((14029827.615 5337257.961, 14029776.6...",CHN0424,,,
438,WRS2-121037-03136,2019,0.261919,"POLYGON ((13073234.831 3884464.501, 13073240.0...",CHN0005,,,
81,WRS2-119038-10939,2020,0.336833,"POLYGON ((13421809.281 3762066.647, 13421781.4...",CHN0008,,,
...,...,...,...,...,...,...,...,...
99,WRS2-120033-02028,2020,0.889462,"POLYGON ((13517035.141 4805014.929, 13517011.3...",CHN0742,,,
315,WRS2-127039-21545,2020,0.769183,"POLYGON ((11883168.143 3511082.315, 11883186.6...",CHN0656,,,
572,WRS2-124034-13001,2018,0.861019,"POLYGON ((12740209.400 4522648.270, 12740173.4...",CHN0089,,,
448,WRS2-121038-19736,2019,0.429334,"POLYGON ((12948848.060 3730289.754, 12948840.3...",CHN0012,,,


In [13]:
cement_gdf[cement_gdf.cement_uid.isnull()]

Unnamed: 0,tile_id,year,cement_prob,geometry,cement_uid,plant_found,status,notes
1876311,WRS2-118025-01646,2019,0.981975,"POLYGON ((14201342.183 6488296.068, 14201310.7...",,,,
117644,WRS2-118038-05007,2020,0.974468,"POLYGON ((13463301.374 3726404.822, 13463278.0...",,,,
1425271,WRS2-135037-02195,2020,0.964258,"POLYGON ((10583998.181 3842597.606, 10583952.2...",,,,
658179,WRS2-125029-00906,2020,0.965552,"POLYGON ((12674701.069 5474006.108, 12674645.4...",,,,
2825286,WRS2-122041-10524,2018,0.970591,"POLYGON ((12805138.757 3062434.211, 12805120.7...",,,,
...,...,...,...,...,...,...,...,...
1241748,WRS2-132041-05785,2020,0.975555,"POLYGON ((11005232.223 3131846.504, 11005230.9...",,,,
1626660,WRS2-140040-03553,2020,0.961111,"POLYGON ((9693258.036 3320361.311, 9693258.794...",,,,
1447680,WRS2-136034-01926,2020,0.966957,"POLYGON ((10596854.670 4572974.146, 10596885.9...",,,,
2440563,WRS2-130034-07058,2019,0.961941,"POLYGON ((11669196.327 4405583.699, 11669193.9...",,,,


### Steel

In [14]:
steel_gdf = steel_sample_df.copy()
steel_gdf['geometry'] = steel_gdf['geometry'].to_crs("EPSG:3857")

index_checked = []
index_to_drop = []

for idx1, row1 in tqdm(steel_gdf.iterrows(), total=len(steel_gdf)):
    for idx2, row2 in steel_gdf.drop(index_checked).iterrows():
        if row1.tile_id != row2.tile_id and row1.steel_uid != row2.steel_uid:
            area = row1.geometry.intersection(row2.geometry).area / row1.geometry.area
            if area >= percent_threshold:
                index_to_drop.append(idx2)
    index_checked.append(idx1)
    
steel_gdf = steel_gdf.drop(index_to_drop).sample(frac=1)

steel_gdf.to_crs("EPSG:4326").to_file("../../resources/macro-loc-model-deployment/chip_predictions_steel.shp", driver='ESRI Shapefile')

steel_gdf['plant_found'] = ""
steel_gdf['status'] = ""
steel_gdf['notes'] = ""
steel_gdf.drop('geometry', axis=1).to_csv("../../resources/macro-loc-model-deployment/chip_predictions_steel.csv", index=False)

100%|██████████| 1000/1000 [01:55<00:00,  8.68it/s]


In [15]:
tmp = steel_sample_df.groupby('tile_id').count()
tmp[tmp.year > 1]

Unnamed: 0_level_0,year,steel_prob,geometry,steel_uid
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WRS2-120027-07627,2,2,2,2
WRS2-122032-02198,2,2,2,2
WRS2-122033-04288,2,2,2,2
WRS2-125034-01355,2,2,2,2


In [16]:
tmp = steel_gdf.groupby('tile_id').count()
tmp[tmp.year > 1]

Unnamed: 0_level_0,year,steel_prob,geometry,steel_uid,plant_found,status,notes
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
WRS2-120027-07627,2,2,2,2,2,2,2
WRS2-122032-02198,2,2,2,2,2,2,2
WRS2-122033-04288,2,2,2,2,2,2,2
WRS2-125034-01355,2,2,2,2,2,2,2


In [17]:
steel_gdf[steel_gdf.steel_uid.notnull()]

Unnamed: 0,tile_id,year,steel_prob,geometry,steel_uid,plant_found,status,notes
2311850,WRS2-124035-05470,2019,0.880074,"POLYGON ((12679626.043 4399741.954, 12679665.6...",CHN0097,,,
182137,WRS2-119031-07108,2020,0.906509,"POLYGON ((13690022.700 5032236.334, 13690022.3...",CHN0007,,,
92870,WRS2-118031-03489,2020,0.945888,"POLYGON ((13773558.495 5053286.214, 13773570.2...",CHN0006,,,
224184,WRS2-119038-19139,2020,0.897274,"POLYGON ((13430516.951 3759774.647, 13430490.0...",CHN0119,,,
1964969,WRS2-120027-07627,2019,0.722509,"POLYGON ((13760529.916 5972927.497, 13760542.0...",CHN0004,,,
...,...,...,...,...,...,...,...,...
218147,WRS2-119038-06814,2020,0.829875,"POLYGON ((13365925.928 3724695.079, 13365892.7...",CHN0107,,,
2851773,WRS2-124035-05013,2018,0.933408,"POLYGON ((12739219.087 4383320.287, 12739265.8...",CHN0242,,,
2614849,WRS2-143029-04265,2019,0.644188,"POLYGON ((9717942.915 5441576.253, 9717948.151...",CHN0014,,,
532571,WRS2-123031-10950,2020,0.945939,"POLYGON ((13103445.864 5003994.377, 13103457.1...",CHN0005,,,


In [18]:
steel_gdf[steel_gdf.steel_uid.isnull()]

Unnamed: 0,tile_id,year,steel_prob,geometry,steel_uid,plant_found,status,notes
530379,WRS2-123031-05719,2020,0.943148,"POLYGON ((13106658.741 5055679.534, 13106670.6...",,,,
604896,WRS2-124032-02893,2020,0.952340,"POLYGON ((12874585.222 4870251.412, 12874564.5...",,,,
310512,WRS2-120037-10891,2020,0.948809,"POLYGON ((13312521.337 3797391.667, 13312551.2...",,,,
1146890,WRS2-131042-10488,2020,0.943530,"POLYGON ((11116865.718 2928287.318, 11116873.2...",,,,
973311,WRS2-130039-07616,2020,0.958589,"POLYGON ((11394882.987 3668141.961, 11394853.6...",,,,
...,...,...,...,...,...,...,...,...
222303,WRS2-119038-15323,2020,0.966168,"POLYGON ((13444666.625 3672157.877, 13444641.8...",,,,
1479761,WRS2-136040-07301,2020,0.943151,"POLYGON ((10406011.823 3389766.195, 10406016.7...",,,,
1261541,WRS2-132043-07195,2020,0.960443,"POLYGON ((10872675.680 2802839.808, 10872664.6...",,,,
1278466,WRS2-133033-05635,2020,0.944694,"POLYGON ((11137065.381 4658384.873, 11137080.6...",,,,
