# Explore Results from Infrastructure Model for Use in Deployment

* Currently covers China only

## Import Libraries

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Input/output Files and Parameters

### Input Files

* 10km CS (cement/steel) macro grid
* 5km CS (cement/steel) macro grid

In [None]:
# Shapefile paths for the 10km and 5km cement/steel macro grids (China),
# given relative to this notebook's directory.
macro_10km_shp = "../../resources/nt-model/10km_CS_macro/macroloc_cement_steel_CHN_10.shp"
macro_5km_shp = "../../resources/nt-model/5km_CS_macro/macroloc_cement_steel_CHN1.shp"

* Cement and steel plants with exact and approximate locations

In [None]:
# CSV inputs of known plants, split by sector (cement / steel) and by whether
# the recorded location is exact ("ext") or approximate ("appx").
cement_ext_csv = "../../resources/asset-subsets-v4p1/cement_exact_china_v4.1.csv"
cement_appx_csv = "../../resources/asset-subsets-v4p1/cement_approximate_china_v4.1.csv"
steel_ext_csv = "../../resources/asset-subsets-v4p1/steel_exact_china_v4.1.csv"
steel_appx_csv = "../../resources/asset-subsets-v4p1/steel_approximate_china_v4.1.csv"

### Output Files

* GeoJSON files for cement and steel plants (used to explore deployment area options)

In [None]:
# GeoJSON output paths, one per plant CSV input (same sector / exact-vs-
# approximate split); written by the to_file() cells further down.
cement_ext_gjson = "../../resources/nt-model/cement_exact_china_v4.1.geojson"
cement_appx_gjson = "../../resources/nt-model/cement_approximate_china_v4.1.geojson"
steel_ext_gjson = "../../resources/nt-model/steel_exact_china_v4.1.geojson"
steel_appx_gjson = "../../resources/nt-model/steel_approximate_china_v4.1.geojson"

### Parameters

In [None]:
# Projected CRS used for every area calculation in this notebook.  It must be
# an equal-area projection: EPSG:3395 (World Mercator), used previously, is
# conformal and inflates areas away from the equator, which overstates the
# survey area, percent-of-China and chip counts.  EPSG:6933 (WGS 84 / NSIDC
# EASE-Grid 2.0 Global) is a cylindrical equal-area projection in metres, so
# the "/ 1e6" conversions to sq km elsewhere in the notebook remain valid.
calc_crs = "EPSG:6933"

# Reference area of China, used to express survey areas as a percentage.
china_area = 9326410.0  # sq km

# Landsat-8 chip geometry (chips are square: size x size pixels).
l8_chip_size = 35  # pixels per chip side
l8_gsd = 0.03  # km per pixel

# Sentinel-2 chip geometry (chips are square: size x size pixels).
s2_chip_size = 300  # pixels per chip side
s2_gsd = 0.01  # km per pixel

## Read in Results

### 10km CS Macro

In [None]:
# Load the 10km cement/steel macro grid shapefile into a GeoDataFrame.
macro_10km_gdf = gpd.read_file(macro_10km_shp)

In [None]:
# Preview the first 10 rows of the 10km grid.
macro_10km_gdf.head(10)

In [None]:
# Report the CRS, the row count and the number of distinct index values of
# the 10km grid.
for _label, _value in (
    ("CRS: ", macro_10km_gdf.crs),
    ("Number of grid cells in 10km CS Macro: ", len(macro_10km_gdf)),
    ("Count of unique index: ", macro_10km_gdf.index.nunique()),
):
    print(_label, _value)

### 5km CS Macro

In [None]:
# Load the 5km cement/steel macro grid shapefile into a GeoDataFrame.
macro_5km_gdf = gpd.read_file(macro_5km_shp)

In [None]:
# Preview the first 10 rows of the 5km grid.
macro_5km_gdf.head(10)

In [None]:
# Report the CRS, the row count and the number of distinct index values of
# the 5km grid.
for _label, _value in (
    ("CRS: ", macro_5km_gdf.crs),
    ("Number of grid cells in 5km CS Macro: ", len(macro_5km_gdf)),
    ("Count of unique index: ", macro_5km_gdf.index.nunique()),
):
    print(_label, _value)

### Cement Plants

In [None]:
# Cement plants with exact locations: read the CSV, build point geometries
# from the longitude/latitude columns (EPSG:4326) and keep only `uid` as an
# attribute column.
cement_ext_df = pd.read_csv(cement_ext_csv, index_col=False)
_cement_ext_points = gpd.points_from_xy(
    cement_ext_df["longitude"], cement_ext_df["latitude"]
)
cement_ext_gdf = gpd.GeoDataFrame(
    cement_ext_df["uid"], geometry=_cement_ext_points, crs="EPSG:4326"
)

In [None]:
# Cement plants with approximate locations: read the CSV, build point
# geometries from the longitude/latitude columns (EPSG:4326) and keep only
# `uid` as an attribute column.
cement_appx_df = pd.read_csv(cement_appx_csv, index_col=False)
_cement_appx_points = gpd.points_from_xy(
    cement_appx_df["longitude"], cement_appx_df["latitude"]
)
cement_appx_gdf = gpd.GeoDataFrame(
    cement_appx_df["uid"], geometry=_cement_appx_points, crs="EPSG:4326"
)

In [None]:
# Export both cement point layers (exact, then approximate) as GeoJSON.
for _layer, _out_path in (
    (cement_ext_gdf, cement_ext_gjson),
    (cement_appx_gdf, cement_appx_gjson),
):
    _layer.to_file(_out_path, driver='GeoJSON')

### Steel Plants

In [None]:
# Steel plants with exact locations: read the CSV, build point geometries
# from the longitude/latitude columns (EPSG:4326) and keep only `uid` as an
# attribute column.
steel_ext_df = pd.read_csv(steel_ext_csv, index_col=False)
_steel_ext_points = gpd.points_from_xy(
    steel_ext_df["longitude"], steel_ext_df["latitude"]
)
steel_ext_gdf = gpd.GeoDataFrame(
    steel_ext_df["uid"], geometry=_steel_ext_points, crs="EPSG:4326"
)

In [None]:
# Steel plants with approximate locations: read the CSV, build point
# geometries from the longitude/latitude columns (EPSG:4326) and keep only
# `uid` as an attribute column.
steel_appx_df = pd.read_csv(steel_appx_csv, index_col=False)
_steel_appx_points = gpd.points_from_xy(
    steel_appx_df["longitude"], steel_appx_df["latitude"]
)
steel_appx_gdf = gpd.GeoDataFrame(
    steel_appx_df["uid"], geometry=_steel_appx_points, crs="EPSG:4326"
)

In [None]:
# Export both steel point layers (exact, then approximate) as GeoJSON.
for _layer, _out_path in (
    (steel_ext_gdf, steel_ext_gjson),
    (steel_appx_gdf, steel_appx_gjson),
):
    _layer.to_file(_out_path, driver='GeoJSON')

## Examine Model Accuracy and Deployment Region Considerations

Questions to answer:
1. What fraction of plants with exact locations are within the grid cells? Approximates recall.
2. What fraction of plants with exact locations are not within the grid cells? Approximates false-negative rate.
3. How much total area would we have to survey?
4. How many Landsat-8 chips would need to be created (approximate only)?
5. How many Sentinel-2 chips would need to be created (approximate only)?

In [None]:
# Ground footprint of one square chip: (km per pixel * pixels per side) ** 2.
l8_chip_area = (l8_gsd * l8_chip_size) ** 2  # sq km
s2_chip_area = (s2_gsd * s2_chip_size) ** 2  # sq km

### Macro 10km Grid

In [None]:
# Reproject the 10km grid to the calculation CRS so that areas are measured in
# projected units rather than in the CRS the shapefile was stored in.
macro_10km_phys = macro_10km_gdf.to_crs(calc_crs)
# Total cell area; "/ 1e6" converts to sq km assuming calc_crs is in metres.
# Series.sum() is vectorised and skips NaN areas from missing geometries,
# whereas the builtin sum() iterated in Python and let a single NaN poison
# the total.
macro_10km_area = (macro_10km_phys.area / 1e6).sum()

In [None]:
# Approximate chip counts for the 10km grid: total area divided by the
# footprint of one chip (left fractional; rounded when printed).
l8_10km_chip_cnt, s2_10km_chip_cnt = (
    macro_10km_area / l8_chip_area,
    macro_10km_area / s2_chip_area,
)

In [None]:
# Print the 10km deployment summary: area, share of China, and chip counts.
for _fields in (
    ("Deployment Area for 10km Grid: ", macro_10km_area, " sq km"),
    ("Percent Area of China: ", macro_10km_area/china_area * 100, "%"),
    ("Number of Landsat-8 Chips Required: ", round(l8_10km_chip_cnt)),
    ("Number of Sentinel-2 Chips Required: ", round(s2_10km_chip_cnt)),
):
    print(*_fields)

### Macro 5km Grid

In [None]:
# Reproject the 5km grid to the calculation CRS so that areas are measured in
# projected units rather than in the CRS the shapefile was stored in.
macro_5km_phys = macro_5km_gdf.to_crs(calc_crs)
# Total cell area; "/ 1e6" converts to sq km assuming calc_crs is in metres.
# Series.sum() is vectorised and skips NaN areas from missing geometries,
# whereas the builtin sum() iterated in Python and let a single NaN poison
# the total.
macro_5km_area = (macro_5km_phys.area / 1e6).sum()

In [None]:
# Approximate chip counts for the 5km grid: total area divided by the
# footprint of one chip (left fractional; rounded when printed).
l8_5km_chip_cnt, s2_5km_chip_cnt = (
    macro_5km_area / l8_chip_area,
    macro_5km_area / s2_chip_area,
)

In [None]:
# Print the 5km deployment summary: area, share of China, and chip counts.
for _fields in (
    ("Deployment Area for 5km Grid: ", macro_5km_area, " sq km"),
    ("Percent Area of China: ", macro_5km_area/china_area * 100, "%"),
    ("Number of Landsat-8 Chips Required: ", round(l8_5km_chip_cnt)),
    ("Number of Sentinel-2 Chips Required: ", round(s2_5km_chip_cnt)),
):
    print(*_fields)

### Cement

#### 10km Grid

In [None]:
# Spatially join 10km grid cells with exact-location cement plants: one output
# row per (cell, plant) pair whose geometries intersect.
# `predicate=` replaces the `op=` keyword, which was deprecated in geopandas
# 0.10 and removed in 1.0 (requires geopandas >= 0.10).
# NOTE(review): sjoin expects both frames in the same CRS; the plants are
# EPSG:4326 - confirm the grid CRS printed above matches.
cement_ext_10km_intsct = gpd.sjoin(macro_10km_gdf, cement_ext_gdf, how="inner", predicate="intersects")

In [None]:
# Recall (%) on the 10km grid: distinct exact-location cement plants that fall
# in at least one grid cell, over all exact-location cement plants.
# Distinct uids are counted instead of join rows so that a plant intersecting
# more than one cell (e.g. on a shared edge) is not counted twice; this also
# matches how the thresholded recall is computed later in the notebook.
cement_10km_tpr = cement_ext_10km_intsct.uid.nunique() / len(cement_ext_gdf) * 100

In [None]:
# Print recall and its complement (miss rate) for cement on the 10km grid.
for _caption, _pct in (
    ("Cement Recall, 10km Grid: ", cement_10km_tpr),
    ("Cement Miss Rate, 10km Grid: ", 100 - cement_10km_tpr),
):
    print(_caption, _pct, "%")

#### 5km Grid

In [None]:
# Spatially join 5km grid cells with exact-location cement plants: one output
# row per (cell, plant) pair whose geometries intersect.
# `predicate=` replaces the `op=` keyword, which was deprecated in geopandas
# 0.10 and removed in 1.0 (requires geopandas >= 0.10).
# NOTE(review): sjoin expects both frames in the same CRS; the plants are
# EPSG:4326 - confirm the grid CRS printed above matches.
cement_ext_5km_intsct = gpd.sjoin(macro_5km_gdf, cement_ext_gdf, how="inner", predicate="intersects")

In [None]:
# Recall (%) on the 5km grid: distinct exact-location cement plants that fall
# in at least one grid cell, over all exact-location cement plants.
# Distinct uids are counted instead of join rows so that a plant intersecting
# more than one cell (e.g. on a shared edge) is not counted twice; this also
# matches how the thresholded recall is computed later in the notebook.
cement_5km_tpr = cement_ext_5km_intsct.uid.nunique() / len(cement_ext_gdf) * 100

In [None]:
# Print recall and its complement (miss rate) for cement on the 5km grid.
for _caption, _pct in (
    ("Cement Recall, 5km Grid: ", cement_5km_tpr),
    ("Cement Miss Rate, 5km Grid: ", 100 - cement_5km_tpr),
):
    print(_caption, _pct, "%")

#### 10km Grid with Thresholds

In [None]:
preds_thresh = np.arange(0, 0.5, 0.001)
area_thresh_10km = [sum(macro_10km_phys[macro_10km_phys.preds >= thresh].area / 1e6) \
               for thresh in preds_thresh]
cnt_thresh = [cement_ext_10km_intsct[cement_ext_10km_intsct.preds >= thresh].uid.nunique() \
              for thresh in preds_thresh]

In [None]:
# One row per prediction threshold for cement on the 10km grid: remaining
# survey area, plants captured, and the derived rates / chip counts.
_n_cement_exact = len(cement_ext_gdf)
cement_survey_10km_df = pd.DataFrame(
    {"thresh": preds_thresh, "area": area_thresh_10km, "cnt": cnt_thresh}
)
# Recall and miss rate, in percent of exact-location cement plants.
cement_survey_10km_df["tpr"] = cement_survey_10km_df["cnt"] / _n_cement_exact * 100
cement_survey_10km_df["fnr"] = 100 - cement_survey_10km_df["tpr"]
# Survey area as a percentage of china_area.
cement_survey_10km_df["pct_area"] = cement_survey_10km_df["area"] / china_area * 100
# Approximate chip counts, rounded to whole chips.
cement_survey_10km_df["l8_chp_cnt"] = (cement_survey_10km_df["area"] / l8_chip_area).round()
cement_survey_10km_df["s2_chp_cnt"] = (cement_survey_10km_df["area"] / s2_chip_area).round()

#### 5km Grid with Thresholds

In [None]:
# Per-cell area (sq km, assuming calc_crs is in metres) and `preds` value are
# loop-invariant, so compute them once instead of re-deriving the geometry
# areas for every threshold.
_cell_area_5km = macro_5km_phys.area / 1e6
_cell_preds_5km = macro_5km_phys.preds

# Survey area that remains when only cells with preds >= threshold are kept.
area_thresh_5km = [_cell_area_5km[_cell_preds_5km >= thresh].sum()
                   for thresh in preds_thresh]
# Distinct exact-location cement plants inside the cells that remain.
cnt_thresh = [cement_ext_5km_intsct[cement_ext_5km_intsct.preds >= thresh].uid.nunique()
              for thresh in preds_thresh]

In [None]:
# One row per prediction threshold for cement on the 5km grid: remaining
# survey area, plants captured, and the derived rates / chip counts.
_n_cement_exact = len(cement_ext_gdf)
cement_survey_5km_df = pd.DataFrame(
    {"thresh": preds_thresh, "area": area_thresh_5km, "cnt": cnt_thresh}
)
# Recall and miss rate, in percent of exact-location cement plants.
cement_survey_5km_df["tpr"] = cement_survey_5km_df["cnt"] / _n_cement_exact * 100
cement_survey_5km_df["fnr"] = 100 - cement_survey_5km_df["tpr"]
# Survey area as a percentage of china_area.
cement_survey_5km_df["pct_area"] = cement_survey_5km_df["area"] / china_area * 100
# Approximate chip counts, rounded to whole chips.
cement_survey_5km_df["l8_chp_cnt"] = (cement_survey_5km_df["area"] / l8_chip_area).round()
cement_survey_5km_df["s2_chp_cnt"] = (cement_survey_5km_df["area"] / s2_chip_area).round()

In [None]:
# Cement recall against the minimum prediction threshold, for both grids.
_c5, _c10 = cement_survey_5km_df, cement_survey_10km_df
plt.plot(_c5["thresh"], _c5["tpr"], 'b-', label='5km Grid')
plt.plot(_c10["thresh"], _c10["tpr"], 'g--', label='10km Grid')
# Star marks row 2 of the 10km table, i.e. the third threshold (0.002).
plt.plot(_c10["thresh"][2], _c10["tpr"][2], 'g*', label='thresh=0.002')
plt.title("Cement: Recall Versus Prediction Threshold")
plt.xlabel("Prediction Minimum Threshold")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Cement recall against the survey area that remains, for both grids.
_c5, _c10 = cement_survey_5km_df, cement_survey_10km_df
plt.plot(_c5["area"], _c5["tpr"], 'b-', label='5km Grid')
plt.plot(_c10["area"], _c10["tpr"], 'g--', label='10km Grid')
# Star marks row 2 of the 10km table, i.e. the third threshold (0.002).
plt.plot(_c10["area"][2], _c10["tpr"][2], 'g*', label='thresh=0.002')
plt.title("Cement: Recall Versus Survey Area")
plt.xlabel("Survey Area (sq km)")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Display the cement 10km threshold table.
cement_survey_10km_df

In [None]:
# Cement recall against the approximate number of Landsat-8 chips, both grids.
_c5, _c10 = cement_survey_5km_df, cement_survey_10km_df
plt.plot(_c5["l8_chp_cnt"], _c5["tpr"], 'b-', label='5km Grid')
plt.plot(_c10["l8_chp_cnt"], _c10["tpr"], 'g--', label='10km Grid')
plt.title("Cement: Recall Versus No. Landsat-8 Chips")
plt.xlabel("Count of Landsat-8 Chips")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Cement recall against the approximate number of Sentinel-2 chips, both grids.
_c5, _c10 = cement_survey_5km_df, cement_survey_10km_df
plt.plot(_c5["s2_chp_cnt"], _c5["tpr"], 'b-', label='5km Grid')
plt.plot(_c10["s2_chp_cnt"], _c10["tpr"], 'g--', label='10km Grid')
plt.title("Cement: Recall Versus No. Sentinel-2 Chips")
plt.xlabel("Count of Sentinel-2 Chips")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

### Steel

#### 10km Grid

In [None]:
# Spatially join 10km grid cells with exact-location steel plants: one output
# row per (cell, plant) pair whose geometries intersect.
# `predicate=` replaces the `op=` keyword, which was deprecated in geopandas
# 0.10 and removed in 1.0 (requires geopandas >= 0.10).
# NOTE(review): sjoin expects both frames in the same CRS; the plants are
# EPSG:4326 - confirm the grid CRS printed above matches.
steel_ext_10km_intsct = gpd.sjoin(macro_10km_gdf, steel_ext_gdf, how="inner", predicate="intersects")

In [None]:
# Recall (%) on the 10km grid: distinct exact-location steel plants that fall
# in at least one grid cell, over all exact-location steel plants.
# Distinct uids are counted instead of join rows so that a plant intersecting
# more than one cell (e.g. on a shared edge) is not counted twice; this also
# matches how the thresholded recall is computed later in the notebook.
steel_10km_tpr = steel_ext_10km_intsct.uid.nunique() / len(steel_ext_gdf) * 100

In [None]:
# Print recall and its complement (miss rate) for steel on the 10km grid.
for _caption, _pct in (
    ("Steel Recall, 10km Grid: ", steel_10km_tpr),
    ("Steel Miss Rate, 10km Grid: ", 100 - steel_10km_tpr),
):
    print(_caption, _pct, "%")

#### 5km Grid

In [None]:
# Spatially join 5km grid cells with exact-location steel plants: one output
# row per (cell, plant) pair whose geometries intersect.
# `predicate=` replaces the `op=` keyword, which was deprecated in geopandas
# 0.10 and removed in 1.0 (requires geopandas >= 0.10).
# NOTE(review): sjoin expects both frames in the same CRS; the plants are
# EPSG:4326 - confirm the grid CRS printed above matches.
steel_ext_5km_intsct = gpd.sjoin(macro_5km_gdf, steel_ext_gdf, how="inner", predicate="intersects")

In [None]:
# Recall (%) on the 5km grid: distinct exact-location steel plants that fall
# in at least one grid cell, over all exact-location steel plants.
# Distinct uids are counted instead of join rows so that a plant intersecting
# more than one cell (e.g. on a shared edge) is not counted twice; this also
# matches how the thresholded recall is computed later in the notebook.
steel_5km_tpr = steel_ext_5km_intsct.uid.nunique() / len(steel_ext_gdf) * 100

In [None]:
# Print recall and its complement (miss rate) for steel on the 5km grid.
for _caption, _pct in (
    ("Steel Recall, 5km Grid: ", steel_5km_tpr),
    ("Steel Miss Rate, 5km Grid: ", 100 - steel_5km_tpr),
):
    print(_caption, _pct, "%")

#### 10km Grid with Thresholds

In [None]:
# For every threshold, count the distinct exact-location steel plants joined
# to a 10km cell whose `preds` value is at or above that threshold.
_steel_10km_preds = steel_ext_10km_intsct["preds"]
cnt_thresh = [
    steel_ext_10km_intsct.loc[_steel_10km_preds >= cutoff, "uid"].nunique()
    for cutoff in preds_thresh
]

In [None]:
# One row per prediction threshold for steel on the 10km grid: remaining
# survey area, plants captured, and the derived rates / chip counts.
_n_steel_exact = len(steel_ext_gdf)
steel_survey_10km_df = pd.DataFrame(
    {"thresh": preds_thresh, "area": area_thresh_10km, "cnt": cnt_thresh}
)
# Recall and miss rate, in percent of exact-location steel plants.
steel_survey_10km_df["tpr"] = steel_survey_10km_df["cnt"] / _n_steel_exact * 100
steel_survey_10km_df["fnr"] = 100 - steel_survey_10km_df["tpr"]
# Survey area as a percentage of china_area.
steel_survey_10km_df["pct_area"] = steel_survey_10km_df["area"] / china_area * 100
# Approximate chip counts, rounded to whole chips.
steel_survey_10km_df["l8_chp_cnt"] = (steel_survey_10km_df["area"] / l8_chip_area).round()
steel_survey_10km_df["s2_chp_cnt"] = (steel_survey_10km_df["area"] / s2_chip_area).round()

#### 5km Grid with Thresholds

In [None]:
# For every threshold, count the distinct exact-location steel plants joined
# to a 5km cell whose `preds` value is at or above that threshold.
_steel_5km_preds = steel_ext_5km_intsct["preds"]
cnt_thresh = [
    steel_ext_5km_intsct.loc[_steel_5km_preds >= cutoff, "uid"].nunique()
    for cutoff in preds_thresh
]

In [None]:
# One row per prediction threshold for steel on the 5km grid: remaining
# survey area, plants captured, and the derived rates / chip counts.
_n_steel_exact = len(steel_ext_gdf)
steel_survey_5km_df = pd.DataFrame(
    {"thresh": preds_thresh, "area": area_thresh_5km, "cnt": cnt_thresh}
)
# Recall and miss rate, in percent of exact-location steel plants.
steel_survey_5km_df["tpr"] = steel_survey_5km_df["cnt"] / _n_steel_exact * 100
steel_survey_5km_df["fnr"] = 100 - steel_survey_5km_df["tpr"]
# Survey area as a percentage of china_area.
steel_survey_5km_df["pct_area"] = steel_survey_5km_df["area"] / china_area * 100
# Approximate chip counts, rounded to whole chips.
steel_survey_5km_df["l8_chp_cnt"] = (steel_survey_5km_df["area"] / l8_chip_area).round()
steel_survey_5km_df["s2_chp_cnt"] = (steel_survey_5km_df["area"] / s2_chip_area).round()

In [None]:
# Steel recall against the minimum prediction threshold, for both grids.
_s5, _s10 = steel_survey_5km_df, steel_survey_10km_df
plt.plot(_s5["thresh"], _s5["tpr"], 'b-', label='5km Grid')
plt.plot(_s10["thresh"], _s10["tpr"], 'g--', label='10km Grid')
# Star marks row 2 of the 10km table, i.e. the third threshold (0.002).
plt.plot(_s10["thresh"][2], _s10["tpr"][2], 'g*', label='thresh=0.002')
plt.title("Steel: Recall Versus Prediction Threshold")
plt.xlabel("Prediction Minimum Threshold")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Steel recall against the survey area that remains, for both grids.
_s5, _s10 = steel_survey_5km_df, steel_survey_10km_df
plt.plot(_s5["area"], _s5["tpr"], 'b-', label='5km Grid')
plt.plot(_s10["area"], _s10["tpr"], 'g--', label='10km Grid')
# Star marks row 2 of the 10km table, i.e. the third threshold (0.002).
plt.plot(_s10["area"][2], _s10["tpr"][2], 'g*', label='thresh=0.002')
plt.title("Steel: Recall Versus Survey Area")
plt.xlabel("Survey Area (sq km)")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Display the steel 10km threshold table.
steel_survey_10km_df

In [None]:
# Steel recall against the approximate number of Landsat-8 chips, both grids.
_s5, _s10 = steel_survey_5km_df, steel_survey_10km_df
plt.plot(_s5["l8_chp_cnt"], _s5["tpr"], 'b-', label='5km Grid')
plt.plot(_s10["l8_chp_cnt"], _s10["tpr"], 'g--', label='10km Grid')
plt.title("Steel: Recall Versus No. Landsat-8 Chips")
plt.xlabel("Count of Landsat-8 Chips")
plt.ylabel("Recall (True Positive Rate)")
plt.legend()
plt.show()

In [None]:
# Steel recall against the approximate number of Sentinel-2 chips, both grids.
# The x values come from the s2_chp_cnt column; the earlier version plotted
# l8_chp_cnt here (copied from the Landsat-8 cell), so the figure contradicted
# its own title and axis label.
plt.title("Steel: Recall Versus No. Sentinel-2 Chips")
plt.xlabel("Count of Sentinel-2 Chips")
plt.ylabel("Recall (True Positive Rate)")
plt.plot(steel_survey_5km_df.s2_chp_cnt, steel_survey_5km_df.tpr, 'b-', label='5km Grid')
plt.plot(steel_survey_10km_df.s2_chp_cnt, steel_survey_10km_df.tpr, 'g--', label='10km Grid')
plt.legend()
plt.show()