## Import modules

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os

## Read in v4.1 curated asset data

In [None]:
# Relative path of the curated v4.1 cement asset table.
assets_csv = "../../resources/asset-datasets-v4p1/cement_dataset_v4.1.csv"

# Load the full asset table; the subsets built later in this notebook are all
# filtered from this frame.
assets_pd = pd.read_csv(filepath_or_buffer=assets_csv)

## Segment plants into subsamples

This creates separate files with the same (original) schema so that we can easily process them through different parts of the model build/deployment workflow.

In [None]:
# Make sure the output folder for the subset CSVs exists.
# makedirs(exist_ok=True) is a no-op when the directory is already there, so the
# separate listdir() membership check (and the gap between that check and mkdir)
# is not needed. Unlike the membership check, it raises FileExistsError if the
# name is taken by a regular file instead of silently continuing. Note that it
# also creates "../../resources" itself when that folder is missing.
os.makedirs("../../resources/asset-subsets-v4p1", exist_ok=True)

### Segment 1: Operating/non-operating plants

In [None]:
# Row mask: True where the plant's status is exactly "Operating".
is_operating = assets_pd["status"] == "Operating"

# Two complementary subsets; rows with any other (or missing) status land in
# the non-operating frame.
opr_assets_pd = assets_pd.loc[is_operating]
non_opr_assets_pd = assets_pd.loc[~is_operating]

In [None]:
# Report the row count of the full table and of each status-based subset.
for label, frame in (
    ("Global count of plants: ", assets_pd),
    ("Global count of operating plants: ", opr_assets_pd),
    ("Global count of non-operating plants: ", non_opr_assets_pd),
):
    print(label, len(frame))

In [None]:
# Write each status-based subset to its own CSV. index=False keeps the
# DataFrame row index out of the file so only the data columns are written.
for frame, out_csv in (
    (opr_assets_pd, "../../resources/asset-subsets-v4p1/cement_operating_v4.1.csv"),
    (non_opr_assets_pd, "../../resources/asset-subsets-v4p1/cement_non_operating_v4.1.csv"),
):
    frame.to_csv(out_csv, index=False)

### Segment 2: Exact/approximate locations (operating plants)

In [None]:
# Location-accuracy label of every operating plant.
opr_accuracy = opr_assets_pd["accuracy"]

# Operating plants whose accuracy is neither "Exact" nor "Approximate"
# (including missing values) end up in neither subset.
ext_loc_assets_pd = opr_assets_pd.loc[opr_accuracy == "Exact"]
appx_loc_assets_pd = opr_assets_pd.loc[opr_accuracy == "Approximate"]

In [None]:
# Report how many operating plants fall into each location-accuracy subset.
for label, frame in (
    ("Global count of operating plants with exact locations: ", ext_loc_assets_pd),
    ("Global count of operating plants with approximate locations: ", appx_loc_assets_pd),
):
    print(label, len(frame))

In [None]:
# Write the exact- and approximate-location subsets to separate CSVs,
# without the DataFrame row index.
for frame, out_csv in (
    (ext_loc_assets_pd, "../../resources/asset-subsets-v4p1/cement_exact_loc_v4.1.csv"),
    (appx_loc_assets_pd, "../../resources/asset-subsets-v4p1/cement_approximate_loc_v4.1.csv"),
):
    frame.to_csv(out_csv, index=False)

### Segment 3: China/Asia/Global (exact locations, operating plants)

Note:
* "Asia" refers to plants in Asia, excluding China
* "Global" refers to plants outside of Asia
* Names of segments reflect the cumulative nature of the dataset expansion (China -> Asia -> Global)

#### China - exact locations, operating plants

In [None]:
# Exact-location operating plants whose country is exactly "China".
in_china_ext = ext_loc_assets_pd["country"].eq("China")
chn_ext_assets_pd = ext_loc_assets_pd.loc[in_china_ext]

In [None]:
# Save the China exact-location subset; index=False omits the row index.
chn_ext_csv = "../../resources/asset-subsets-v4p1/cement_exact_china_v4.1.csv"
chn_ext_assets_pd.to_csv(chn_ext_csv, index=False)

#### Asia - exact locations, operating plants

In [None]:
# Lookup table of Asian countries; only its "country" column is used for filtering.
asia_cntry_pd = pd.read_csv("../../resources/ancillary/countries-asia-2020.csv")

# Keep exact-location plants located in a listed Asian country, excluding China.
# A membership filter is used instead of an inner merge so that the subset keeps
# exactly the columns of the asset table (the merge would append any additional
# columns of the lookup table) and so that a country listed more than once in
# the lookup table cannot duplicate plant rows.
ext_country = ext_loc_assets_pd["country"]
asia_ext_assets_pd = ext_loc_assets_pd[
    ext_country.isin(asia_cntry_pd["country"]) & (ext_country != "China")
]

In [None]:
# Save the Asia (excluding China) exact-location subset; index=False omits the row index.
asia_ext_csv = "../../resources/asset-subsets-v4p1/cement_exact_asia_v4.1.csv"
asia_ext_assets_pd.to_csv(asia_ext_csv, index=False)

#### Global - exact locations, operating plants

In [None]:
# Flag exact-location plants already assigned to the Asia or China subsets,
# matching on their "uid" values.
ext_uid = ext_loc_assets_pd["uid"]
ext_in_asia = ext_uid.isin(asia_ext_assets_pd["uid"].tolist())
ext_in_china = ext_uid.isin(chn_ext_assets_pd["uid"].tolist())

# "Global" is whatever remains once both of those groups are removed.
gbl_ext_assets_pd = ext_loc_assets_pd[~(ext_in_asia | ext_in_china)]

In [None]:
# Save the rest-of-world exact-location subset; index=False omits the row index.
gbl_ext_csv = "../../resources/asset-subsets-v4p1/cement_exact_global_v4.1.csv"
gbl_ext_assets_pd.to_csv(gbl_ext_csv, index=False)

In [None]:
# Report the size of each regional subset of exact-location operating plants.
for label, frame in (
    ("Count of operating plants with exact locations in China: ", chn_ext_assets_pd),
    ("Count of operating plants with exact locations in rest of Asia: ", asia_ext_assets_pd),
    ("Count of operating plants with exact locations in rest of world: ", gbl_ext_assets_pd),
):
    print(label, len(frame))

### Segment 4: China/Asia/Global (approximate locations, operating plants)

Note:
* "Asia" refers to plants in Asia, excluding China
* "Global" refers to plants outside of Asia
* Names of segments reflect the cumulative nature of the dataset expansion (China -> Asia -> Global)

#### China - approximate locations, operating plants

In [None]:
# Approximate-location operating plants whose country is exactly "China".
in_china_appx = appx_loc_assets_pd["country"].eq("China")
chn_appx_assets_pd = appx_loc_assets_pd.loc[in_china_appx]

In [None]:
# Save the China approximate-location subset; index=False omits the row index.
chn_appx_csv = "../../resources/asset-subsets-v4p1/cement_approximate_china_v4.1.csv"
chn_appx_assets_pd.to_csv(chn_appx_csv, index=False)

#### Asia - approximate locations, operating plants

In [None]:
# Keep approximate-location plants located in a listed Asian country, excluding
# China. A membership filter is used instead of an inner merge so that the subset
# keeps exactly the columns of the asset table (the merge would append any
# additional columns of the lookup table) and so that a country listed more than
# once in the lookup table cannot duplicate plant rows.
appx_country = appx_loc_assets_pd["country"]
asia_appx_assets_pd = appx_loc_assets_pd[
    appx_country.isin(asia_cntry_pd["country"]) & (appx_country != "China")
]

In [None]:
# Save the Asia (excluding China) approximate-location subset; index=False omits the row index.
asia_appx_csv = "../../resources/asset-subsets-v4p1/cement_approximate_asia_v4.1.csv"
asia_appx_assets_pd.to_csv(asia_appx_csv, index=False)

#### Global - approximate locations, operating plants

In [None]:
# Flag approximate-location plants already assigned to the Asia or China
# subsets, matching on their "uid" values.
appx_uid = appx_loc_assets_pd["uid"]
appx_in_asia = appx_uid.isin(asia_appx_assets_pd["uid"].tolist())
appx_in_china = appx_uid.isin(chn_appx_assets_pd["uid"].tolist())

# "Global" is whatever remains once both of those groups are removed.
gbl_appx_assets_pd = appx_loc_assets_pd[~(appx_in_asia | appx_in_china)]

In [None]:
# Save the rest-of-world approximate-location subset; index=False omits the row index.
gbl_appx_csv = "../../resources/asset-subsets-v4p1/cement_approximate_global_v4.1.csv"
gbl_appx_assets_pd.to_csv(gbl_appx_csv, index=False)

In [None]:
# Report the size of each regional subset of approximate-location operating plants.
# The Asia and rest-of-world labels previously read "with exact approximate in ...",
# a copy-paste slip from the exact-location cell; they now name the subset correctly.
print("Count of operating plants with approximate locations in China: ", len(chn_appx_assets_pd))
print("Count of operating plants with approximate locations in rest of Asia: ", len(asia_appx_assets_pd))
print("Count of operating plants with approximate locations in rest of world: ", len(gbl_appx_assets_pd))