# Create v4.1 of Cement and Steel Plants

This notebook fixes some minor issues with unique asset identifiers in v4. Some distinct cement and steel plants have the same **uid**. This script creates v4.1, which appends **a**, **b**, etc. to those identifiers to make them unique.

## Import modules

In [None]:
import os
import shutil

import numpy as np
import pandas as pd

## Read in v4 curated cement and steel data

In [None]:
# Locations of the curated v4 workbooks, relative to this notebook
cement_xlsx = "../../resources/asset-datasets-v4/cement_dataset_v4.xlsx"
steel_xlsx = "../../resources/asset-datasets-v4/steel_dataset_v4.xlsx"

# Load each workbook into a DataFrame (cement first, then steel)
cement_pd, steel_pd = (
    pd.read_excel(xlsx_path) for xlsx_path in (cement_xlsx, steel_xlsx)
)

In [None]:
# Scratch folder for the duplicate-uid reports written further down.
# exist_ok=True makes this cell safe to re-run and replaces the separate
# "list the directory, then create" check with a single call.
os.makedirs("tmp", exist_ok=True)

# Output folder for the v4.1 datasets (missing parent folders are created too).
os.makedirs("../../resources/asset-datasets-v4p1", exist_ok=True)

## Fix cement plant duplicates

In [None]:
# Flag every row whose uid appears more than once (keep=False marks all
# occurrences, not only the repeats), then save those rows for inspection
cement_is_dup = cement_pd.duplicated(subset="uid", keep=False)
cement_dup_pd = cement_pd.loc[cement_is_dup]
cement_dup_pd.to_csv("tmp/cement_v4_duplicates.csv")

In [None]:
# List of duplicated uids
cement_uids_fix = cement_dup_pd["uid"].unique().tolist()

# Append a letter suffix (a, b, c, ...) to every row that shares a duplicated
# uid. The number of suffixes follows the number of matching rows, so a uid
# that occurs three or more times no longer breaks the assignment.
for uid in cement_uids_fix:
    uid_rows = cement_pd["uid"] == uid
    n_rows = int(uid_rows.sum())
    if n_rows < 2:
        # Nothing to disambiguate (e.g. this cell was re-run after the
        # uids were already renamed)
        continue
    if n_rows > 26:
        raise ValueError(
            "uid {!r} occurs {} times; only suffixes a-z are supported".format(uid, n_rows)
        )
    cement_pd.loc[uid_rows, "uid"] = [uid + chr(ord("a") + i) for i in range(n_rows)]

# The purpose of v4.1 is unique identifiers, so fail loudly if any remain shared
assert cement_pd["uid"].is_unique, "cement uids are still not unique"

In [None]:
# Save the corrected cement table as the v4.1 dataset (row index omitted)
cement_v4p1_csv = "../../resources/asset-datasets-v4p1/cement_dataset_v4.1.csv"
cement_pd.to_csv(cement_v4p1_csv, index=False)

## Fix steel plant duplicates

In [None]:
# Flag every row whose uid appears more than once (keep=False marks all
# occurrences, not only the repeats), then save those rows for inspection
steel_is_dup = steel_pd.duplicated(subset="uid", keep=False)
steel_dup_pd = steel_pd.loc[steel_is_dup]
steel_dup_pd.to_csv("tmp/steel_v4_duplicates.csv")

In [None]:
# List of duplicated uids
steel_uids_fix = steel_dup_pd["uid"].unique().tolist()

# Append a letter suffix (a, b, c, ...) to every row that shares a duplicated
# uid. The number of suffixes follows the number of matching rows, so uids
# with three occurrences need no hard-coded special case.
for uid in steel_uids_fix:
    uid_rows = steel_pd["uid"] == uid
    n_rows = int(uid_rows.sum())
    if n_rows < 2:
        # Nothing to disambiguate (e.g. this cell was re-run after the
        # uids were already renamed)
        continue
    if n_rows > 26:
        raise ValueError(
            "uid {!r} occurs {} times; only suffixes a-z are supported".format(uid, n_rows)
        )
    steel_pd.loc[uid_rows, "uid"] = [uid + chr(ord("a") + i) for i in range(n_rows)]

# The purpose of v4.1 is unique identifiers, so fail loudly if any remain shared
assert steel_pd["uid"].is_unique, "steel uids are still not unique"

In [None]:
# Save the corrected steel table as the v4.1 dataset (row index omitted)
steel_v4p1_csv = "../../resources/asset-datasets-v4p1/steel_dataset_v4.1.csv"
steel_pd.to_csv(steel_v4p1_csv, index=False)

## Clean up

In [None]:
# Remove the scratch folder with pure Python so the cell also works where
# the `rm` shell command is unavailable; ignore_errors=True mirrors `rm -f`
# (no error if the folder is already gone).
shutil.rmtree("tmp", ignore_errors=True)