# Data Cleaning

In [131]:
import json
from pathlib import Path

In [132]:
# Number of seconds in a minute; used to scale per-cycle recipe amounts
# (amount / time) into per-minute rates.
SECONDS = 60

# Input: the raw export that this notebook cleans.
DATA_PATH = Path("../data")
ORIGINAL_DATA = DATA_PATH / "original_data.json"

# Outputs: one cleaned JSON file per section of the notebook.
ITEMS_FILE = DATA_PATH / "items.json"
MACHINES_FILE = DATA_PATH / "machines.json"
RECIPES_FILE = DATA_PATH / "recipes.json"
GENERATORS_FILE = DATA_PATH / "generators.json"
RESOURCES_FILE = DATA_PATH / "resources.json"

# Slugs of items to leave out of the cleaned item set. Any raw item whose
# "slug" appears here is filtered out when the item list is built.
IGNORE_ITEMS = [
    "alien-dna-capsule",
    "alien-protein",
    "bacon-agaric",
    "beryl-nut",
    "biomass",
    "blade-runners",
    "blue-power-slug",
    "chainsaw",
    "factory-carttm",
    "ficsit-coupon",
    "gas-mask",
    "golden-factory-carttm",
    "hatcher-remains",
    "hazmat-suit",
    "hog-remains",
    "hoverpack",
    "jetpack",
    "leaves",
    "liquid-biofuel",
    "medicinal-inhaler",
    "mercer-sphere",
    "mycelia",
    "nobelisk-detonator",
    "object-scanner",
    "packaged-liquid-biofuel",
    "paleberry",
    "parachute",
    "purple-power-slug",
    "rebar-gun",
    "rifle",
    "solid-biofuel",
    "somersloop",
    "spitter-remains",
    "stinger-remains",
    "wood",
    "xeno-basher",
    "xeno-zapper",
    "yellow-power-slug",
    "zipline",
]

In [133]:
# Load the raw export. JSON text is UTF-8 by specification, so the encoding is
# pinned explicitly; without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII characters in names/descriptions.
with open(ORIGINAL_DATA, encoding="utf-8") as f:
    data = json.load(f)

In [134]:
# Show which top-level sections the raw export contains.
data.keys()

dict_keys(['items', 'recipes', 'schematics', 'generators', 'resources', 'miners', 'buildings'])

## Items

In [135]:
# Take every raw item, drop the ones listed in IGNORE_ITEMS, and order the
# remainder by slug.
all_items = list(data["items"].values())
items = sorted(
    (entry for entry in all_items if entry["slug"] not in IGNORE_ITEMS),
    key=lambda entry: entry["slug"],
)

In [136]:
# Strip fields that are not carried into the cleaned output and rename the
# remaining camelCase fields to snake_case.
# NOTE: this mutates the item dicts in place, so the cell raises KeyError if
# it is run a second time without reloading the data.
for item in items:
    for unused_key in (
        "icon",
        "description",
        "stackSize",
        "radioactiveDecay",
        "fluidColor",
    ):
        del item[unused_key]

    item["sink_points"] = item.pop("sinkPoints")
    item["energy_value"] = item.pop("energyValue")


In [137]:
# Re-key the item list as a mapping of slug -> item record.
items = {entry["slug"]: entry for entry in items}

In [138]:
# Reverse lookup used to translate class names in recipes/generators to slugs.
item_slug_by_className = {
    entry["className"]: slug for slug, entry in items.items()
}

## Machines

In [139]:
# Class names of the buildings that are kept as production machines.
machine_names = [
    "Desc_AssemblerMk1_C",
    "Desc_Blender_C",
    "Desc_ConstructorMk1_C",
    "Desc_Converter_C",
    "Desc_FoundryMk1_C",
    "Desc_HadronCollider_C",
    "Desc_ManufacturerMk1_C",
    "Desc_OilRefinery_C",
    "Desc_Packager_C",
    "Desc_QuantumEncoder_C",
    "Desc_SmelterMk1_C",
]
machines = [
    building
    for building in data["buildings"].values()
    if building["className"] in machine_names
]

# Reduce each machine record to its identifying fields plus power consumption.
# NOTE: mutates the records in place, so the cell is not re-runnable without
# reloading the data.
for machine in machines:
    # Read the nested value before the "metadata" entry is removed.
    power = machine["metadata"]["powerConsumption"]

    for unused_key in (
        "icon",
        "description",
        "categories",
        "buildMenuPriority",
        "metadata",
        "size",
    ):
        del machine[unused_key]

    # Ensure no camelCase copy survives; only the snake_case key is kept.
    machine.pop("powerConsumption", None)
    machine["power_consumption"] = power

In [140]:
# Re-key the machine list as a mapping of slug -> machine record.
machines = {machine["slug"]: machine for machine in machines}

In [141]:
# Reverse lookup used to translate a recipe's producing building to a slug.
machine_slug_by_className = {
    record["className"]: slug for slug, record in machines.items()
}

In [142]:
# Display the cleaned machines mapping (keyed by slug) for a visual check.
machines

{'quantum-encoder': {'slug': 'quantum-encoder',
  'name': 'Quantum Encoder',
  'className': 'Desc_QuantumEncoder_C',
  'power_consumption': 0},
 'converter': {'slug': 'converter',
  'name': 'Converter',
  'className': 'Desc_Converter_C',
  'power_consumption': 0},
 'refinery': {'slug': 'refinery',
  'name': 'Refinery',
  'className': 'Desc_OilRefinery_C',
  'power_consumption': 30},
 'foundry': {'slug': 'foundry',
  'name': 'Foundry',
  'className': 'Desc_FoundryMk1_C',
  'power_consumption': 16},
 'packager': {'slug': 'packager',
  'name': 'Packager',
  'className': 'Desc_Packager_C',
  'power_consumption': 10},
 'manufacturer': {'slug': 'manufacturer',
  'name': 'Manufacturer',
  'className': 'Desc_ManufacturerMk1_C',
  'power_consumption': 55},
 'assembler': {'slug': 'assembler',
  'name': 'Assembler',
  'className': 'Desc_AssemblerMk1_C',
  'power_consumption': 15},
 'particle-accelerator': {'slug': 'particle-accelerator',
  'name': 'Particle Accelerator',
  'className': 'Desc_Hadr

## Recipes

In [143]:
# Keep only the recipes whose "inMachine" flag is truthy.
recipes = [r for _, r in data["recipes"].items()]
recipes = [r for r in recipes if r["inMachine"]]

# Any recipe whose slug CONTAINS one of these fragments is dropped
# (substring match, not equality).
excluded_slugs = [
    "biofuel",
    "alternate-coal",
    "protein",
    "biomass",
    "powercrystalshard",
    "aliendnacapsule",
    "nobeliskgas",
    "recipe-fabric-c",
]
recipes = [
    r for r in recipes if not any(excluded in r["slug"] for excluded in excluded_slugs)
]


# Normalise every remaining recipe in place: trim the slug, drop unused flags,
# translate class names to slugs, add per-minute rates and snake_case the
# power fields.
for recipe in recipes:
    # Remove only a literal trailing "-c". str.rstrip("-c") would instead strip
    # every trailing '-' or 'c' character, e.g. "recipe-plastic-c" would become
    # "recipe-plasti" and distinct recipes could collapse onto the same slug.
    slug = recipe["slug"]
    if slug.endswith("-c"):
        slug = slug[: -len("-c")]
    recipe["slug"] = slug

    del recipe["inHand"]
    del recipe["forBuilding"]
    del recipe["inWorkshop"]
    del recipe["inMachine"]
    del recipe["manualTimeMultiplier"]

    for ingredient in recipe["ingredients"]:
        ingredient_className = ingredient["item"]
        # .get() yields None when the class name is not among the kept items
        # (for example an item filtered out earlier); that None is written as-is.
        ingredient["item"] = item_slug_by_className.get(ingredient_className)
        ingredient["amount_per_minute"] = (
            ingredient["amount"] * SECONDS / recipe["time"]
        )

    for product in recipe["products"]:
        product_className = product["item"]
        product["item"] = item_slug_by_className.get(product_className)
        product["amount_per_minute"] = product["amount"] * SECONDS / recipe["time"]

    # Only the first listed building is used; None if it is not a kept machine.
    recipe["produced_in"] = machine_slug_by_className.get(recipe["producedIn"][0])
    del recipe["producedIn"]

    del recipe["className"]

    recipe["is_variable_power"] = recipe["isVariablePower"]
    del recipe["isVariablePower"]
    recipe["min_power"] = recipe["minPower"]
    del recipe["minPower"]
    recipe["max_power"] = recipe["maxPower"]
    del recipe["maxPower"]


In [144]:
# Re-key the recipe list as a mapping of slug -> recipe record. If two recipes
# share a slug, the later one replaces the earlier one.
recipes = {recipe["slug"]: recipe for recipe in recipes}

In [145]:
# Display the cleaned recipes mapping (keyed by slug) for a visual check.
recipes

{'recipe-ironplate': {'slug': 'recipe-ironplate',
  'name': 'Iron Plate',
  'alternate': False,
  'time': 6.0,
  'ingredients': [{'item': 'iron-ingot',
    'amount': 3.0,
    'amount_per_minute': 30.0}],
  'products': [{'item': 'iron-plate',
    'amount': 2.0,
    'amount_per_minute': 20.0}],
  'produced_in': 'constructor',
  'is_variable_power': False,
  'min_power': 0,
  'max_power': 1},
 'recipe-ironrod': {'slug': 'recipe-ironrod',
  'name': 'Iron Rod',
  'alternate': False,
  'time': 4.0,
  'ingredients': [{'item': 'iron-ingot',
    'amount': 1.0,
    'amount_per_minute': 15.0}],
  'products': [{'item': 'iron-rod', 'amount': 1.0, 'amount_per_minute': 15.0}],
  'produced_in': 'constructor',
  'is_variable_power': False,
  'min_power': 0,
  'max_power': 1},
 'recipe-ingotiron': {'slug': 'recipe-ingotiron',
  'name': 'Iron Ingot',
  'alternate': False,
  'time': 2.0,
  'ingredients': [{'item': 'iron-ore',
    'amount': 1.0,
    'amount_per_minute': 30.0}],
  'products': [{'item': 'iro

## Generators

In [146]:
# Take every generator record except the last one in the export's order.
# NOTE(review): the exclusion is positional, so it depends on the ordering of
# the source file -- confirm which generator is meant to be dropped.
generators = list(data["generators"].values())[:-1]

In [147]:
def slugify_generator_name(classname: str) -> str:
    """Convert a class name such as ``Desc_GeneratorCoal_C`` to ``generator-coal``.

    A leading ``Desc_`` and a trailing ``_C`` are removed when present, then
    every uppercase letter is replaced by ``-`` plus its lowercase form. The
    hyphen produced by a leading uppercase letter is dropped.

    Args:
        classname: The raw class name.

    Returns:
        The hyphen-separated lowercase slug.
    """
    prefix, suffix = "Desc_", "_C"

    # Remove the exact prefix/suffix. str.lstrip/rstrip treat their argument as
    # a set of characters, so e.g. "Desc_DecoderC_C" would lose far more than
    # the affixes.
    if classname.startswith(prefix):
        classname = classname[len(prefix):]
    if classname.endswith(suffix):
        classname = classname[: -len(suffix)]

    slug = "".join("-" + c.lower() if c.isupper() else c for c in classname)

    # Only drop a leading hyphen; a name that starts with a lowercase letter
    # must keep its first character.
    return slug[1:] if slug.startswith("-") else slug

In [148]:
# Normalise each generator in place: derive a slug from the class name,
# translate fuel class names to item slugs, and snake_case the kept fields.
for generator in generators:
    generator["slug"] = slugify_generator_name(generator.pop("className"))

    # Fuels whose class name has no matching kept item are left out.
    fuel_slugs = (item_slug_by_className.get(name) for name in generator["fuel"])
    generator["fuel"] = [slug for slug in fuel_slugs if slug is not None]

    del generator["powerProductionExponent"]

    generator["power_production"] = generator.pop("powerProduction")
    generator["water_to_power_ratio"] = generator.pop("waterToPowerRatio")

generators

[{'fuel': ['coal', 'compacted-coal', 'petroleum-coke'],
  'slug': 'generator-coal',
  'power_production': 75,
  'water_to_power_ratio': 10.0},
 {'fuel': ['uranium-fuel-rod', 'plutonium-fuel-rod', 'ficsonium-fuel-rod'],
  'slug': 'generator-nuclear',
  'power_production': 2500,
  'water_to_power_ratio': 1.6},
 {'fuel': ['fuel', 'turbofuel', 'rocket-fuel', 'ionized-fuel'],
  'slug': 'generator-fuel',
  'power_production': 250,
  'water_to_power_ratio': 0.0}]

In [149]:
# Re-key the generator list as a mapping of slug -> generator record.
generators = {generator["slug"]: generator for generator in generators}

In [150]:
# Display the cleaned generators mapping (keyed by slug) for a visual check.
generators

{'generator-coal': {'fuel': ['coal', 'compacted-coal', 'petroleum-coke'],
  'slug': 'generator-coal',
  'power_production': 75,
  'water_to_power_ratio': 10.0},
 'generator-nuclear': {'fuel': ['uranium-fuel-rod',
   'plutonium-fuel-rod',
   'ficsonium-fuel-rod'],
  'slug': 'generator-nuclear',
  'power_production': 2500,
  'water_to_power_ratio': 1.6},
 'generator-fuel': {'fuel': ['fuel',
   'turbofuel',
   'rocket-fuel',
   'ionized-fuel'],
  'slug': 'generator-fuel',
  'power_production': 250,
  'water_to_power_ratio': 0.0}}

## Resources

In [151]:
# Hand-entered table of raw resources and their available amounts.
# NOTE(review): some keys carry an "-ore" suffix ("iron-ore", "caterium-ore")
# while others do not ("copper", "uranium") -- confirm they match the slugs
# used elsewhere.
resources = {
    "bauxite": {"name": "Bauxite", "amount": 12300},
    "caterium-ore": {"name": "Caterium Ore", "amount": 15000},
    "coal": {"name": "Coal", "amount": 42300},
    "copper": {"name": "Copper", "amount": 36900},
    "crude-oil": {"name": "Crude Oil", "amount": 12600},
    "iron-ore": {"name": "Iron Ore", "amount": 92100},
    "limestone": {"name": "Limestone", "amount": 69900},
    "nitrogen-gas": {"name": "Nitrogen Gas", "amount": 12000},
    "raw-quartz": {"name": "Raw Quartz", "amount": 13500},
    "sulfur": {"name": "Sulfur", "amount": 10800},
    "uranium": {"name": "Uranium", "amount": 2100},
    "sam": {"name": "SAM", "amount": 10200},
}
total = sum(entry["amount"] for entry in resources.values())

# Annotate each resource with its share of the total (a 0-1 fraction, despite
# the key name) and the inverse of that share as a rarity score.
for resource in resources.values():
    amount = resource["amount"]
    resource["percentage_of_total"] = amount / total
    resource["rarity"] = total / amount

In [152]:
# Display the resources table with the derived share and rarity fields.
resources

{'bauxite': {'name': 'Bauxite',
  'amount': 12300,
  'percentage_of_total': 0.0373066424021838,
  'rarity': 26.804878048780488},
 'caterium-ore': {'name': 'Caterium Ore',
  'amount': 15000,
  'percentage_of_total': 0.04549590536851683,
  'rarity': 21.98},
 'coal': {'name': 'Coal',
  'amount': 42300,
  'percentage_of_total': 0.12829845313921748,
  'rarity': 7.794326241134752},
 'copper': {'name': 'Copper',
  'amount': 36900,
  'percentage_of_total': 0.11191992720655142,
  'rarity': 8.934959349593496},
 'crude-oil': {'name': 'Crude Oil',
  'amount': 12600,
  'percentage_of_total': 0.03821656050955414,
  'rarity': 26.166666666666668},
 'iron-ore': {'name': 'Iron Ore',
  'amount': 92100,
  'percentage_of_total': 0.27934485896269334,
  'rarity': 3.579804560260586},
 'limestone': {'name': 'Limestone',
  'amount': 69900,
  'percentage_of_total': 0.21201091901728844,
  'rarity': 4.716738197424893},
 'nitrogen-gas': {'name': 'Nitrogen Gas',
  'amount': 12000,
  'percentage_of_total': 0.03639672

## Final Cleanup

In [153]:
# The slug is already the key of each mapping and className was only needed
# for the reverse lookups, so both are removed from the records before saving.
# NOTE: deletes in place; running this cell twice raises KeyError.
for record in (*items.values(), *machines.values()):
    del record["slug"], record["className"]

# Recipes and generators no longer carry a className at this point.
for record in (*recipes.values(), *generators.values()):
    del record["slug"]

## Save to file

In [154]:
# Write the cleaned items mapping as indented JSON.
with ITEMS_FILE.open("w") as out:
    json.dump(items, out, indent=4)

In [155]:
# Write the cleaned machines mapping as indented JSON.
with MACHINES_FILE.open("w") as out:
    json.dump(machines, out, indent=4)

In [156]:
# Write the cleaned recipes mapping as indented JSON.
with RECIPES_FILE.open("w") as out:
    json.dump(recipes, out, indent=4)

In [157]:
# Write the cleaned generators mapping as indented JSON.
with GENERATORS_FILE.open("w") as out:
    json.dump(generators, out, indent=4)

In [158]:
# Write the resources table as indented JSON.
with RESOURCES_FILE.open("w") as out:
    json.dump(resources, out, indent=4)