diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
new file mode 100644
index 0000000..fbfcc21
--- /dev/null
+++ b/.github/workflows/docker.yaml
@@ -0,0 +1,56 @@
+name: Docker image
+# Run the tasks on every push
+on: push
+jobs:
+  build_push_api:
+    name: Build and push execution environment
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Set up Docker layer caching
+        uses: actions/cache@v2
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Calculate metadata for image
+        id: image-meta
+        uses: docker/metadata-action@v3
+        with:
+          images: ghcr.io/stackhpc/os-capacity
+          # Produce the branch name or tag and the SHA as tags
+          tags: |
+            type=ref,event=branch
+            type=ref,event=tag
+            type=sha,prefix=
+      - name: Build and push image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.image-meta.outputs.tags }}
+          labels: ${{ steps.image-meta.outputs.labels }}
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache-new,mode=max
+
+      # Temp fix
+      # https://github.com/docker/build-push-action/issues/252
+      # https://github.com/moby/buildkit/issues/1896
+      # https://github.com/docker/buildx/pull/535
+      - name: Move cache
+        run: |
+          rm -rf /tmp/.buildx-cache
+          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..83bcbea
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:22.04
+
+RUN apt-get update && apt-get upgrade -y && apt-get install python3-pip tini -y && apt-get clean
+
+COPY ./requirements.txt /opt/os-capacity/requirements.txt
+RUN pip install -U -r /opt/os-capacity/requirements.txt
+
+COPY ./os_capacity/prometheus.py /opt/os-capacity/prometheus.py
+ENTRYPOINT ["tini", "--"]
+CMD ["python3", "/opt/os-capacity/prometheus.py"]
diff --git a/README.rst b/README.rst
index d26cbad..aba4356 100644
--- a/README.rst
+++ b/README.rst
@@ -3,10 +3,6 @@ os-capacity
 
 This is a prototype tool to extract capacity information.
 
-.. note::
-
-    This is currently quite specific to Ironic powered OpenStack Nova clouds.
-
 Install
 -------
 
@@ -21,139 +17,28 @@ Now lets get that installed inside a virtual environment:
 
 .. code::
 
-    virtualenv .venv-test
-    source .venv-test/bin/activate
+    python3 -m virtualenv .venv
+    source .venv/bin/activate
     pip install -U .
 
 Prometheus Exporter
 -------------------
 
-Assuming you have clouds.yaml in the right place and OS_CLOUD set:
-
-.. code::
-
-    ./os_capacity/prometheus.py
-    openstack_total_capacity_per_flavor{flavor="small"} 1
-    openstack_capacity_by_hostname{hypervisor="aio",flavor="small"} 1
-
-
-TODOs we need support for:
-
-* add request filter support for require_tenant_aggregate,
-  map_az_to_placement_aggregate and compute_status_filter
-
-Configuration
--------------
-
-The easiest way to configure this is to populate a typical OpenStack RC file:
-
-.. code::
-
-    cat > .openrc < mytestrun
+    cat mytestrun
 
-    optional arguments:
-    --version            show program's version number and exit
-    -v, --verbose        Increase verbosity of output. Can be repeated.
-    -q, --quiet          Suppress output except warnings and errors.
-    --log-file LOG_FILE  Specify a file to log output. Disabled by default.
-    -h, --help           Show help message and exit.
-    --debug              Show tracebacks on errors.
+Or just run via docker or similar::
 
-    Commands:
-    complete         print bash completion command
-    flavor list      List all the flavors.
-    help             print detailed help for another command
-    prometheus       To be run as node exporter textfile collector
-    resources all    List all resource providers, with their resources and servers.
-    resources group  Lists counts of resource providers with similar inventories.
-    usages all       List all current resource usages.
-    usages group     Group usage by specified key (by user or project).
+    docker run -d --name os_capacity \
+      --mount type=bind,source=/etc/openstack/,target=/etc/openstack/ \
+      --env OS_CLOUD=openstack --env OS_CLIENT_CONFIG_FILE=/etc/openstack/mycloud.yaml \
+      -p 9000:9000 ghcr.io/stackhpc/os-capacity:master
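Once the container from the README example above is running, the exporter can be spot-checked from Python. This is only an illustrative snippet, not part of the patch; it assumes the -p 9000:9000 mapping shown above, so metrics are served on http://localhost:9000::

    # Fetch the exporter output and keep only the OpenStack capacity samples.
    import urllib.request

    with urllib.request.urlopen("http://localhost:9000/metrics") as response:
        for line in response.read().decode().splitlines():
            if line.startswith("openstack_"):
                print(line)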
diff --git a/os_capacity/prometheus.py b/os_capacity/prometheus.py
index 5c97e6b..ade0478 100755
--- a/os_capacity/prometheus.py
+++ b/os_capacity/prometheus.py
@@ -2,16 +2,19 @@
 
 import collections
 import json
+import time
+import uuid
 
 import openstack
+import prometheus_client as prom_client
+from prometheus_client import core as prom_core
 
 
 def get_capacity_per_flavor(placement_client, flavors):
     capacity_per_flavor = {}
 
     for flavor in flavors:
-        resources, traits = get_placement_request(flavor)
-        max_per_host = get_max_per_host(placement_client, resources, traits)
+        max_per_host = get_max_per_host(placement_client, flavor)
         capacity_per_flavor[flavor.name] = max_per_host
 
     return capacity_per_flavor
@@ -46,7 +49,8 @@ def add_defaults(resources, flavor, skip_vcpu=False):
     return resources, required_traits
 
 
-def get_max_per_host(placement_client, resources, required_traits):
+def get_max_per_host(placement_client, flavor):
+    resources, required_traits = get_placement_request(flavor)
     resource_str = ",".join(
         [key + ":" + str(value) for key, value in resources.items() if value]
     )
@@ -80,7 +84,7 @@ def get_max_per_host(placement_client, resources, required_traits):
         if max_counts:
             count_per_rp[rp_uuid] = min(max_counts)
     if not count_per_rp:
-        print(f"# WARNING - no candidates for: {params}")
+        print(f"# WARNING - no candidate hosts for flavor: {flavor.name} {params}")
     return count_per_rp
 
 
@@ -136,18 +140,29 @@ def get_resource_provider_info(compute_client, placement_client):
     return resource_providers, project_to_aggregate
 
 
-def print_details(compute_client, placement_client):
+def get_host_details(compute_client, placement_client):
     flavors = list(compute_client.flavors())
     capacity_per_flavor = get_capacity_per_flavor(placement_client, flavors)
 
     # total capacity per flavor
+    free_by_flavor_total = prom_core.GaugeMetricFamily(
+        "openstack_free_capacity_by_flavor_total",
+        "Free capacity if you fill the cloud full of each flavor",
+        labels=["flavor_name"],
+    )
     flavor_names = sorted([f.name for f in flavors])
     for flavor_name in flavor_names:
         counts = capacity_per_flavor.get(flavor_name, {}).values()
         total = 0 if not counts else sum(counts)
-        print(f'openstack_total_capacity_per_flavor{{flavor="{flavor_name}"}} {total}')
+        free_by_flavor_total.add_metric([flavor_name], total)
+        # print(f'openstack_free_capacity_by_flavor{{flavor="{flavor_name}"}} {total}')
 
     # capacity per host
+    free_by_flavor_hypervisor = prom_core.GaugeMetricFamily(
+        "openstack_free_capacity_hypervisor_by_flavor",
+        "Free capacity for each hypervisor if you fill remaining space full of each flavor",
+        labels=["hypervisor", "flavor_name", "az_aggregate",
+                "project_aggregate"],
+    )
     resource_providers, project_to_aggregate = get_resource_provider_info(
         compute_client, placement_client
     )
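The get_max_per_host() change above keeps the same placement query: a flavor is collapsed into a single resources string that is sent to GET /allocation_candidates. A rough sketch of that string, with made-up flavor sizes::

    # Hypothetical flavor sizes; only the shape of the query matters here.
    resources = {"VCPU": 8, "MEMORY_MB": 16384, "DISK_GB": 100}

    # Mirrors the ",".join(...) in get_max_per_host: zero-valued resources are skipped.
    resource_str = ",".join(
        key + ":" + str(value) for key, value in resources.items() if value
    )
    params = {"resources": resource_str}
    print(params)  # {'resources': 'VCPU:8,MEMORY_MB:16384,DISK_GB:100'}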
"project_aggregate"], + ) resource_providers, project_to_aggregate = get_resource_provider_info( compute_client, placement_client ) @@ -161,31 +176,161 @@ def print_details(compute_client, placement_client): our_count = all_counts.get(rp_id, 0) if our_count == 0: continue - host_str = f'hypervisor="{hostname}"' - az = rp.get("az") - if az: - host_str += f',az="{az}"' - project_filter = rp.get("project_filter") - if project_filter: - host_str += f',project_filter="{project_filter}"' - print( - f'openstack_capacity_by_hostname{{{host_str},flavor="{flavor_name}"}} {our_count}' + az = rp.get("az", "") + project_filter = rp.get("project_filter", "") + free_by_flavor_hypervisor.add_metric( + [hostname, flavor_name, az, project_filter], our_count ) free_space_found = True if not free_space_found: + # TODO(johngarbutt) allocation candidates only returns some not all candidates! print(f"# WARNING - no free spaces found for {hostname}") + project_filter_aggregates = prom_core.GaugeMetricFamily( + "openstack_project_filter_aggregate", + "Mapping of project_ids to aggregates in the host free capacity info.", + labels=["project_id", "aggregate"], + ) for project, names in project_to_aggregate.items(): for name in names: - print( - f'openstack_project_filter{{project="{project}",aggregate="{name}"}} 1' - ) + project_filter_aggregates.add_metric([project, name], 1) + # print( + # f'openstack_project_filter_aggregate{{project_id="{project}",aggregate="{name}"}} 1' + # ) + return resource_providers, [ + free_by_flavor_total, + free_by_flavor_hypervisor, + project_filter_aggregates, + ] + + +def get_project_usage(indentity_client, placement_client, compute_client): + projects = {proj.id: dict(name=proj.name) for proj in indentity_client.projects()} + for project_id in projects.keys(): + # TODO(johngarbutt) On Xena we should do consumer_type=INSTANCE using 1.38! + response = placement_client.get( + f"/usages?project_id={project_id}", + headers={"OpenStack-API-Version": "placement 1.19"}, + ) + response.raise_for_status() + usages = response.json() + projects[project_id]["usages"] = usages["usages"] + + response = compute_client.get( + f"/os-quota-sets/{project_id}", + headers={"OpenStack-API-Version": "compute 2.20"}, + ) + response.raise_for_status() + quotas = response.json().get("quota_set", {}) + projects[project_id]["quotas"] = dict( + CPUS=quotas.get("cores"), MEMORY_MB=quotas.get("ram") + ) + # print(json.dumps(projects, indent=2)) + + project_usage_guage = prom_core.GaugeMetricFamily( + "openstack_project_usage", + "Current placement allocations per project.", + labels=["project_id", "project_name", "placement_resource"], + ) + project_quota_guage = prom_core.GaugeMetricFamily( + "openstack_project_quota", + "Current quota set to limit max resource allocations per project.", + labels=["project_id", "project_name", "quota_resource"], + ) + for project_id, data in projects.items(): + name = data["name"] + project_usages = data["usages"] + for resource, amount in project_usages.items(): + project_usage_guage.add_metric([project_id, name, resource], amount) + + if not project_usages: + # skip projects with zero usage? 
+ print(f"# WARNING no usage for project: {name} {project_id}") + continue + project_quotas = data["quotas"] + for resource, amount in project_quotas.items(): + project_quota_guage.add_metric([project_id, name, resource], amount) + return [project_usage_guage, project_quota_guage] + + +def get_host_usage(resource_providers, placement_client): + usage_guage = prom_core.GaugeMetricFamily( + "openstack_hypervisor_placement_allocated", + "Currently allocated resource for each provider in placement.", + labels=["hypervisor", "resource"], + ) + capacity_guage = prom_core.GaugeMetricFamily( + "openstack_hypervisor_placement_allocatable_capacity", + "The total allocatable resource in the placement inventory.", + labels=["hypervisor", "resource"], + ) + for name, data in resource_providers.items(): + rp_id = data["uuid"] + response = placement_client.get( + f"/resource_providers/{rp_id}/usages", + headers={"OpenStack-API-Version": "placement 1.19"}, + ) + response.raise_for_status() + rp_usages = response.json()["usages"] + resource_providers[name]["usages"] = rp_usages + + for resource, amount in rp_usages.items(): + usage_guage.add_metric([name, resource], amount) + + response = placement_client.get( + f"/resource_providers/{rp_id}/inventories", + headers={"OpenStack-API-Version": "placement 1.19"}, + ) + response.raise_for_status() + inventories = response.json()["inventories"] + resource_providers[name]["inventories"] = inventories + + for resource, data in inventories.items(): + amount = int(data["total"] * data["allocation_ratio"]) - data["reserved"] + capacity_guage.add_metric([name, resource], amount) + # print(json.dumps(resource_providers, indent=2)) + return [usage_guage, capacity_guage] def print_exporter_data(app): - print_details(app.compute_client, app.placement_client) + print_host_free_details(app.compute_client, app.placement_client) + + +class OpenStackCapacityCollector(object): + def __init__(self): + self.conn = openstack.connect() + openstack.enable_logging(debug=False) + print("got openstack connection") + # for some reason this makes the logging work?! + self.conn.compute.flavors() + + def collect(self): + start_time = time.perf_counter() + collect_id = uuid.uuid4().hex + print(f"Collect started {collect_id}") + guages = [] + + conn = openstack.connect() + openstack.enable_logging(debug=False) + try: + resource_providers, host_guages = get_host_details( + conn.compute, conn.placement + ) + guages += host_guages + guages += get_project_usage(conn.identity, conn.placement, conn.compute) + guages += get_host_usage(resource_providers, conn.placement) + except Exception as e: + print(f"error {e}") + + end_time = time.perf_counter() + duration = end_time - start_time + print(f"Collect complete {collect_id} it took {duration} seconds") + return guages if __name__ == "__main__": - conn = openstack.connect() - print_details(conn.compute, conn.placement) + prom_client.start_http_server(9000) + prom_core.REGISTRY.register(OpenStackCapacityCollector()) + # there must be a better way! + while True: + time.sleep(5000) diff --git a/requirements.txt b/requirements.txt index 351edb0..ef00cfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ cliff>=2.8.0 # Apache os-client-config>=1.28.0 # Apache-2.0 pbr>=2.0.0,!=2.1.0 # Apache-2.0 +prometheus-client==0.16.0