# How to use
These instructions assume you are using macOS or Linux.

## 1. Install & Setup
Install Python 3.11, then run in a new folder:
```bash
python3.11 -m venv .venv
source .venv/bin/activate
python3.11 -m pip install jupyterlab polars altair pyarrow
```

Save this `.ipynb` file in the same folder you created.

## 2. Get Data
Generate the data on dev from the Rancher console, using this command:
```bash
kubectl get pods --all-namespaces  -o=jsonpath="{range .items[*]}{.metadata.namespace}:{.metadata.name}{'\n'}{range .spec.containers[*]} {.name}:{.resources.requests.cpu}{'\n'}{end}{'\n'}{end}"
```

Save the command's output to a file in the same local folder as the notebook.

## 3. Run Jupyter
In terminal, run in your local folder:
```bash
source .venv/bin/activate
jupyter lab
```

Jupyter will now open in your browser. In the Jupyter left sidebar, open the `.ipynb` file.

## 4. Run the notebook
Set `input_filename` (two code cells below) to the name of the file you saved in step 2.

In the Jupyter menu bar at the top, select `Run > Run All Cells` to execute the notebook.

Charts will be created below each code cell.

In [3]:
"""Imports"""
import re

import polars as pl
import polars.selectors as cs
import altair as alt

In [4]:
"""Process the text into a dataframe for cleaning."""

# Replace the file name with the file you saved from the kubectl command
input_filename = '/Users/sjairam/output0409LZ.txt'
# Decode explicitly as UTF-8 so the result does not depend on the machine's
# locale default encoding.
with open(input_filename, encoding="utf-8") as raw:
    # Just read entire file, it's not that big
    raw_text = raw.read()

# Split by a blank line, to get one chunk per pod (pod line + container lines).
# The kubectl output ends with a blank line, so the split produces an empty
# trailing chunk; drop whitespace-only chunks instead of carrying them along.
raw_chunks = [
    chunk for chunk in re.split(r'\n\s*\n', raw_text) if chunk.strip()
]
# Convert to dataframe after, can't do regex split easily in Polars.
# The dtype is pinned so an empty input still yields a string column that the
# `.str` operations in the cleaning cell can work on.
raw_df = pl.DataFrame(
    data={'raw_chunk': raw_chunks},
    schema={'raw_chunk': pl.Utf8},
)

raw_df.head()

raw_chunk
str
"""argocd:argo-cd-argocd-applicat…"
"""argocd:argo-cd-argocd-applicat…"
"""argocd:argo-cd-argocd-applicat…"
"""argocd:argo-cd-argocd-dex-serv…"
"""argocd:argo-cd-argocd-dex-serv…"


In [5]:
"""Dataframe cleaning"""
# Turn each raw chunk (one "namespace:pod" line followed by one
# " container:cpu" line per container) into one row per container, with the
# CPU request normalised to milliCPUs.
clean_df = raw_df.select(
    # Split each row by newline to get list of: [pod, container_1, ...]
    split_lists=pl.col("raw_chunk").str.split("\n"),
).select(
    # Separate out raw `pod` & `containers` from each inner list
    raw_pod=pl.col("split_lists").list.first(),
    raw_container=pl.col("split_lists").list.slice(1),
).explode(
    # Denormalize containers list: one row per (pod, container) pair
    "raw_container",
).select(
    # Split out the namespace & pod from `raw_pod` (split on the first ":" only)
    split_pod=pl.col("raw_pod").str.splitn(":", 2).struct.rename_fields(["namespace", "pod"]),
    # Split out container & CPU from `raw_container`
    split_container=pl.col("raw_container").str.splitn(":", 2).struct.rename_fields(["container", "raw_cpu"]),
).unnest(
    # Separate splits into columns
    "split_pod", "split_container",
).select(
    # Whitespace strip any indented values
    cs.string().as_expr().str.strip_chars(), ~cs.string(),
    # Extract CPU value vs. unit.
    # All expressions in a single `select` read the *input* frame, so the
    # stripped columns produced above are not visible here. Strip `raw_cpu`
    # explicitly: otherwise trailing whitespace (e.g. "100m ") stops the
    # `$`-anchored unit pattern from matching and the value is then treated
    # as whole CPUs, and a padded plain number (e.g. "1 ") fails the cast.
    cpu_value=pl.col("raw_cpu").str.strip_chars().str.extract(r"^([^A-Za-z]+)").cast(pl.Float64),
    cpu_units=pl.col("raw_cpu").str.strip_chars().str.extract(r"([A-Za-z]+)$"),
).select(
    # Drop any remaining `raw_` columns
    ~cs.starts_with("raw_"),
    # Convert to milliCPUs:
    #   unit "m"  -> value is already in milliCPUs
    #   no unit   -> value is in whole CPUs, so multiply by 1000
    #   otherwise -> unknown unit, leave null (filtered out below)
    milli_cpus=pl.when(
        pl.col("cpu_units") == "m"
    ).then(
        pl.col("cpu_value")
    ).when(
        pl.col("cpu_units").is_null()
    ).then(
        pl.col("cpu_value") * 1000.0
    ).otherwise(None)
)

# Materialize final data (ignore null final values, i.e. containers with no
# CPU request or an unrecognised unit)
final_df = clean_df.filter(pl.col("milli_cpus").is_not_null())
final_df.head()

namespace,pod,container,cpu_value,cpu_units,milli_cpus
str,str,str,f64,str,f64
"""cattle-monitoring-system""","""alertmanager-rancher-monitorin…","""alertmanager""",100.0,"""m""",100.0
"""cattle-monitoring-system""","""alertmanager-rancher-monitorin…","""config-reloader""",200.0,"""m""",200.0
"""cattle-monitoring-system""","""prometheus-rancher-monitoring-…","""prometheus""",750.0,"""m""",750.0
"""cattle-monitoring-system""","""prometheus-rancher-monitoring-…","""config-reloader""",200.0,"""m""",200.0
"""cattle-monitoring-system""","""rancher-monitoring-grafana-5f5…","""grafana""",100.0,"""m""",100.0


In [6]:
"""Plot data"""
# Stacked bar chart: total requested milliCPUs per namespace, with each bar
# segmented (coloured) by container name.
namespace_chart = alt.Chart(final_df).mark_bar()  # type: ignore
namespace_chart.encode(
    alt.X('namespace'),
    alt.Y('sum(milli_cpus)'),
    alt.Color('container'),
)

In [7]:
# Same stacked bar chart, restricted to a single namespace and broken down
# per pod instead of per namespace.
single_namespace_df = final_df.filter(pl.col("namespace") == "drs-pipelines-dev")
pod_chart = alt.Chart(single_namespace_df).mark_bar()  # type: ignore
pod_chart.encode(
    alt.X('pod'),
    alt.Y('sum(milli_cpus)'),
    alt.Color('container'),
)

In [8]:
"""Totals"""
# Add up every container's request in milliCPUs; the cell then displays the
# total divided by 1000.
total_milli_cpus = final_df["milli_cpus"].sum()
total_milli_cpus / 1000

18.795

In [9]:
# Stacked bar chart of each namespace's share of all requested CPU.
# `milli_cpus` is rescaled to a percentage of `total_milli_cpus` (the sum over
# all rows of `final_df`); the other columns are kept unchanged.
percent_df = final_df.with_columns(
    milli_cpus=(pl.col("milli_cpus") / total_milli_cpus) * 100
)
alt.Chart(percent_df).mark_bar().encode(  # type: ignore
    x='namespace',
    # The column now holds percentages, so replace the auto-generated
    # "Sum of milli_cpus" axis title, which would mislabel the values.
    y=alt.Y('sum(milli_cpus)', title='% of total CPU requests'),
    color='container',
)