In [5]:
import pandas as pd
from pathlib import Path

# ---------- CONFIG ----------
# CSV file that is loaded with pd.read_csv further down in this cell.
input_csv_path = Path("get-goals.csv")  # <-- change to your CSV file name/path
# Text file that receives the generated <record> blocks; it is written with
# Path.write_text, so an existing file at this path is overwritten.
output_txt_path = Path("okr_goals_llm.txt")
# -----------------------------


def make_tag_name(column_name: str) -> str:
    """
    Normalize a CSV column header into a tag-friendly name.

    The header is trimmed of surrounding whitespace and lower-cased. Each
    space or '/' then becomes '_', while apostrophes and parentheses are
    dropped. Every other character is left untouched.

    Examples: 'Goal name' -> 'goal_name', 'KR Status Key' -> 'kr_status_key'.
    """
    # Single-character substitutions; None means "delete the character".
    substitutions = str.maketrans({
        " ": "_",
        "/": "_",
        "'": None,
        "(": None,
        ")": None,
    })
    normalized = column_name.strip().lower()
    return normalized.translate(substitutions)


# Read the CSV.
# dtype=str keeps every cell exactly as it is written in the file. With pandas'
# default type inference an integer column that contains any blank cell is
# loaded as float64 (so 12 would be emitted as "12.0"), and values such as
# IDs with leading zeros or "1.50" would be rewritten. Blank cells are still
# loaded as NaN and are turned into empty strings below.
df = pd.read_csv(input_csv_path, dtype=str)

# Precompute tag names for columns
tag_names = {col: make_tag_name(col) for col in df.columns}

lines = []

# Iterate rows as plain tuples (in column order) instead of building a pandas
# Series per row with iterrows(); name=None avoids namedtuple field renaming.
columns = list(df.columns)
for row_values in df.itertuples(index=False, name=None):
    lines.append("<record>")

    for col, value in zip(columns, row_values):
        tag = tag_names[col]

        # Convert NaN / None to empty string
        value_str = "" if pd.isna(value) else str(value)

        # Simple escaping so cell text cannot break the "XML-ish" format.
        # "&" must be replaced first, otherwise the "&" introduced by
        # "&lt;" / "&gt;" would be escaped a second time.
        value_str = (
            value_str
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

        lines.append(f"  <{tag}>{value_str}</{tag}>")

    lines.append("</record>")
    lines.append("")  # blank line between records for readability

# Write to file
output_txt_path.write_text("\n".join(lines), encoding="utf-8")

print(f"Done! Wrote {len(df)} records to {output_txt_path}")

Done! Wrote 8558 records to okr_goals_llm.txt


In [7]:
import tiktoken

# Count tokens in the text
text = output_txt_path.read_text(encoding="utf-8")
encoding = tiktoken.encoding_for_model("gpt-4o")
# The file holds free-form text taken from the CSV, which may contain literal
# special-token strings such as "<|endoftext|>". By default encode() raises
# ValueError when it meets one; disallowed_special=() makes it tokenize such
# strings as ordinary text so the count cannot fail on the file's content.
tokens = encoding.encode(text, disallowed_special=())
print(f"Number of tokens: {len(tokens)}")


Number of tokens: 4195514
