In [66]:
import json

In [67]:
# Load the combined Kubernetes docs; later cells iterate over the result and
# read each entry's 'title' and 'text'.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (matches the other UTF-8 opens in this notebook).
with open('../data_gloss/kubernetes_docs_combined.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [70]:
import hashlib

def generate_document_id(doc, fields=('title',), length=8):
    """Return a short, deterministic hex id derived from selected document fields.

    The values of ``fields`` are joined with '-' and hashed with MD5; the first
    ``length`` hex characters of the digest are returned. MD5 serves only as a
    fingerprint here, not as a security measure.

    With the defaults only the title is hashed, so two documents that share a
    title receive the same id. Pass e.g. ``fields=('title', 'text')`` to tell
    such documents apart.

    Args:
        doc: Mapping holding a string value for every name in ``fields``.
        fields: Sequence of keys of ``doc`` whose values feed the hash, in order.
        length: Number of leading hex characters to keep (an MD5 hex digest
            has 32).

    Returns:
        The id as a lowercase hexadecimal string.

    Raises:
        KeyError: If ``doc`` lacks one of ``fields``.
    """
    # With a single field the join is a no-op, so the default id is exactly
    # the hash of the title.
    chunk_id = '-'.join(doc[field] for field in fields)
    digest = hashlib.md5(chunk_id.encode('utf-8')).hexdigest()
    return digest[:length]

In [71]:
# Stamp every document with its short hash id (sets the 'id' key in place).
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [72]:
# Spot-check a single entry to confirm the 'id' key was attached.
documents[3]

{'title': 'Dynamic Resource Allocation',
 'text': 'A Kubernetes feature that lets you request and share resources among Pods.\nThese resources are often attached\n like hardware\naccelerators.\n\n<!--more-->\n\nWith DRA, device drivers and cluster admins define device _classes_ that are\navailable to _claim_ in workloads. Kubernetes allocates matching devices to\nspecific claims and places the corresponding Pods on nodes that can access the\nallocated devices.',
 'source_file': 'dra.md',
 'id': '6a399164'}

In [73]:
from collections import defaultdict

In [74]:
# Bucket the documents by id; a bucket holding more than one document
# means several documents ended up with the same id.
hashes = defaultdict(list)
for doc in documents:
    hashes[doc['id']].append(doc)

In [75]:
# Fewer distinct ids than documents means at least two documents share an id.
len(hashes), len(documents)

(228, 229)

In [76]:
# Print each id shared by several documents, with the number of documents.
for hash_id, bucket in hashes.items():
    if len(bucket) <= 1:
        continue
    print(hash_id, len(bucket))

0f12ee5c 2


In [77]:
# Inspect the documents that share the colliding id.
hashes['0f12ee5c']

[{'title': 'kubectl',
  'text': 'kubectl controls the Kubernetes cluster manager.\n\nFind more information in [Command line tool](/docs/reference/kubectl/) (`kubectl`).\n\n```shell\nkubectl [flags]\n```',
  'source_file': 'kubectl.md',
  'id': '0f12ee5c'},
 {'title': 'kubectl',
  'text': 'kubectl controls the Kubernetes cluster manager.\n\n Find more information at: https://kubernetes.io/docs/reference/kubectl/\n\n```\nkubectl [flags]\n```',
  'source_file': 'kubectl.md',
  'id': '0f12ee5c'}]

In [78]:
# Persist the id-stamped documents as pretty-printed JSON.
with open('../data_gloss/docs-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [79]:
# Peek at the first lines of the file that was just written.
!head ../data_gloss/docs-ids.json

[
  {
    "title": "Label",
    "text": "Tags objects with identifying attributes that are meaningful and relevant to users.\n\n<!--more--> \n\nLabels are key/value pairs that are attached to objects such as . They are used to organize and to select subsets of objects.",
    "source_file": "label.md",
    "id": "b021df6a"
  },
  {
    "title": "Probe",
    "text": "A check that the  periodically performs against a container that is \nrunning in a pod, that will define container's state and health and informing container's lifecycle.\n\n<!--more-->\n \nTo learn more, read [container probes](/docs/concepts/workloads/pods/pod-lifecycle/#container-probes).",


In [86]:
# Prompt sent to the model for each document. `{text}` is the only placeholder
# and is filled with str.format in generate_questions. The model is asked to
# reply with a bare JSON array of question strings (no code fences).
prompt_template = """
You are a curious student learning Kubernetes. 
Based on the given chunk of Kubernetes documentation, formulate 3-5 questions a student might ask to understand the content. 
Each question should be complete, clear, and answerable using only the information in the text. 
Avoid copying long phrases from the text; use your own words when possible.

Text content:
{text}

Provide the output as parsable JSON without any code blocks, like this:

["question1", "question2", "question3"]
""".strip()

In [87]:
from openai import OpenAI
# Shared API client used by generate_questions. No credentials are passed here,
# so the client has to pick them up from its own defaults/environment.
client = OpenAI()

In [88]:
def generate_questions(doc):
    """Ask the chat model for study questions about one document.

    The document's fields are substituted into ``prompt_template`` and sent as
    a single user message.

    Args:
        doc: Mapping whose keys are passed to ``prompt_template.format``; the
            template uses ``text``.

    Returns:
        The content of the first choice's message, exactly as returned by the
        API (not parsed).
    """
    messages = [{"role": "user", "content": prompt_template.format(**doc)}]
    completion = client.chat.completions.create(model='gpt-4o', messages=messages)
    return completion.choices[0].message.content

In [89]:
from tqdm.auto import tqdm

In [90]:
# Maps document id -> raw model reply (the unparsed value returned by generate_questions).
results = {}

In [91]:
# One model call per distinct id. Ids already present in `results` are
# skipped, so re-running this cell only fills in what is still missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [08:02<00:00,  2.11s/it]


In [92]:
# Number of distinct ids that received a reply.
len(results)

228

In [93]:
# Dump the raw replies (id -> reply string) to disk as indented JSON.
with open("../data_gloss/ground-truth-dataset.json", 'w') as out_file:
    json.dump(results, out_file, indent=4)

print("Dictionary successfully saved.")

Dictionary successfully saved.


In [94]:
import pickle

In [103]:
# Reload the parsed questions (id -> list of question strings).
# NOTE: this file is produced by the cleaning cell that appears further down in
# this notebook (it carries a lower execution count), so on a fresh kernel that
# cell has to run before this one.
# The file is written as UTF-8 with ensure_ascii=False, so it is read back as
# UTF-8 explicitly; the platform default encoding is not guaranteed to match.
with open('../data_gloss/ground-truth-dataset-clean.json', 'rt', encoding='utf-8') as f_in:
    results = json.load(f_in)

In [104]:
# Spot-check the parsed questions stored under one document id.
results['8e9db3d1']

["What defines immutable infrastructure, and why can't it be changed after deployment?",
 'How does immutability help in identifying and mitigating security risks?',
 'What role does automation play in enforcing immutability within an infrastructure?',
 'How are containers an example of immutable infrastructure?',
 'What is the relationship between immutable infrastructure and infrastructure as code?']

In [105]:
# Spot-check the parsed questions stored under another document id.
results['76d16796']

['What is the purpose of a check that runs periodically on a container in a pod?',
 'How does the check influence the lifecycle of a container?',
 'Where can you find more detailed information about container probes?']

In [102]:
import json

RAW_PATH = "../data_gloss/ground-truth-dataset.json"
CLEAN_PATH = "../data_gloss/ground-truth-dataset-clean.json"

# Each value in the raw file is a model reply stored as a string that is
# expected to contain a JSON array of questions.
with open(RAW_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

cleaned = {}
for key, value in data.items():
    try:
        # Parse the stringified list into a real list
        questions = json.loads(value)
        if not isinstance(questions, list):
            # A string or object would otherwise be iterated character by
            # character / key by key and yield bogus "questions".
            raise TypeError(f"expected a JSON array, got {type(questions).__name__}")
        # Strip whitespace/newlines from each question
        cleaned[key] = [q.strip() for q in questions]
    except (json.JSONDecodeError, TypeError, AttributeError) as e:
        # Best effort: report the bad entry and keep the key with an empty
        # list so every id from the raw file is still present in the output.
        print(f"Error parsing key {key}: {e}")
        cleaned[key] = []

# Save the cleaned JSON
with open(CLEAN_PATH, "w", encoding="utf-8") as f:
    json.dump(cleaned, f, indent=4, ensure_ascii=False)

# Report the path that was actually written.
print(f"✅ Cleaned questions saved to {CLEAN_PATH}")


✅ Cleaned questions saved to questions_clean.json


In [107]:
# Shallow copy of `results` (id -> question list); the lists themselves are
# shared with `results`, not duplicated.
parsed_results = {doc_id: json_questions for doc_id, json_questions in results.items()}

In [110]:
# Display the full id -> questions mapping (large output).
parsed_results

{'b021df6a': ['What are labels used for in Kubernetes?',
  'How are labels structured in Kubernetes?',
  'What is the role of key/value pairs in labeling objects?',
  'Why might users find labels meaningful and relevant?',
  'Which Kubernetes objects can have labels attached to them?'],
 '76d16796': ['What is the purpose of a check that runs periodically on a container in a pod?',
  'How does the check influence the lifecycle of a container?',
  'Where can you find more detailed information about container probes?'],
 '0ad333e2': ['What is the primary purpose of a LimitRange in Kubernetes?',
  'How does a LimitRange help manage resources within a namespace?',
  'In what ways can a LimitRange restrict resources for containers or Pods?',
  'Can a LimitRange affect the number of certain resource types created?'],
 '6a399164': ['What is the purpose of the device classes defined by device drivers and cluster admins in Kubernetes?',
  'How does Kubernetes handle allocation of devices to spec

In [108]:
# Look-up table from id to document. Documents that share an id collapse to
# a single entry: the one appearing last in `documents` wins.
doc_index = dict((d['id'], d) for d in documents)

In [109]:
# Display the full id -> document mapping (large output).
doc_index

{'b021df6a': {'title': 'Label',
  'text': 'Tags objects with identifying attributes that are meaningful and relevant to users.\n\n<!--more--> \n\nLabels are key/value pairs that are attached to objects such as . They are used to organize and to select subsets of objects.',
  'source_file': 'label.md',
  'id': 'b021df6a'},
 '76d16796': {'title': 'Probe',
  'text': "A check that the  periodically performs against a container that is \nrunning in a pod, that will define container's state and health and informing container's lifecycle.\n\n<!--more-->\n \nTo learn more, read [container probes](/docs/concepts/workloads/pods/pod-lifecycle/#container-probes).",
  'source_file': 'probe.md',
  'id': '76d16796'},
 '0ad333e2': {'title': 'LimitRange',
  'text': 'Constraints resource consumption per  or ,\nspecified for a particular .\n\n<!--more--> \n\nA [LimitRange](/docs/concepts/policy/limit-range/) either limits the quantity of \nthat can be created (for a particular resource type),\nor the amo

In [115]:
# Flatten id -> questions into one (question, title, id) row per question.
final_results = []
for doc_id, questions in parsed_results.items():
    title = doc_index[doc_id]['title']
    final_results.extend((q, title, doc_id) for q in questions)

In [116]:
# Display every (question, title, id) row (large output).
final_results

[('What are labels used for in Kubernetes?', 'Label', 'b021df6a'),
 ('How are labels structured in Kubernetes?', 'Label', 'b021df6a'),
 ('What is the role of key/value pairs in labeling objects?',
  'Label',
  'b021df6a'),
 ('Why might users find labels meaningful and relevant?', 'Label', 'b021df6a'),
 ('Which Kubernetes objects can have labels attached to them?',
  'Label',
  'b021df6a'),
 ('What is the purpose of a check that runs periodically on a container in a pod?',
  'Probe',
  '76d16796'),
 ('How does the check influence the lifecycle of a container?',
  'Probe',
  '76d16796'),
 ('Where can you find more detailed information about container probes?',
  'Probe',
  '76d16796'),
 ('What is the primary purpose of a LimitRange in Kubernetes?',
  'LimitRange',
  '0ad333e2'),
 ('How does a LimitRange help manage resources within a namespace?',
  'LimitRange',
  '0ad333e2'),
 ('In what ways can a LimitRange restrict resources for containers or Pods?',
  'LimitRange',
  '0ad333e2'),
 ('

In [117]:
import pandas as pd

In [118]:
# Tabulate the rows; the 'document' column holds the document id.
df = pd.DataFrame(final_results, columns=['question', 'title', 'document'])

In [121]:
# Write the ground-truth table as CSV; index=False leaves the row index out of the file.
df.to_csv('../data_gloss/ground-truth-data.csv', index=False)

In [122]:
!head ground-truth-data.csv

question,title,document
What are labels used for in Kubernetes?,Label,b021df6a
How are labels structured in Kubernetes?,Label,b021df6a
What is the role of key/value pairs in labeling objects?,Label,b021df6a
Why might users find labels meaningful and relevant?,Label,b021df6a
Which Kubernetes objects can have labels attached to them?,Label,b021df6a
What is the purpose of a check that runs periodically on a container in a pod?,Probe,76d16796
How does the check influence the lifecycle of a container?,Probe,76d16796
Where can you find more detailed information about container probes?,Probe,76d16796
What is the primary purpose of a LimitRange in Kubernetes?,LimitRange,0ad333e2
