In [1]:
import os
import random
import requests
import json
from collections import defaultdict
from uuid import uuid4
from tqdm.auto import tqdm
import pickle

import pandas as pd

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

True

In [2]:
# Take the API key from the process environment (load_dotenv() was called in
# the import cell) instead of hardcoding it in the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Shared client used by generate_questions() for the chat-completion requests.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:

#def generate_document_id(doc):
#    # combined = f"{doc['course']}-{doc['question']}"
#    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
#    hash_object = hashlib.md5(combined.encode())
#    hash_hex = hash_object.hexdigest()
#    document_id = hash_hex[:8]
#    return document_id
#
# Load the FAQ documents, each tagged with its course name and a unique id.
# The ids come from uuid4() and are therefore different on every generation,
# so the tagged documents are cached in documents-with-ids.json and reused
# when that file already exists.
if os.path.exists('documents-with-ids.json'):
    with open('documents-with-ids.json', 'rt') as f_in:
        documents = json.load(f_in)
else:
    docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
    # Bound the request so a stalled connection cannot hang the notebook.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail with the HTTP error itself rather than a confusing JSON decode
    # error raised while parsing an error page.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    documents = []

    # Flatten the per-course structure into one list of records.
    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            doc['course'] = course_name
            doc['id'] = str(uuid4())
            documents.append(doc)

    # Persist the records so the generated ids stay stable across runs.
    with open('documents-with-ids.json', 'wt') as f_out:
        json.dump(documents, f_out, indent=2)


In [4]:
# Prompt sent to the model for each FAQ record. `{section}`, `{question}` and
# `{text}` are filled in with str.format; the doubled braces around the JSON
# example are escapes that render as literal `{` / `}`.
# NOTE(review): "as fewer words as possible" is probably meant to read
# "as few words as possible" — left unchanged here because this text is sent
# to the model as-is.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Output a JSON object with a single key "questions" whose value is a list of 5 questions, like:
{{"questions": ["Q1", "Q2", "Q3", "Q4", "Q5"]}}
Do not include any extra text, explanation, or formatting.
""".strip()

In [5]:
# Pick one FAQ record at random as a sample for inspection. No random seed is
# set in the visible cells, so the record shown can differ from run to run.
select_doc = random.choice(documents)

select_doc

{'text': 'If you’re running MLflow on a remote VM, you need to forward the port too like we did in Module 1 for Jupyter notebook port 8888. Simply connect your server to VS Code, as we did, and add 5000 to the PORT like in the screenshot:\nAdded by Sharon Ibejih\nIf you are running MLflow locally and 127.0.0.1:5000 shows a blank page navigate to localhost:5000 instead.',
 'section': 'Module 2: Experiment tracking',
 'question': 'MLflow URL (http://127.0.0.1:5000), doesn’t open.',
 'course': 'mlops-zoomcamp',
 'id': '23ab5ad7-424b-414d-ae7e-218aa95397b0'}

In [6]:
# Render the template with the sampled record. Keys the template does not
# reference (such as 'course' and 'id') are simply ignored.
prompt = prompt_template.format_map(select_doc)
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: Module 2: Experiment tracking
question: MLflow URL (http://127.0.0.1:5000), doesn’t open.
answer: If you’re running MLflow on a remote VM, you need to forward the port too like we did in Module 1 for Jupyter notebook port 8888. Simply connect your server to VS Code, as we did, and add 5000 to the PORT like in the screenshot:
Added by Sharon Ibejih
If you are running MLflow locally and 127.0.0.1:5000 shows a blank page navigate to localhost:5000 instead.

Output a JSON object with a single key "questions" whose value is a list of 5 questions, like:
{"questions": ["Q1", "Q2", "Q3", "Q4", "Q5"]}
Do not include any extra text, explanation, or formatting.


In [7]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for questions a student might ask about one FAQ record.

    Args:
        doc: Mapping providing the fields referenced by ``prompt_template``
            (``section``, ``question`` and ``text``). Extra keys are ignored.
        model: Name of the chat-completion model to query. Defaults to
            ``'gpt-4o'``, the value this function previously hard-coded.

    Returns:
        The value stored under the ``"questions"`` key of the model's JSON
        reply (the prompt asks for a list of 5 question strings).

    Raises:
        KeyError: If ``doc`` lacks a field used by the template, or the
            reply has no ``"questions"`` key.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    prompt = prompt_template.format(**doc)

    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # Request a JSON-object reply so the content can be fed to json.loads.
        response_format={"type": "json_object"},
    )
    reply = json.loads(response.choices[0].message.content)
    return reply['questions']

In [None]:
# Build the mapping {document id -> generated questions}. This makes one API
# request per document, so the finished mapping is cached in results.bin and
# reused when that file already exists.
if os.path.exists('results.bin'):
    # NOTE: pickle.load can execute arbitrary code; only load a results.bin
    # that was written by this notebook.
    with open('results.bin', 'rb') as f_in:
        # Must be bound to `parsed_results` — the name the following cells
        # read — so the cached path also works on a fresh kernel.
        parsed_results = pickle.load(f_in)
else:
    parsed_results = {}
    for doc in tqdm(documents):
        doc_id = doc['id']
        questions = generate_questions(doc)
        parsed_results[doc_id] = questions

    with open('results.bin', 'wb') as f_out:
        pickle.dump(parsed_results, f_out, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/948 [00:00<?, ?it/s]

In [9]:
# Index the documents by id so a generated question can be traced back to the
# record it was produced from.
parsed_documents = dict((entry['id'], entry) for entry in documents)

In [10]:
# Flatten {document id: [questions]} into (question, course, document id)
# rows, looking up each document's course once per document.
final_results = []

for doc_id, questions in parsed_results.items():
    course = parsed_documents[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [11]:
# Tabulate the (question, course, document id) rows and save them as the
# ground-truth CSV; the positional index is not written to the file.
df = pd.DataFrame(
    data=final_results,
    columns=['question', 'course', 'document'],
)
df.to_csv(path_or_buf='ground-truth-data.csv', index=False)

df.head()

Unnamed: 0,question,course,document
0,When will the course start?,data-engineering-zoomcamp,a7d2342a-f84f-4e9a-8df5-693a11b0a676
1,What is the exact date and time the course beg...,data-engineering-zoomcamp,a7d2342a-f84f-4e9a-8df5-693a11b0a676
2,How can I subscribe to the course calendar?,data-engineering-zoomcamp,a7d2342a-f84f-4e9a-8df5-693a11b0a676
3,What should I do before the course begins?,data-engineering-zoomcamp,a7d2342a-f84f-4e9a-8df5-693a11b0a676
4,Which platform should I join for course announ...,data-engineering-zoomcamp,a7d2342a-f84f-4e9a-8df5-693a11b0a676


In [12]:
# Display the whole {document id: questions} mapping.
# NOTE(review): this renders every entry (the recorded run covered 948
# documents); consider showing only a small slice to keep the output short.
parsed_results

{'a7d2342a-f84f-4e9a-8df5-693a11b0a676': ['When will the course start?',
  'What is the exact date and time the course begins?',
  'How can I subscribe to the course calendar?',
  'What should I do before the course begins?',
  'Which platform should I join for course announcements?'],
 '5fa2407d-bbe6-46b4-920b-a9f44e25cd2e': ['What are the prerequisites for this course?',
  'Where are the prerequisites listed?',
  'Can you tell me about the course prerequisites?',
  'How can I find the course prerequisites?',
  'What do I need to know before starting this course?'],
 '842f74bf-c5d7-40ac-9cc4-4299ea9697b5': ['Is it possible to enroll after the course has started?',
  'Can I submit assignments without registering for the course initially?',
  'Are there any deadlines for the final project submission after joining late?',
  'What should I be cautious of if joining the course post start-date?',
  'Can I delay all coursework until close to deadlines after joining the course late?'],
 'f538