In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of records, tagging each
# record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

Vemos que los documentos se recuperan correctamente, pero no tienen un ID asociado, por lo que usamos la siguiente función para generarles un ID único.

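El ID se genera a partir de un hash MD5 de la combinación del curso ('course'), la pregunta ('question') y los primeros 10 caracteres de la respuesta ('text'); del hash se conservan los primeros 8 caracteres hexadecimales.
El problema de este método es que, si modificamos alguno de estos campos, el ID cambia por completo.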

In [3]:
import hashlib

def generate_document_id(doc, text_prefix_len=10):
    """Build a short, deterministic ID for one FAQ record.

    The ID is the first 8 hexadecimal characters of the MD5 digest of the
    string ``"<course>-<question>-<text prefix>"``.

    Args:
        doc: Mapping with string values under the ``'course'``,
            ``'question'`` and ``'text'`` keys.
        text_prefix_len: How many leading characters of ``doc['text']`` are
            mixed into the hash. The default of 10 reproduces the IDs this
            function has always generated. Pass ``None`` to hash the whole
            text, which tells apart records that share course and question
            and whose answers only differ after the first characters.

    Returns:
        An 8-character hexadecimal string.

    Note:
        MD5 is used here only as a fingerprint, not for security. Because
        the digest is truncated to 8 characters, distinct records can still
        end up with the same ID.
    """
    # Slicing with None as the upper bound keeps the full text.
    text_prefix = doc['text'][:text_prefix_len]
    combined = f"{doc['course']}-{doc['question']}-{text_prefix}"
    return hashlib.md5(combined.encode('utf-8')).hexdigest()[:8]

In [4]:
# Attach the generated ID to every record, in place, under the 'id' key.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [5]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

Comprobemos qué tan únicos son estos IDs.

In [6]:
from collections import defaultdict

In [7]:
# Group the records by ID: any ID whose list ends up with more than one
# record is a collision.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [8]:
len(hashes), len(documents) # ideally, these should be equal because IDs should be unique

(947, 948)

Revisamos qué IDs están repetidos, es decir, cuáles corresponden a más de un documento.

In [26]:
# Print each ID shared by more than one record, followed by how many
# records share it.
for k, values in hashes.items():
    if len(values) <= 1:
        continue
    print(k, len(values))

593f7569 2


In [28]:
hashes['593f7569'] # inspect the records that collide on this ID

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [9]:
import json

In [10]:
# Persist the records (now carrying their 'id') as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [11]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [12]:
# Prompt used to generate questions for one FAQ record. The {section},
# {question} and {text} placeholders are filled with str.format from a record
# dict. The model is asked to reply with a bare JSON array of 5 questions.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [14]:
from dotenv import load_dotenv
import os

# Pull the variables defined in a local .env file into the environment.
load_dotenv()

# Read the OpenAI key from the environment; this is None when it is not set.
api_key = os.environ.get('OPENAI_API_KEY')

In [15]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

Comprobamos que la plantilla del prompt se rellena correctamente con un documento de ejemplo.

In [17]:
doc = documents[2]
prompt = prompt_template.format(**doc)

print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [18]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for student-style questions about one FAQ record.

    Uses the module-level ``prompt_template`` and ``client``.

    Args:
        doc: Record dict providing the ``section``, ``question`` and ``text``
            keys that ``prompt_template`` interpolates. Any other keys are
            ignored by ``str.format``.
        model: Name of the chat-completions model to call. Defaults to
            ``'gpt-4o'``.

    Returns:
        The message content of the first returned choice, exactly as the
        model sent it. The prompt requests a JSON array of questions, but
        nothing here validates that, so the caller must be prepared for
        text that does not parse.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [19]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
results = {}

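ESTE PASO SE PUEDE OMITIR: cuesta aproximadamente 4 USD en llamadas a la API de OpenAI. Para omitirlo, carga los resultados ya guardados en `results.bin` (ver más abajo).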

In [25]:
# `results` is keyed by ID, so re-running this cell after an interruption
# only calls the API for records that have no entry yet. Records that share
# an ID are generated once.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

100%|██████████| 948/948 [37:30<00:00,  2.37s/it]  


In [22]:
import pickle

In [26]:
import pickle

# Cache the raw model responses (ID -> response text) on disk so the paid
# generation step does not have to be repeated.
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

Cargamos los resultados guardados para no tener que llamar de nuevo a la API de OpenAI.

In [27]:
# NOTE: pickle.load can execute arbitrary code while deserializing. Only load
# a results.bin that this notebook wrote itself, never one obtained from an
# untrusted source.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [28]:
results['1f6520ca']

'["What prior knowledge is needed for this course?", \n"Which repository lists the prerequisites for the course?", \n"Can you point me to the specific GitHub prerequisites for this course?", \n"Where can I find details on the necessary background for this course?", \n"What should I know before enrolling in this course?"]'

In [29]:
# Turn each raw model response (a JSON array serialized as text) into a list
# of questions, keyed by ID.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as err:
        # The model is only *asked* for valid JSON. Re-raise the same
        # exception type with the ID attached, so the offending response
        # can be located and fixed or regenerated.
        raise json.JSONDecodeError(
            f"{err.msg} (while parsing questions for document id {doc_id!r})",
            err.doc,
            err.pos,
        ) from err

In [30]:
# Lookup table from ID to record. When several records share an ID, the last
# one in `documents` is the one kept.
doc_index = dict((d['id'], d) for d in documents)

In [31]:
# One (question, course, ID) row per generated question; the course is looked
# up from the record the question was generated for.
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [32]:
import pandas as pd

In [33]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [34]:
df.to_csv('ground-truth-data.csv', index=False)

In [35]:
!head ground-truth-data.csv

question,course,document
When exactly does the course start?,data-engineering-zoomcamp,c02e79ef
How do I register for the course before it begins?,data-engineering-zoomcamp,c02e79ef
Where can I find the course schedule?,data-engineering-zoomcamp,c02e79ef
What should I do to join the course announcements?,data-engineering-zoomcamp,c02e79ef
How can I connect with others in the course via Slack?,data-engineering-zoomcamp,c02e79ef
What prior knowledge is needed for this course?,data-engineering-zoomcamp,1f6520ca
Which repository lists the prerequisites for the course?,data-engineering-zoomcamp,1f6520ca
Can you point me to the specific GitHub prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
Where can I find details on the necessary background for this course?,data-engineering-zoomcamp,1f6520ca
