In [114]:
import hashlib

In [115]:
import requests 

# Source of the FAQ data: a JSON array with one entry per course, each holding
# a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing JSON
# decoding failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list, tagging every record with the
# course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Copy instead of mutating so documents_raw stays exactly as downloaded.
        documents.append({**doc, 'course': course_name})

In [116]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [117]:
def unique_uuid(document):
    """Return a short, deterministic id derived from a FAQ record.

    The id is the first 8 hex characters of the MD5 digest of a key built
    from the first 10 characters of ``document['text']``, followed by
    ``document['question']`` and ``document['course']``, joined with ``-``.
    Records that agree on all three parts therefore get the same id.
    """
    text_prefix = document['text'][:10]
    fingerprint = "{}-{}-{}".format(text_prefix, document['question'], document['course'])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [118]:
unique_uuid(documents[1])

'e3e0b8eb'

In [119]:
# Stamp every record with its short content-derived id.
for doc in documents:
    doc.update(id=unique_uuid(doc))

In [120]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'e3e0b8eb'}

In [121]:
from collections import defaultdict
hashes = defaultdict(list)

In [122]:
# Group documents by id to detect hash collisions. The mapping is rebuilt from
# scratch on every run: appending to a dict created in another cell would
# double every bucket when this cell is re-executed and report false collisions.
hashes = defaultdict(list)
for doc in documents:
    hashes[doc['id']].append(doc)

In [123]:
len(hashes), len(documents)

(947, 948)

In [124]:
# Report every id that is shared by more than one document.
for hash_id, colliding_docs in hashes.items():
    if len(colliding_docs) <= 1:
        continue
    print(hash_id)

33646674


In [125]:
# Inspect the documents that share the colliding hash id
hashes['33646674']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '33646674'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '33646674'}]

In [126]:
import json 
# Write the id-tagged documents to disk as indented JSON.
with open('documents-with-id.json', 'w') as json_out:
    json.dump(documents, json_out, indent=2)

In [127]:
!head documents-with-id.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "2f401745"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [128]:
# Prompt asking the model for 5 student-style questions about one FAQ record.
# The {section}, {question} and {text} placeholders are filled in with
# str.format; .strip() removes the leading/trailing newlines of the literal.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [129]:
from openai import OpenAI
# Client used for the chat-completion requests in the cells below
client = OpenAI()
# Pick one record to try the prompt on before running it over all documents
doc = documents[2] 

In [130]:
prompt = prompt_template.format(**doc)

In [131]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [135]:
# Trial run: send the single formatted prompt and look at the raw reply text.
chat_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model='gpt-4o', messages=chat_messages)

first_choice = response.choices[0]
json_response = first_choice.message.content
json_response

'["Is it possible to join the course after it has already started?",\n"Can I submit homeworks if I join the course late?",\n"Are there deadlines for turning in final projects?",\n"Will registering late affect my ability to submit assignments?",\n"What should I be cautious of if I register after the course starts?"]'

In [136]:
json.loads(json_response)

['Is it possible to join the course after it has already started?',
 'Can I submit homeworks if I join the course late?',
 'Are there deadlines for turning in final projects?',
 'Will registering late affect my ability to submit assignments?',
 'What should I be cautious of if I register after the course starts?']

In [137]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for student-style questions about one FAQ record.

    Args:
        doc: Mapping providing the fields referenced by ``prompt_template``
            (``section``, ``question`` and ``text``); extra keys are ignored
            by ``str.format``.
        model: Name of the chat model to query. Defaults to ``'gpt-4o'``, the
            value that was previously hard-coded.

    Returns:
        The message content of the first choice, exactly as returned by the
        API. The prompt asks for a JSON array of questions, but nothing here
        parses or validates the reply.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [142]:
from tqdm import tqdm
# Generate questions for every document, keyed by document id. Each new id
# costs one chat-completion request, so the full run takes a while.
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    # Distinct documents can share an id (hash collision); one set of
    # questions per id is enough, so skip the redundant request.
    if doc_id in results:
        continue
    results[doc_id] = generate_questions(doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [41:12<00:00,  2.61s/it]


In [223]:
results

{'2f401745': '["When is the course start date and time?", "How can I subscribe to the course calendar?", "Where should I register before the course starts?", "How can I join the course announcements channel?", "Which Slack channel should I join for the course?"]',
 'e3e0b8eb': '[\n    "What are the prerequisites for this course?",\n    "Where can I find the prerequisites for this course?",\n    "Do I need any knowledge before enrolling in this course?",\n    "Is there a list of skills needed for this course?",\n    "How can I check what I need to know before taking this course?"\n]',
 'da77a135': '["Is it possible to register and join the course after it has already started?",\n"Can I submit homework if I haven\'t registered for the course?",\n"Am I allowed to turn in the final projects late if I join the course after it has started?",\n"Are there deadlines for the final projects if I join the course late?",\n"If I miss the start date, can I still participate in the course\'s activitie

In [226]:
import pickle

# Persist the raw model replies so they can be reloaded from disk.
with open('results.bin', mode='wb') as cache_file:
    pickle.dump(results, cache_file)

In [227]:
# Reload the saved replies. NOTE: unpickling can execute arbitrary code, so
# only load a results.bin that comes from a trusted source.
with open('results.bin', mode='rb') as cache_file:
    results = pickle.load(cache_file)

In [232]:
results['2f401745']

'["When is the course start date and time?", "How can I subscribe to the course calendar?", "Where should I register before the course starts?", "How can I join the course announcements channel?", "Which Slack channel should I join for the course?"]'

In [257]:
# Ids whose replies are deliberately left out of the parsed results.
skipped_ids = {'56578299', '98d4bbc8'}

parsed_results_updated_json = {}
# Ids whose reply could not be decoded; collected instead of aborting the loop.
unparsable_ids = []

for doc_id, json_questions in results.items():
    if doc_id in skipped_ids:
        continue
    try:
        parsed_results_updated_json[doc_id] = json.loads(json_questions)
    except (json.JSONDecodeError, TypeError):
        # The model is only asked to return JSON, not forced to: keep going
        # and report the offending ids so they can be fixed or regenerated.
        unparsable_ids.append(doc_id)

if unparsable_ids:
    print(f"Could not parse the responses for ids: {unparsable_ids}")


In [258]:
parsed_results_updated_json

{'2f401745': ['When is the course start date and time?',
  'How can I subscribe to the course calendar?',
  'Where should I register before the course starts?',
  'How can I join the course announcements channel?',
  'Which Slack channel should I join for the course?'],
 'e3e0b8eb': ['What are the prerequisites for this course?',
  'Where can I find the prerequisites for this course?',
  'Do I need any knowledge before enrolling in this course?',
  'Is there a list of skills needed for this course?',
  'How can I check what I need to know before taking this course?'],
 'da77a135': ['Is it possible to register and join the course after it has already started?',
  "Can I submit homework if I haven't registered for the course?",
  'Am I allowed to turn in the final projects late if I join the course after it has started?',
  'Are there deadlines for the final projects if I join the course late?',
  "If I miss the start date, can I still participate in the course's activities?"],
 '66f8212

In [259]:
doc_index = {d['id']: d for d in documents}

In [260]:
doc_index

{'2f401745': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': '2f401745'},
 'e3e0b8eb': {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': 'e3e0b8eb'},
 'da77a135': {'text': "Yes, even if you don't register, you're still eligibl

In [261]:
# Flatten the per-document question lists into (question, course, doc_id) rows.
final_results = []

for doc_id, generated in parsed_results_updated_json.items():
    course = doc_index[doc_id]['course']
    final_results.extend((question, course, doc_id) for question in generated)

In [262]:
final_results[:10]

[('When is the course start date and time?',
  'data-engineering-zoomcamp',
  '2f401745'),
 ('How can I subscribe to the course calendar?',
  'data-engineering-zoomcamp',
  '2f401745'),
 ('Where should I register before the course starts?',
  'data-engineering-zoomcamp',
  '2f401745'),
 ('How can I join the course announcements channel?',
  'data-engineering-zoomcamp',
  '2f401745'),
 ('Which Slack channel should I join for the course?',
  'data-engineering-zoomcamp',
  '2f401745'),
 ('What are the prerequisites for this course?',
  'data-engineering-zoomcamp',
  'e3e0b8eb'),
 ('Where can I find the prerequisites for this course?',
  'data-engineering-zoomcamp',
  'e3e0b8eb'),
 ('Do I need any knowledge before enrolling in this course?',
  'data-engineering-zoomcamp',
  'e3e0b8eb'),
 ('Is there a list of skills needed for this course?',
  'data-engineering-zoomcamp',
  'e3e0b8eb'),
 ('How can I check what I need to know before taking this course?',
  'data-engineering-zoomcamp',
  'e3e

In [265]:
import pandas as pd
df = pd.DataFrame(final_results, columns = ['question', 'course', 'document'])

In [266]:
df

Unnamed: 0,question,course,document
0,When is the course start date and time?,data-engineering-zoomcamp,2f401745
1,How can I subscribe to the course calendar?,data-engineering-zoomcamp,2f401745
2,Where should I register before the course starts?,data-engineering-zoomcamp,2f401745
3,How can I join the course announcements channel?,data-engineering-zoomcamp,2f401745
4,Which Slack channel should I join for the course?,data-engineering-zoomcamp,2f401745
...,...,...,...
4592,How do I destroy AWS infrastructure created wi...,mlops-zoomcamp,1b4aaffc
4593,What command initializes Terraform with the pr...,mlops-zoomcamp,1b4aaffc
4594,Which command removes infrastructure when usin...,mlops-zoomcamp,1b4aaffc
4595,Where do I add the state file configuration fo...,mlops-zoomcamp,1b4aaffc


In [267]:
df.to_csv('ground-truth-data.csv', index = False)