In [1]:
import polars as pl

import requests 

import pickle

In [2]:
# Download the FAQ documents JSON and parse it into Python objects.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [3]:
# Check which Python container the parsed JSON payload came back as.
type(documents_raw)

list

In [4]:
# Count the top-level entries in the parsed payload.
len(documents_raw)

3

In [5]:
# Inspect the type of the first top-level entry.
type(documents_raw[0])

dict

In [6]:
# Report the Python type stored under each key of the first top-level entry.
first_entry = documents_raw[0]
for key, value in first_entry.items():
    print(f'type({key}) = {type(value)}')

type(course) = <class 'str'>
type(documents) = <class 'list'>


In [7]:
# Read the 'course' field of the first top-level entry.
documents_raw[0]['course']

'data-engineering-zoomcamp'

In [8]:
# Peek at the first item in the first entry's 'documents' list.
documents_raw[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [9]:
# Flatten the per-course payload into a single list of records, tagging each
# record with the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Append a new dict instead of assigning into `doc`, so the entries
        # inside `documents_raw` are left exactly as downloaded. 'course' is
        # placed last, the same key order the in-place assignment produced.
        documents.append({**doc, 'course': course_name})

In [10]:
# Total number of flattened records collected across all courses.
len(documents)

948

In [11]:
# Spot-check the first flattened record; it should now carry a 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [12]:
# Tabulate the flattened records, listing 'course' first in the column order.
column_order = ['course', 'section', 'question', 'text']
df = pl.DataFrame(documents, schema=column_order)
df.head()

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - I have registered for…","""You don't need it. You're acce…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What can I do before …","""You can start by installing an…"


In [13]:
# Number of records per course, most frequent first.
df.get_column('course').value_counts(sort=True)

course,count
str,u32
"""data-engineering-zoomcamp""",435
"""machine-learning-zoomcamp""",375
"""mlops-zoomcamp""",138


In [14]:
# Keep only the rows whose course is 'data-engineering-zoomcamp'.
is_de_course = pl.col('course') == 'data-engineering-zoomcamp'
df_de = df.filter(is_de_course)
df_de.head()

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - I have registered for…","""You don't need it. You're acce…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What can I do before …","""You can start by installing an…"


In [15]:
# Number of records per section within the filtered frame, most frequent first.
df_de.get_column('section').value_counts(sort=True)

section,count
str,u32
"""Module 1: Docker and Terraform""",116
"""Module 4: analytics engineerin…",74
"""Module 5: pyspark""",56
"""General course-related questio…",44
"""Module 2: Workflow Orchestrati…",38
…,…
"""Workshop 2 - RisingWave""",18
"""Project""",16
"""Workshop 1 - dlthub""",4
"""Triggers in Mage via CLI""",2


In [16]:
# Persist both representations to the working directory: the Polars frame as
# Parquet, and the list of record dicts as a pickle.
df.write_parquet('df_qa.parquet')
# Binary mode is required because pickle.dump writes bytes.
with open("documents.pkl", "wb") as f:
    pickle.dump(documents, f)