In [13]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an immediate exception
# instead of a confusing JSON decode failure on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of FAQ records, tagging
# every record with the course it belongs to. Each record is copied so that
# documents_raw stays exactly as downloaded.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [14]:
# Spot-check a single flattened record (index 2) to confirm the 'course' key was added.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [15]:
import minsearch

# Build the minsearch index over the flattened FAQ records.
# "course" is registered as a keyword field; search() below filters on it
# through filter_dict and applies boosts to the "question"/"section" text fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Fit on the full document list; the fitted index is the cell's displayed value.
index.fit(documents)

<minsearch.minsearch.Index at 0x7b0eaf91be60>

In [16]:
import os

import google.generativeai as genai

# Never commit API keys to a notebook: read the key from the environment.
# NOTE(review): a literal key was previously hard-coded in this cell, so that
# key must be treated as leaked and revoked.
gemini_api_key = os.environ.get('GEMINI_API_KEY')
if not gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )

genai.configure(api_key=gemini_api_key)

model = genai.GenerativeModel('gemini-1.5-flash')

# Baseline: ask the model directly, without any FAQ context, to compare
# against the RAG answers produced further down.
response = model.generate_content("the course has already started, can I still enroll?")
print(response.text)

It depends on the course.  Some courses allow late enrollment, while others do not.  You need to check with the instructor or the organization offering the course.  Look for information on their website or contact them directly.



In [17]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search against the module-level minsearch ``index``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course that used to be hard-coded here, so existing
            ``search(query)`` calls behave exactly as before.
        num_results: Maximum number of hits to request from the index.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` reads the
        ``section``, ``question`` and ``text`` keys of each hit.
    """
    # Boost factors handed to the index: 3.0 for the question field and
    # 0.5 for the section field.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [18]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ records.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of records exposing ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is
    # explicit rather than hidden at the end of a source line.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One blank-line-terminated entry per retrieved record.
    entries = [
        f"section: {record['section']}\nquestion: {record['question']}\nanswer: {record['text']}\n\n"
        for record in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [22]:
def llm(prompt):
    """Send ``prompt`` to the module-level Gemini ``model`` and return the reply text."""
    return model.generate_content(prompt).text

In [23]:
def rag(query):
    """Answer ``query`` using keyword retrieval followed by the LLM.

    Pipeline: ``search`` fetches FAQ records, ``build_prompt`` turns them
    into a prompt, and ``llm`` produces the final answer text.
    """
    return llm(build_prompt(query, search(query)))

In [24]:
# Try the keyword-search RAG pipeline on a question about running Kafka.
rag('how do I run kafka?')

'The provided text offers instructions for running Kafka in Java and Python contexts.  For Java,  in the project directory, run: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`.  For Python, create a virtual environment using `python -m venv env`, activate it using `source env/bin/activate` (or `env/Scripts/activate` on Windows), install requirements with `pip install -r ../requirements.txt`, and then run your Python files within that environment.  Remember to deactivate the environment using `deactivate` when finished.  Before running the Python files, ensure all Docker images are running.  If you encounter a "Permission denied" error with a build script, use `chmod +x build.sh`.  If you receive a "ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'" error, install `kafka-python-ng` using `pip install kafka-python-ng`.\n'

In [25]:
# Same question that was sent to the bare model earlier, now answered with FAQ context.
rag('the course has already started, can I still enroll?')

"Yes, you can still submit homeworks even if you don't register.  However, remember that there are deadlines for final projects.\n"

In [26]:
from qdrant_client import QdrantClient, models

In [27]:
from qdrant_client import QdrantClient, models

# Every Qdrant cell below uses `qd_client`, but no cell in this notebook
# created it, so a fresh-kernel run failed with NameError.
# NOTE(review): the endpoint is not recorded anywhere in the notebook;
# http://localhost:6333 is assumed (Qdrant's default local port) - confirm
# and adjust to the instance actually used.
qd_client = QdrantClient("http://localhost:6333")

In [28]:
# Vector size used when the collection is created below.
# NOTE(review): must equal the output size of the model named in model_handle - confirm if the model is changed.
EMBEDDING_DIMENSIONALITY = 512
# Model identifier attached to every models.Document, both when indexing and when querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [29]:
# Name of the Qdrant collection shared by the delete/create/upsert/query cells below.
collection_name = "zoomcamp-faq"

In [30]:
# Drop the collection first so the create_collection call in the next cell starts from an empty one.
qd_client.delete_collection(collection_name=collection_name)

True

In [31]:
# Create the collection with cosine distance and vectors of EMBEDDING_DIMENSIONALITY components.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [32]:
# Index the "course" payload field with a keyword schema; vector_search filters
# on this field (presumably the index keeps that filter efficient - confirm).
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [33]:
# One point per FAQ record: the list position is the point id, the question
# and answer joined by a space form the text to embed, and the full record is
# stored as payload. Only models.Document wrappers are built here; the model
# download appears in the upsert cell's output, which suggests the embedding
# itself is computed at that stage.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=record['question'] + ' ' + record['text'],
            model=model_handle,
        ),
        payload=record,
    )
    for position, record in enumerate(documents)
]

In [34]:
# Write all points to the collection in a single call. The progress bars in
# this cell's output show the embedding model files being fetched.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [35]:
# Sample query. Note: no later cell shown here reads this global; vector_search
# has its own `question` parameter that shadows it.
question = 'I just discovered the course. Can I still join it?'

In [36]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve FAQ records from Qdrant that are closest to ``question``.

    Args:
        question: Free-text query; wrapped in ``models.Document`` together
            with ``model_handle`` and passed as the query.
        course: Value the ``course`` payload field must match. Defaults to
            the course that used to be hard-coded here, so existing
            ``vector_search(question)`` calls behave exactly as before.
        limit: Maximum number of points to request.

    Returns:
        A list with the payload of each returned point, in the order
        Qdrant returned them.
    """
    # Kept on purpose: makes it visible in the cell output which retriever ran.
    print('vector_search is used')

    # Only points whose "course" payload equals the requested course qualify.
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course)
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=limit,
        with_payload=True
    )

    return [point.payload for point in query_points.points]

In [37]:
def rag(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    This definition replaces the earlier ``rag`` in this notebook, which was
    hard-wired to keyword ``search``. Rather than keeping two same-named
    functions that differ only in the retriever, the retriever is a parameter.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the records to
            put in the prompt. When omitted, ``vector_search`` is used, so
            ``rag(query)`` behaves as before; pass ``search`` to get the
            keyword-based pipeline.

    Returns:
        The answer text produced by ``llm``.
    """
    # Resolved at call time so the default follows the current vector_search.
    retrieve = vector_search if search_fn is None else search_fn

    search_results = retrieve(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [40]:
# Same Kafka question as in the keyword-search run, now answered through vector_search.
rag('how do I run kafka?')

vector_search is used


'The provided text describes how to run Kafka producers and consumers using Java and Python, and troubleshooting steps for common errors.  There is no single command to "run Kafka".  Instead, the instructions depend on whether you are using Java or Python, and the specific application (producer, consumer, etc.).  For Java applications, you run a command like `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java` from the project directory.  For Python applications, you need to set up a virtual environment (`python -m venv env`), activate it (`source env/bin/activate`), install requirements (`pip install -r ../requirements.txt`), and then run the Python script.  Before running either Java or Python applications, ensure your Kafka broker Docker container is running (`docker ps` and `docker compose up -d`).\n'