In [171]:
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

In [172]:
import minsearch
import json
import anthropic

from IPython.display import Markdown

In [173]:
CONFIG = "config.json"
# The API key is read from a local JSON config file (under the "api-key"
# entry) instead of being written into the notebook.
with open(CONFIG, 'r') as file:
    data = json.load(file)

api_key = data["api-key"]

In [174]:
# Anthropic API client, authenticated with the key held in `api_key`.
client = anthropic.Anthropic(api_key=api_key)

In [175]:
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

In [176]:
# Load the raw FAQ data from documents.json (the commented-out wget above
# shows where the file comes from).
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [177]:
# Flatten the per-course groups in docs_raw into a single list of FAQ entries,
# tagging each entry with the course it belongs to.
# Each entry is copied rather than patched in place, so docs_raw stays exactly
# as it was loaded and re-running this cell always starts from the same input.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [178]:
# Peek at the first flattened entry to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [179]:
# minsearch index over the FAQ entries: three free-text fields plus `course`
# as a keyword field.
TEXT_FIELDS = ["question", "text", "section"]
KEYWORD_FIELDS = ["course"]

index = minsearch.Index(text_fields=TEXT_FIELDS, keyword_fields=KEYWORD_FIELDS)

In [180]:
# Sample question sent to the model without any FAQ context further down.
q = 'the course has already started, can I still enroll?'

In [181]:
# Fit the minsearch index on the flattened FAQ entries.
index.fit(documents)

<minsearch.Index at 0x761ec25a40a0>

In [182]:
# Baseline: send the bare question to the model with no FAQ context attached.
# Other models tried here: claude-3-opus-20240229, claude-3-haiku-20240307.
baseline_messages = [{"role": "user", "content": q}]

response = client.messages.create(
    model="claude-3-sonnet-20240229",
    max_tokens=1024,
    messages=baseline_messages,
)
Markdown(response.content[0].text)

I'm afraid I don't have enough context to determine if you can still enroll in a specific course that has already started. Policies around late enrollment usually depend on the specific school, program, and course.

Some general tips:

- Check the official academic calendar and course enrollment deadlines for late add/drop periods.

- Contact the professor, department, or registrar's office directly to inquire about their late enrollment policy for that particular course.

- Be prepared to provide a reasonable explanation for the late enrollment request.

- Ask if there are any penalties, extra requirements, or catches up work needed if allowed to enroll late.

- Find out the final deadline to enroll, as many schools have a cut-off date even for late adds.

- Be flexible, as the decision may depend on factors like space availability, how much of the course you've missed, and instructor approval.

The sooner you inquire, the better your chances, but policies can vary significantly. Getting the official late enrollment procedures from the school will give you the best answer for your specific situation.

In [183]:
def minisearch(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the minsearch FAQ index for entries relevant to ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded here.
        num_results: Number of results requested from the index (default 5).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Matches in the question are boosted, matches in the section heading are
    # damped; `text` is not listed (presumably left at the index default).
    field_boosts = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=field_boosts,
        num_results=num_results,
    )

In [184]:
def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        The prompt string, stripped of leading and trailing whitespace. Each
        entry is rendered as a ``section / question / answer`` stanza and the
        stanzas are separated by a blank line.

    Raises:
        KeyError: If an entry lacks one of the three expected keys.
    """
    # Assembled from explicit pieces so that the indentation of this source
    # file does not end up inside the prompt sent to the model.
    prompt_template = (
        "You're a course teaching assistant. "
        "Answer the QUESTION based on CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE.\n"
        "\n"
        "QUESTION: {question}\n"
        "CONTEXT: \n"
        "{context}"
    )

    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # The final strip drops the blank line left after the last stanza.
    return prompt_template.format(question=query, context=context).strip()

In [185]:
def llm(prompt, model="claude-3-sonnet-20240229", max_tokens=1024):
    """Send ``prompt`` to the Anthropic Messages API and return the reply text.

    Args:
        prompt: Text sent as the single user message.
        model: Model identifier. Defaults to the Sonnet model this notebook
            used; ``"claude-3-opus-20240229"`` and
            ``"claude-3-haiku-20240307"`` are the alternatives it also tried.
        max_tokens: Value passed as ``max_tokens`` to the API (default 1024).

    Returns:
        The ``text`` of the first content block of the response, as a plain
        string (wrap it in ``Markdown(...)`` at the call site for rich
        display).
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text

In [186]:
# Two sample questions used to exercise the RAG pipelines below.
query1 = 'the course has already started, can I still enroll?'
query2 = 'how do I run kafka?'

In [187]:
def minisearch_rag(query):
    """Answer ``query`` with the minsearch-backed RAG pipeline.

    Retrieves FAQ entries with ``minisearch``, renders them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved_docs = minisearch(query)
    return llm(build_prompt(query, retrieved_docs))

In [188]:
# minsearch-backed pipeline on the enrollment question.
minisearch_rag(query1)

"Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [189]:
# minsearch-backed pipeline on the Kafka question.
minisearch_rag(query2)

'NONE'

In [190]:
from elasticsearch import Elasticsearch

In [191]:
# Connect to the Elasticsearch node at localhost:9200 and display its info
# response as a quick connectivity check.
es_client = Elasticsearch('http://localhost:9200')
es_client.info()

ObjectApiResponse({'name': '703e4181e920', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'XFUwYlSGRmuDbmIJ3mjrsg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [192]:
# Index definition: the three free-text FAQ fields are analysed as `text`,
# while `course` is a `keyword` so it can be matched exactly in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Start from a clean index so this cell can be re-run: creating an index that
# already exists raises an error, and keeping the old one would make the
# indexing loop store every document a second time.
# ignore_unavailable=True makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [193]:
from tqdm.auto import tqdm

In [194]:
# Store the FAQ entries in Elasticsearch one request at a time; tqdm shows
# the progress of the loop.
doc_progress = tqdm(documents)
for doc in doc_progress:
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [195]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch FAQ index for entries relevant to ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is filtered on with a ``term``
            filter. Defaults to ``"data-engineering-zoomcamp"``, the value
            that used to be hard-coded here.
        num_results: Value sent as the request's ``size`` (default 5).

    Returns:
        A list with the ``_source`` of every hit, in the order the hits
        appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Relevance part: match the question text across the three
                # FAQ fields, with `question` boosted by a factor of 3.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict the results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [196]:
def elastic_rag(query):
    """Answer ``query`` with the Elasticsearch-backed RAG pipeline.

    Retrieves FAQ entries with ``elastic_search``, renders them into a prompt
    with ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [197]:
# Elasticsearch-backed pipeline on the enrollment question.
elastic_rag(query1)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [198]:
# Elasticsearch-backed pipeline on the Kafka question.
elastic_rag(query2)

'Based on the given context from the FAQ database, there is no direct answer to the question "How do I run kafka?". The context mentions running Java Kafka producers/consumers in the terminal and checking Spark version compatibility with Kafka, but it does not provide specific instructions on running Kafka itself.\n\nTherefore, the answer is:\n\nNONE'