In [1]:
import json
import minsearch

In [2]:
# Load the raw FAQ export. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default text encoding.
with open("documents.json", "r", encoding="utf-8") as f:
    documents_raw = json.load(f)

In [3]:
# Flatten the per-course groups into one list of FAQ entries, tagging each
# entry with the course it belongs to. Entries are copied instead of mutated,
# so `documents_raw` stays exactly as loaded and the cell is safe to re-run.
documents = [
    {**document, "course": course_data["course"]}
    for course_data in documents_raw
    for document in course_data["documents"]
]

In [4]:
# Create the minsearch index: "question", "text" and "section" are registered
# as text fields, "course" as a keyword field (used for filtering below).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [5]:
index.fit(documents)

<minsearch.Index at 0x7f92a35e39e0>

In [6]:
query = "the course has already started, can I still enroll?"

In [7]:
# Per-field boosts passed to minsearch (question 3.0, section 0.5); fields not
# listed here get no explicit boost.
boost = {"question": 3.0, "section": 0.5}
# Only entries whose "course" equals this value are searched.
filters = {"course": "data-engineering-zoomcamp"}
# Ask for at most five hits for the current `query`.
results = index.search(query=query, boost_dict=boost, filter_dict=filters, num_results=5)

In [8]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [6]:
from llama_cpp import Llama

In [13]:
import os

# Location of the Mistral GGUF weights. Set the MISTRAL_MODEL_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded path is used, so existing setups keep working.
mistral_path = os.environ.get(
    "MISTRAL_MODEL_PATH",
    "/home/krm/projects/llama.cpp/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf",
)

# Load the model with llama-cpp-python's default settings.
client = Llama(model_path=mistral_path)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/krm/projects/llama.cpp/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: 

In [11]:
# Baseline without retrieval: the raw question is the only message sent to the
# model, so the answer cannot draw on the FAQ documents.
response = client.create_chat_completion(
    messages=[
        {"role": "user", "content": query}
    ]
)


llama_print_timings:        load time =    3933.15 ms
llama_print_timings:      sample time =       5.46 ms /    70 runs   (    0.08 ms per token, 12808.78 tokens per second)
llama_print_timings: prompt eval time =    3933.01 ms /    23 tokens (  171.00 ms per token,     5.85 tokens per second)
llama_print_timings:        eval time =   18783.73 ms /    69 runs   (  272.23 ms per token,     3.67 tokens per second)
llama_print_timings:       total time =   22771.58 ms /    92 tokens


In [12]:
response["choices"][0]["message"]["content"]

" It depends on the specific course and the policy of the institution or organization offering the course. Some courses may have a deadline for enrollment, while others may allow students to enroll throughout the semester or academic year. It's best to check with the course instructor or the institution's website to see if enrollment is still possible."

In [13]:
# Prompt skeleton with two placeholders, {question} and {context}, that are
# filled in with str.format further down.
prompt_template = """
You're a course teaching assistant. Answer the `QUESTION` based on the `CONTEXT`.
Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION:
{question}

# CONTEXT:
{context}
"""

In [14]:
# One formatted text block per search hit. The keys are single-quoted because
# reusing the enclosing double quote inside a replacement field is a
# SyntaxError on Python versions before 3.12.
result_texts = [
    f"section: {result['section']}\nquestion: {result['question']}\nanswer: {result['text']}\n"
    for result in results
]

In [15]:
context = "\n".join(result_texts)

In [16]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start wit

In [17]:
prompt = prompt_template.format(question=query, context=context).strip()

In [18]:
print(prompt)

You're a course teaching assistant. Answer the `QUESTION` based on the `CONTEXT`.
Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION:
the course has already started, can I still enroll?

# CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
quest

In [19]:
# Re-create the client with n_ctx=1024 before sending the retrieval prompt.
# NOTE(review): presumably the default context window was too small for the
# prompt plus the generated answer — confirm.
client = Llama(model_path=mistral_path, n_ctx=1024)

# The user message is now the assembled prompt (question + retrieved context)
# rather than the bare question.
response = client.create_chat_completion(
    messages=[
        {"role": "user", "content": prompt}
    ]
)

# Show the text of the first returned choice.
response["choices"][0]["message"]["content"]

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/krm/projects/llama.cpp/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: 

" Based on the provided context, the course has already started. However, it is still possible to enroll and submit homeworks. There will be deadlines for turning in the final projects, so it is recommended to not leave everything for the last minute. After the course finishes, all materials will be kept, and you can follow the course at your own pace. Before the course starts, you can install and set up all the dependencies and requirements, look over the prerequisites and syllabus, and join the course Telegram channel and DataTalks.Club's Slack. Support is also available in the slack channel, and you can ask questions there."

In [8]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for ``query`` in the notebook-level minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course that used to be hard-coded here.
        num_results: Maximum number of hits to request (default 5, the
            previously hard-coded value).

    Returns:
        The value returned by ``index.search`` unchanged; in this notebook
        that is a list of document dicts.
    """
    boost = {"question": 3.0, "section": 0.5}
    filters = {"course": course}
    return index.search(
        query=query,
        boost_dict=boost,
        filter_dict=filters,
        num_results=num_results,
    )

In [9]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            ``"section"``, ``"question"`` and ``"text"``.

    Returns:
        The filled-in template with leading and trailing whitespace removed.
        With no search results the prompt simply ends after ``# CONTEXT:``.

    Raises:
        KeyError: If a search result lacks one of the three required keys.
    """
    prompt_template = """
You're a course teaching assistant. Answer the `QUESTION` based on the `CONTEXT`.
Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION:
{question}

# CONTEXT:
{context}
"""
    # Keys are single-quoted: reusing the enclosing double quote inside a
    # replacement field is a SyntaxError on Python versions before 3.12.
    result_texts = [
        f"section: {result['section']}\nquestion: {result['question']}\nanswer: {result['text']}\n"
        for result in search_results
    ]
    # Each entry already ends in a newline, so joining with "\n" leaves one
    # blank line between consecutive entries.
    context = "\n".join(result_texts)
    return prompt_template.format(question=query, context=context).strip()
    

In [10]:
def llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the notebook-level ``client``. The returned string is the content of
    the first choice in the completion, exactly as the client produced it.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.create_chat_completion(messages=chat_messages)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

In [11]:
query = "How can I run Kafka?"

def rag(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    Args:
        query: The user's question.
        search_fn: Optional callable taking the query and returning the
            search results to put in the prompt. When omitted, the
            minsearch-backed ``search`` function is used, which matches the
            previous behaviour.

    Returns:
        The answer text produced by ``llm`` for the assembled prompt.
    """
    # Resolve the default at call time so the retriever can be swapped without
    # redefining this function.
    if search_fn is None:
        search_fn = search
    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [14]:
client = Llama(model_path=mistral_path, n_ctx=1024)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/krm/projects/llama.cpp/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: 

In [15]:
rag(query)


llama_print_timings:        load time =  154134.17 ms
llama_print_timings:      sample time =      73.20 ms /   431 runs   (    0.17 ms per token,  5888.30 tokens per second)
llama_print_timings: prompt eval time =  184896.59 ms /   588 tokens (  314.45 ms per token,     3.18 tokens per second)
llama_print_timings:        eval time =  369120.64 ms /   430 runs   (  858.42 ms per token,     1.16 tokens per second)
llama_print_timings:       total time =  555772.88 ms /  1018 tokens


' To run Kafka, you can follow these steps:\n\n1. Create a virtual environment and install the required packages:\n```bash\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\n```\n2. Activate the virtual environment every time you need it:\n```bash\nsource env/bin/activate\n```\n3. Deactivate the virtual environment when you\'re done:\n```bash\ndeactivate\n```\nThis works on MacOS, Linux, and Windows. For Windows, the path is slightly different (it\'s `env/Scripts/activate`).\n\nTo run Java Kafka, you can use the following command in the project directory:\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nTo find the schema registry URL in Confluent Cloud, you can follow these steps:\n\n1. Go to the Environment section in Confluent Cloud.\n2. Select the default environment (or the environment you named it).\n3. Click on the right navigation bar and go to "Stream Governance API".\n4. You will f

In [32]:
from elasticsearch import Elasticsearch

In [33]:
es_client = Elasticsearch("http://localhost:9200")

In [34]:
# Single-shard, zero-replica index. The three free-text fields are mapped as
# "text"; "course" is mapped as "keyword" so it can be used in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}
index_name = "course-questions"

# Drop any index left over from an earlier run so the cell can be re-executed:
# creating an index that already exists is rejected by Elasticsearch, and
# indexing the documents again into a surviving index would duplicate them.
# ignore_unavailable=True makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [35]:
from tqdm import tqdm

In [36]:
# Send every FAQ entry to Elasticsearch, one index request per document;
# tqdm renders a progress bar over the loop.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████| 948/948 [00:15<00:00, 60.62it/s]


In [46]:
query = "I just discovered the course. Can I still join?"

def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries for ``query`` from Elasticsearch.

    Uses the notebook-level ``es_client`` and ``index_name``.

    Args:
        query: Free-text question to search for.
        course: Value used in the ``term`` filter on the ``course`` field.
            Defaults to the course that used to be hard-coded here.
        num_results: Value sent as the request's ``size`` (default 5, the
            previously hard-coded value).

    Returns:
        A list with the ``_source`` dict of every returned hit, in the order
        Elasticsearch returned them.
    """
    es_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=es_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [47]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [48]:
def rag(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    This definition replaces the minsearch-based ``rag`` defined earlier in
    the notebook and makes Elasticsearch the default retriever.

    Args:
        query: The user's question.
        search_fn: Optional callable taking the query and returning the
            search results to put in the prompt. When omitted,
            ``elastic_search`` is used, which matches the previous behaviour.
            Pass ``search`` to run the minsearch retriever instead.

    Returns:
        The answer text produced by ``llm`` for the assembled prompt.
    """
    # Resolve the default at call time so the retriever can be swapped without
    # redefining this function yet again.
    if search_fn is None:
        search_fn = elastic_search
    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [50]:
rag(query)

Llama.generate: 51 prefix-match hit, remaining 489 prompt tokens to eval

llama_print_timings:        load time =  154134.17 ms
llama_print_timings:      sample time =      15.19 ms /    97 runs   (    0.16 ms per token,  6386.20 tokens per second)
llama_print_timings: prompt eval time =  133045.19 ms /   489 tokens (  272.08 ms per token,     3.68 tokens per second)
llama_print_timings:        eval time =   65413.89 ms /    96 runs   (  681.39 ms per token,     1.47 tokens per second)
llama_print_timings:       total time =  198665.49 ms /   585 tokens


' Based on the provided context, it appears that the course is a Data Engineering Bootcamp and it is offered in a self-paced mode. The course materials will be kept available after it finishes, and you can continue to work on your final capstone project. You can also ask questions in the Slack channel and tag the bot @ZoomcampQABot for assistance. It is not necessary to register for the course in order to start learning and submitting homework.'

In [2]:
!pip install -q tiktoken

In [3]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")

In [11]:
encoding.encode("I just discovered the course. Can I still join?")

[40, 1327, 16988, 290, 4165, 13, 4101, 357, 2928, 5863, 30]

In [17]:
encoding.decode_single_token_bytes(1)

b'"'

In [18]:
4/10 * 5

2.0