In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-23 18:30:27--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-06-23 18:30:27 (22.7 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [4]:
import minsearch
import json

In [5]:
with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

In [16]:
#here we perform a rearrangement of the documents structure.

documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [17]:
index = minsearch.Index(
    text_fields=['question','text','section'],   #notice that the text fields headers almost stay the same
    keyword_fields = ['course']
)

#understand that the search engine create an embeddings. This embeddings are then passed to the LLM Model, which then returns a probability
#ranking of the answers to tell us the best results.

In [18]:
#now fit the data on index
index.fit(documents)

<minsearch.Index at 0x7520a5def1c0>

In [20]:
q = 'the course has already started, can I still enroll?'

In [37]:
boost = {'question':3.0,'section':0.5}  #ranking of each parameter of the system


results = index.search(
    query=q,
    filter_dict={'course':'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)

#notice the results here are just a search, no rankings.

In [38]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [23]:
#now lets integrate the llm model - we use chatgpt

In [24]:
from openai import OpenAI

In [29]:
client = OpenAI()  #instantiate model

In [26]:
q

'the course has already started, can I still enroll?'

In [27]:
response = client.chat.completions.create(
    model = 'gpt-4o',
    messages = [{'role':'user','content':q}]
)

In [30]:
response   #- notice here we haven't connected our database/context

ChatCompletion(id='chatcmpl-9dMdROnzc0YM130wRU5xKwF5Vy88m', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Many courses have policies that allow students to enroll even after the course has started, although this can depend on a variety of factors including the nature of the course, the institution's policies, and how long after the start date you're looking to enroll.\n\nHere are a few steps you can take to find out if you can still enroll:\n\n1. **Contact the Course Instructor or Coordinator:** They often have the most flexibility and can give you an accurate answer about late enrollment.\n   \n2. **Check the Institution’s Policies:** Many institutions have a grace period for late enrollments. Look for this information on the institution's website or in the course catalog.\n\n3. **Consider Online Courses:** Some online courses offer more flexibility for late enrollments, especially if they are self-paced.\n\n4. **Explain Your Situ

In [31]:
#with prompt, you need to explain what you do, best practice is to give it a role. context is reffered to as the database
prompt_template = """You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only facts from the CONTEXT when answering the QUESTION.
if the CONTEXT doesn't contain the answer, output NONE


QUESTION: {question}

CONTEXT:
{context}

"""

In [41]:
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [42]:
prompt = prompt_template.format(question=q, context=context).strip()

In [43]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only facts from the CONTEXT when answering the QUESTION.
if the CONTEXT doesn't contain the answer, output NONE


QUESTION: the course has already started, can I still enroll?

CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final 

In [46]:
#put content into gpt

response = client.chat.completions.create(
    model = 'gpt-4o',
    messages = [{'role':'user','content':prompt}]  #so notice that building the prompt is to build the question and then answer.
)

response.choices[0].message.content

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [49]:
response

ChatCompletion(id='chatcmpl-9dMqtRDirrdeMsrewNMhom8QUpgqQ', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", role='assistant', function_call=None, tool_calls=None))], created=1719169663, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_888385ccad', usage=CompletionUsage(completion_tokens=43, prompt_tokens=514, total_tokens=557))

In [89]:
def search(query):
    boost = {'question':3.0,'section':0.5}  #ranking of each parameter of the system


    results = index.search(
        query=query,
        filter_dict={'course':'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [87]:
#search('how do I run kafka')

In [169]:
#turn the documents to prompt

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [99]:
def llm(prompt):
    response = client.chat.completions.create(
    model = 'gpt-4o',
    messages = [{'role':'user','content':prompt}]  #so notice that building the prompt is to build the question and then answer.
    )

    return response.choices[0].message.content

In [104]:
query = 'how do I run kafka'

In [103]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [105]:
answer

'To run Kafka, the instructions can vary depending on the context and the specific tools or libraries you are using. Here are some specific examples based on the context provided:\n\n1. **Java Kafka**:\n   - If you are working with Java Kafka, and you want to run a producer or consumer, you can use the following command in the project directory:\n   ```sh\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **Python Kafka**:\n   - If you are dealing with errors like "Module \'kafka\' not found" when running `producer.py`, you should create a virtual environment and install the required packages within that environment. Here\'s how you can do it:\n   ```sh\n   # Create a virtual environment\n   python -m venv env\n   \n   # Activate the virtual environment\n   source env/bin/activate  # On Windows, use `env\\Scripts\\activate`\n   \n   # Install the required packages from requirements.txt\n   pip install -r ../requirements.txt

In [106]:
rag('can i still enrol in the course?')

"Yes, you can still enroll in the course after the start date. Even if you don't register initially, you're eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects. So, make sure not to leave everything for the last minute."

In [110]:
#we want to use elastic-search inplace of min search

from elasticsearch import Elasticsearch

In [111]:
es_client = Elasticsearch('http://localhost:9200')

In [112]:
es_client.info()

ObjectApiResponse({'name': 'e0348a829e92', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'HVxsGU93QwSBal9w4KAACw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [114]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'

es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [115]:
#add progress bar
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [121]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:29<00:00, 32.21it/s]


In [132]:
query = "I just discovered the course. Can I still join?"

In [133]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [134]:
response = es_client.search(index=index_name, body=search_query)

In [135]:
results_docs = []

for hits in response['hits']['hits']:
    results_docs.append(hits['_source'])

In [136]:
results_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [139]:
def elastic_search(query):
    search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hits in response['hits']['hits']:
        result_docs.append(hits['_source'])

    return result_docs

In [140]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [141]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [142]:
rag(query)

"Yes, you can still join the course even if it has already started. You are eligible to submit the homework assignments regardless of when you join. However, be aware that there are deadlines for the final projects, so it's important not to leave everything for the last minute."

In [143]:
## Homework

#get the data

import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [147]:
#index the data

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:29<00:00, 32.37it/s]


In [148]:
query = "How do I execute a command in a running docker container?"

In [152]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

In [153]:
response = es_client.search(index=index_name, body=search_query)

In [154]:
response

ObjectApiResponse({'took': 14, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1035, 'relation': 'eq'}, 'max_score': 84.220634, 'hits': [{'_index': 'course-questions', '_id': 'zdKhRpABoXVR_ZPdrCI7', '_score': 84.220634, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions', '_id': 'gdKwRpABoXVR_ZPdAiYJ', '_score': 84.220634, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker

In [155]:
results_docs = []

for hits in response['hits']['hits']:
    results_docs.append(hits['_source'])

In [156]:
results_docs

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container im

In [170]:

prompt = build_prompt(query, results_docs)

In [171]:
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: How do I execute a command in a running docker container?\n\nCONTEXT:\nsection: 5. Deploying Machine Learning Models\nquestion: How do I debug a docker container?\nanswer: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\nsection: 5. Deploying Machine Learning Models\nquestion: How do I debug a docker container?\nanswer: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the spec

In [172]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

In [173]:
len(encoding.encode(prompt))

544