In [155]:
import requests
import elasticsearch
from elasticsearch import Elasticsearch
import openai
from elasticsearch.exceptions import BadRequestError
from openai import OpenAI
import tiktoken
from tqdm.auto import tqdm
import os
from dotenv import load_dotenv
load_dotenv()

True

### Read the course documents and store them in a list

In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

In [3]:
# Download the FAQ JSON. An explicit timeout keeps the cell from hanging
# forever when the network or GitHub is unresponsive (requests has no default
# timeout); raise_for_status() turns HTTP error codes into exceptions.
response = requests.get(docs_url, timeout=30)
response.raise_for_status()

In [4]:
doc_raw = response.json()

In [5]:
# number of courses
len(doc_raw)

3

In [6]:
doc_raw[0].keys()

dict_keys(['course', 'documents'])

In [7]:
for course in doc_raw:
    print(course['course'])

data-engineering-zoomcamp
machine-learning-zoomcamp
mlops-zoomcamp


In [55]:
# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the name of the course it came from. Each entry is copied rather
# than modified in place, so `doc_raw` stays exactly as downloaded and this
# cell can be re-run without side effects on earlier cells.
documents = [
    {**raw, 'course': course['course']}
    for course in doc_raw
    for raw in course['documents']
]

In [56]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [57]:
len(documents)

948

In [58]:
documents[900]

{'text': "Problem: I tried to run starter notebook on pipenv environment but had issues with no output on prints. \nI used scikit-learn==1.2.2 and python==3.10\nTornado version was 6.3.2\n\nSolution: The error you're encountering seems to be a bug related to Tornado, which is a Python web server and networking library. It's used by Jupyter under the hood to handle networking tasks.\nDowngrading to tornado==6.1 fixed the issue\nhttps://stackoverflow.com/questions/54971836/no-output-jupyter-notebook",
 'section': 'Module 4: Deployment',
 'question': 'Pipenv with Jupyter no output',
 'course': 'mlops-zoomcamp'}

### Index the documents in Elasticsearch

In [12]:
# Client for the Elasticsearch node at `host`; every later cell that talks to
# Elasticsearch goes through `es`.
host = 'http://localhost:9200'
es = Elasticsearch(host)

In [13]:
# Name of the index that holds the FAQ entries.
index_name = "course-questions"

# Index definition: a single shard with no replicas. `course` is mapped as a
# keyword field (elastic_search applies a `term` filter to it); the remaining
# fields are mapped as text.
index_settings = dict(
    settings={"number_of_shards": 1, "number_of_replicas": 0},
    mappings={
        "properties": {
            "course": {"type": "keyword"},
            **{field: {"type": "text"} for field in ("question", "text", "section")},
        }
    },
)

In [90]:
# Create the index only if it does not exist yet, so the cell can be re-run.
# An explicit existence check is used instead of swallowing BadRequestError:
# that exception is also raised for invalid settings or mappings, and those
# errors must not be hidden.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)

In [91]:
# Index every FAQ entry. The entry's position in `documents` is used as its
# Elasticsearch id, so re-running this cell overwrites the same entries
# instead of adding a second copy of each one (which is what happens with
# auto-generated ids). If the index was already populated with auto-generated
# ids, delete it first to get rid of the old copies.
for doc_id, doc in enumerate(tqdm(documents)):
    es.index(index=index_name, id=str(doc_id), document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [None]:
# Sanity check: number of documents currently stored in the index.
# The result is kept in its own variable so that the global `response` (the
# HTTP response of the download cell) is not overwritten.
doc_count = es.count(index=index_name)['count']
print(f"Total documents: {doc_count}")

In [95]:
# TO DELETE AN ELASTIC INDEX
# es.indices.delete(index=index_name)

In [19]:
# OpenAI client used by llm(). The API key is read from the OPENAI_API_KEY
# environment variable (load_dotenv() was called in the import cell);
# os.getenv returns None when the variable is not set.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [150]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the FAQ index for entries matching `query`.

    Runs a `multi_match` query of type `best_fields` over the `question`
    field (boosted by 4) and the `text` field, restricted by a `term` filter
    to entries whose `course` equals `course`.

    Args:
        query: Free-text search string.
        course: Course name used in the `term` filter. Defaults to
            "machine-learning-zoomcamp".
        size: Maximum number of hits to return. Defaults to 3.

    Returns:
        A list with one dict per hit, in the order Elasticsearch returned
        them: `{'document': <hit _source>, 'score': <hit _score>}`.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)

    return [
        {'document': hit['_source'], 'score': hit['_score']}
        for hit in response['hits']['hits']
    ]

In [116]:
query = "How do execute a command on a Kubernetes pod?"
results = elastic_search(query)
# Print rank, relevance score and FAQ question for each hit.
for rank, hit in enumerate(results, start=1):
    print(
        f"Result {rank}:",
        f"Score: {hit['score']}",
        f"Question: {hit['document']['question']}",
        "---",
        sep="\n",
    )

Result 1:
Score: 44.50556
Question: How do I debug a docker container?
---
Result 2:
Score: 35.433445
Question: Kubernetes-dashboard
---
Result 3:
Score: 33.70974
Question: How do I copy files from a different folder into docker container’s working directory?
---
Result 4:
Score: 33.2635
Question: How to run a script while a web-server is working?
---
Result 5:
Score: 32.589073
Question: How can I annotate a graph?
---


In [151]:
query = "How do copy a file to a Docker container?"
results = elastic_search(query)
# Print rank, relevance score and FAQ question for each hit.
for rank, hit in enumerate(results, start=1):
    print(
        f"Result {rank}:",
        f"Score: {hit['score']}",
        f"Question: {hit['document']['question']}",
        "---",
        sep="\n",
    )

Result 1:
Score: 73.38676
Question: How do I debug a docker container?
---
Result 2:
Score: 66.688705
Question: How do I copy files from my local machine to docker container?
---
Result 3:
Score: 59.812744
Question: How do I copy files from a different folder into docker container’s working directory?
---


In [145]:
# Layout of one FAQ entry inside the prompt's CONTEXT section: the question
# on the first line, its answer text on the second.
context_template = "\n".join([
    "Q: {question}",
    "A: {text}",
])

In [152]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after "QUESTION:".
        search_results: Iterable of dicts as returned by elastic_search;
            each item's 'document' dict must provide the keys used by
            `context_template` (`question` and `text`).

    Returns:
        The prompt string: instructions, the question, and a CONTEXT section
        with one `context_template` rendering per search result, separated
        by blank lines.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # Use the results passed in by the caller, not the notebook-global
    # `results` left over from an earlier cell.
    context = "\n\n".join(
        context_template.format(**result['document']) for result in search_results
    )

    # The trailing strip() removes the dangling newline when there is no context.
    return prompt_template.format(question=query, context=context).strip()

In [153]:
# Prompt length in characters for the current `query` and `results`.
print(len(build_prompt(query, results)))

1446


In [156]:
# Tokenizer that tiktoken associates with "gpt-4o", the model llm() calls;
# used to count prompt tokens.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [160]:
# Prompt length in tokens under the gpt-4o encoding.
len(encoding.encode(build_prompt(query, results)))

320

In [85]:
def rag(query, verbose=True):
    """Answer `query` with retrieval-augmented generation.

    Retrieves FAQ entries with elastic_search, builds a prompt from them
    with build_prompt, and sends that prompt to llm.

    Args:
        query: The user's question.
        verbose: When True (the default, matching the previous behaviour),
            print the raw search results and the final prompt for
            inspection. Pass False to get the answer without that output.

    Returns:
        Whatever llm returns for the built prompt.
    """
    search_results = elastic_search(query)
    if verbose:
        print(search_results)
    prompt = build_prompt(query, search_results)
    if verbose:
        print(prompt)
    return llm(prompt)

In [86]:
def llm(prompt, model='gpt-4o'):
    """Send `prompt` to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as a single message with role "user".
        model: Name of the model to call. Defaults to 'gpt-4o'.

    Returns:
        The `content` of the first choice's message in the API response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return completion.choices[0].message.content

In [139]:
# search_query = {
#     "size": 500,
#     "query": {
#         "bool": {
#             "filter": {
#                 "term": {
#                     "course": "machine-learning-zoomcamp"
#                 }
#             }
#         }
#     }
# }
# response = es.search(index=index_name, body=search_query)
# for hit in response['hits']['hits']:
#     if 'kubectl' in hit['_source']['text'].lower() or 'kubectl' in hit['_source']['section'].lower():
#         print(f"Question: {hit['_source']['question']}")
#         print(f"Section: {hit['_source']['section']}")
#         print("---")