# Setup

In [1]:
import json
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from openai import OpenAI
import os,sys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
#create index schema

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

#initialise indice
# This Elasticsearch Python client method creates an index with the specified settings
index_name = "course-questions2"
es = Elasticsearch("http://localhost:9200")
#response = es.indices.insert(index=index_name, body=index_settings)

# load json input dictionary into the indice
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:30<00:00, 30.98it/s]


In [7]:
query = "How do I execute a command in a running docker container?"

In [26]:
# Define a search query
search_query = {
    "size": 3,
    "query": {
        "multi_match": {
            "query": query,
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}

In [24]:
response = es.search(index=index_name, body=search_query)
print(response['hits']['hits'][0]['_score'])

84.17781


In [46]:
query = "How do I execute a command in a running docker container?"
# Define a search query
search_query = {
    "size": 6,
    "query": {
        "multi_match": {
            "query": query,
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}
response = es.search(index="course-questions", body=search_query)
print(response['hits']['hits'][0]['_source']['question'])

How do I debug a docker container?


In [47]:
# Check if there are any hits
if response['hits']['hits']:
    # Iterate through the top 3 hits and print the 'question' field
    for hit in response['hits']['hits']:
        print(hit['_source']['question'])
else:
    print("No results found.")

How do I debug a docker container?
How do I debug a docker container?
PGCLI - running in a Docker container
PGCLI - running in a Docker container
Running multiple services in a Docker container
Running multiple services in a Docker container


In [52]:
context = ""
documents = [hit["_source"] for hit in response['hits']['hits']]
context_docs = documents

for doc in context_docs:
    doc_str = f"Question: {doc['question']}\nAnswer: {doc['text']}\n\n"
    context += doc_str

In [63]:
context_template = f"""
Q: {query}
A: {context}
""".strip()

In [65]:
len(context_template)

4636

In [66]:
4636/2

2318.0

In [7]:
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key = api_key)

# Chat bot

In [8]:
def retrieve_documents(query, index_name="course-questions", max_results=5):
    es = Elasticsearch("http://localhost:9200")
    
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents
    
def build_context(documents):
    context = ""

    for doc in documents:
        doc_str = f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
        context += doc_str
    
    context = context.strip()
    return context


def build_prompt(user_question, documents):
    context = build_context(documents)
    return f"""
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

def ask_openai(prompt, model="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return answer

def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer

In [9]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

qa_bot("I can't connect to postgres port 5432, my password doesn't work")

qa_bot("how can I run kafka?")

'To run Kafka, you can find the schema registry URL in Confluent Cloud by navigating to Environment, default (or your named environment), then clicking on the right navigation bar, then selecting "Stream Governance API" to find the URL under "Endpoint." Don\'t forget to create credentials from the Credentials section below it.'