In [5]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

zsh:1: command not found: wget


In [6]:
import minsearch

In [9]:
import json

In [11]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [12]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [13]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [17]:
q = 'the course has already started, can I still enroll?'

In [13]:
index.fit(documents)

<minsearch.Index at 0x7fdafaa00cd0>

In [14]:
from openai import OpenAI

In [15]:
from dotenv import load_dotenv
import os
from mistralai import Mistral

# Load variables from .env file
load_dotenv()

# Read the key from the environment
api_key = os.getenv("OPENAI_API_KEY")


mistral_key = os.getenv("MISTRAL_API_KEY")
model = "mistral-large-latest"

client_mistral = Mistral(api_key=mistral_key)


# Initialize the client
client = OpenAI(api_key=api_key)
#client = OpenAI()

In [18]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"Whether you can still enroll in a course that has already started typically depends on the policies of the institution or platform offering the course. Here are a few steps you can take:\n\n1. **Check the Institution's Policies**: Look for information on the course's webpage or the institution's website regarding late enrollment.\n\n2. **Contact the Instructor or Administrator**: Reach out directly to the course instructor or the administrative office to inquire if late enrollment is possible and if there are any additional requirements.\n\n3. **Consider the Implications**: If late enrollment is allowed, consider how missing the first few sessions might impact your ability to catch up with the course material.\n\n4. **Look for Alternatives**: If enrollment is not possible, ask if the course will be offered again in the future or if there are alternative courses or resources available.\n\nMake sure to act quickly, as continuing to delay might further complicate your ability to catch up

In [17]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [36]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        # context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        context = context + f"question: {doc['question']}\nanswer: {doc['text']}\n\n"

    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [21]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [20]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [21]:
rag(query)

'To run Kafka with Java, navigate to the project directory and use the following command in the terminal:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```'

In [22]:
rag('the course has already started, can I still enroll?')

'Yes, you can still enroll even after the course has started. You are eligible to submit the homework, but please be mindful of the deadlines for turning in the final projects.'

In [23]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [24]:
from elasticsearch import Elasticsearch

In [2]:
es_client = Elasticsearch('http://localhost:9200') 

In [3]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [25]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:04<00:00, 215.38it/s]


In [28]:
query = 'I just disovered the course. Can I still join it?'

In [29]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [30]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
rag(query)

"Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homework assignments even without registration. However, be mindful of the deadlines for the final projects, so it's best not to leave everything until the last minute."

In [45]:
def run_mistral(user_message, model="mistral-large-latest"):
    messages = [
        {
            "role": "user", "content": user_message
        }
    ]
    chat_response = client_mistral.chat.complete(
        model=model,
        messages=messages
    )
    return (chat_response.choices[0].message.content)

In [33]:
def rag_mistral(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = run_mistral(prompt)
    return answer

In [46]:
rag_mistral(q)

'Yes, you can still enroll even if the course has already started.'

In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)    

In [2]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 948/948 [00:03<00:00, 238.42it/s]


In [23]:
def elastic_search_new(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    result_scores = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        result_scores.append(hit['_score'])
    
    return result_docs,result_scores

In [44]:
def rag(query):
    search_results, result_score = elastic_search_new(query)
    prompt = build_prompt(query, search_results)
    #print(len(prompt))
    answer = llm(prompt)
    return answer, result_score,prompt

In [33]:
rag('How do I execute a command in a running docker container?')


1637


('To execute a command in a running Docker container, you can use the following steps:\n\n1. Find the container ID by using the command:\n   ```\n   docker ps\n   ```\n\n2. Execute the command in the specific container by using:\n   ```\n   docker exec -it <container-id> <command>\n   ```\n\nFor example, to start a bash session in the running container, you would use:\n```\ndocker exec -it <container-id> bash\n```',
 [84.050095, 51.04628, 49.938507])

In [35]:
rag('How do I debug a docker container?')

1539


('To debug a Docker container, launch the container image in interactive mode and override the entrypoint to start a bash command using the following command:\n\n```bash\ndocker run -it --entrypoint bash <image>\n```\n\nIf the container is already running, you can execute a command in the specific container by following these steps:\n\n1. List the running containers to find the container ID:\n   ```bash\n   docker ps\n   ```\n2. Execute a bash shell inside the running container using the container ID:\n   ```bash\n   docker exec -it <container-id> bash\n   ```',
 [103.06507, 51.04628, 43.277237])

In [41]:
rag('How do I copy files from a different folder into docker container’s working directory?')

1528


('You can copy files from your local machine into a Docker container using the `docker cp` command. The basic syntax for copying a file or directory into a running Docker container is as follows:\n\n```bash\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```',
 [183.81017, 79.590385, 55.26839])

In [38]:
rag('How do Lambda container images work?')

1237


('Lambda container images allow you to package and deploy your AWS Lambda functions using container images. This process is detailed in the AWS documentation, which explains how Lambda functions are initialized and how to create and manage these container images. For an in-depth understanding, you can refer to the following resources: \n- [Creating Lambda container images](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html)\n- [Lambda runtimes API](https://docs.aws.amazon.com/lambda/latest/dg/runtimes-api.html)',
 [114.34481, 44.687473, 41.297256])

In [31]:
rag('How can I annotate a graph?')

("To annotate a graph, you can use Matplotlib's annotation method by providing an X,Y point and adding an arrow and text to the graph. For example, the following code shows how to annotate a point with an arrow pointing to the optimal threshold and displaying related text:\n\n```python\nplt.annotate(f'Optimal Threshold: {optimal_threshold:.2f}\\nOptimal F1 Score: {optimal_f1_score:.2f}',\n             xy=(optimal_threshold, optimal_f1_score),\n             xytext=(0.3, 0.5),\n             textcoords='axes fraction',\n             arrowprops=dict(facecolor='black', shrink=0.05))\n```\n\nThis method helps highlight important points on your graph for better visualization and understanding.",
 [105.57553, 33.449196, 32.276794])

In [46]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

res = rag('How do Lambda container images work?') 

tokens = encoding.encode(res[0])
print(len(tokens)) 

tokens_prompt = encoding.encode(res[2])
print(len(tokens_prompt))

encoding.decode_single_token_bytes(63842)


97
293


b"You're"