In [None]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [None]:
!head documents.json

<b>Load the documents</b>

In [1]:
import json

# Read the FAQ dump. Decode it explicitly as UTF-8 instead of relying on the
# platform's default encoding, which is not UTF-8 on every system.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course groups into a single list of FAQ entries, tagging every
# entry with the name of the course it came from.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

<b>Connecting to and Loading Elasticsearch</b>

In [2]:
from elasticsearch import Elasticsearch

# Open a client against the local node; the info() call below is displayed as the
# cell result and doubles as a connectivity check.
es = Elasticsearch(hosts="http://localhost:9200")
es.info()

ObjectApiResponse({'name': '3689fda5931c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'K93Hg-FETy6DsG2obzSrbA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

<b>Create an index for the documents</b>

In [3]:
index_name = "course-questions"

# Single-shard, no-replica index. The free-text fields are analysed ("text"), while
# "course" is a "keyword" so it can be matched exactly in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

# Make the cell re-runnable: creating an index that already exists raises an error.
# NOTE: this drops the existing index together with everything stored in it.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Pass settings/mappings as keyword arguments; the catch-all `body=` is deprecated.
response = es.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [4]:
from tqdm.auto import tqdm

from elasticsearch.helpers import bulk

# Index through the bulk API in batches instead of issuing one HTTP request per
# document. tqdm wraps the iterable, so progress is still reported as the helper
# consumes it. bulk() raises if any document fails to index.
indexed_count, index_errors = bulk(es, tqdm(documents), index=index_name)

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████████████████| 948/948 [00:22<00:00, 41.43it/s]


In [5]:
user_question = "How do I join the course after it has started?"

# Full-text match across the FAQ fields (matches in "question" weigh three times as
# much), restricted to a single course and capped at five hits.
search_query = dict(
    size=5,
    query=dict(
        bool=dict(
            must=dict(
                multi_match=dict(
                    query=user_question,
                    fields=["question^3", "text", "section"],
                    type="best_fields",
                ),
            ),
            filter=dict(
                term=dict(course="data-engineering-zoomcamp"),
            ),
        ),
    ),
)

In [6]:
# Execute the query defined above and print every returned FAQ entry.
response = es.search(index=index_name, body=search_query)

matches = [hit['_source'] for hit in response['hits']['hits']]
for doc in matches:
    print(
        f"Section: {doc['section']}\n"
        f"Question: {doc['question']}\n"
        f"Answer: {doc['text']}\n\n"
    )

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.


Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terra

In [7]:
def retrieve_documents(query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp", es_client=None):
    """Return the FAQ entries that best match ``query``.

    Args:
        query: Free-text question to search for.
        index_name: Elasticsearch index to search.
        max_results: Maximum number of hits to return.
        course: Only entries whose ``course`` field equals this value are
            returned. Pass ``None`` to search across all courses.
        es_client: Client to run the search with. Defaults to the notebook-wide
            ``es`` client, so no new connection pool is opened per call.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned by
        Elasticsearch.
    """
    if es_client is None:
        es_client = es

    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                # A match in the question counts three times as much as the others.
                "fields": ["question^3", "text", "section"],
                "type": "best_fields"
            }
        }
    }
    if course is not None:
        bool_query["filter"] = {"term": {"course": course}}

    response = es_client.search(
        index=index_name,
        size=max_results,
        query={"bool": bool_query},
    )
    return [hit['_source'] for hit in response['hits']['hits']]

In [8]:
# Mark the .envrc in the current directory as trusted for direnv.
# NOTE(review): presumably this is how GROQ_API_KEY is meant to reach the environment;
# confirm, since a `!` command runs in its own subshell and cannot export variables
# into the already-running kernel.
!direnv allow

In [9]:
from dotenv import load_dotenv
import os

In [10]:
import os
api_key = os.getenv('GROQ_API_KEY')
# Report only whether the key is available. Cell outputs are saved with the notebook,
# so printing the key itself would leak the secret to anyone who can read the file.
print("GROQ_API_KEY is set" if api_key else "GROQ_API_KEY is NOT set")

[REDACTED: a live GROQ_API_KEY was printed here; revoke and rotate that key]


In [11]:
import os

from groq import Groq

# Groq client authenticated with the key taken from the environment.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Smoke test: send one user message and print the text of the first choice.
demo_messages = [
    {"role": "user", "content": "Explain the importance of fast language models"},
]
chat_completion = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=demo_messages,
)

print(chat_completion.choices[0].message.content)

Fast language models, also known as accelerated language models or efficient language models, are language models that have been optimized for speed and parallelization, allowing them to process large amounts of text data quickly and efficiently. The importance of fast language models can be summarized as follows:

1. **Speed and Efficiency**: Fast language models enable faster processing of large datasets, which is crucial in many applications, such as:
	* Text classification, sentiment analysis, and topic modeling, where speed and efficiency are critical for real-time decision-making.
	* Information retrieval, where rapid processing of queries is essential for providing timely and relevant results.
2. **Scalability**: Fast language models can handle larger datasets and more complex computations, making them suitable for:
	* Big data analytics, where processing large datasets quickly is essential for extracting valuable insights.
	* Machine learning, where large datasets require fast 

In [14]:
def build_context(documents):
    """Render FAQ entries as one text block for the prompt.

    Each entry becomes a "Section / Question / Answer" stanza followed by a blank
    line; surrounding whitespace of the combined text is stripped.
    """
    stanzas = (
        f"Section: {entry['section']}\nQuestion: {entry['question']}\nAnswer: {entry['text']}\n\n"
        for entry in documents
    )
    return "".join(stanzas).strip()


def build_prompt(user_question, documents):
    """Assemble the LLM prompt: fixed instructions, the question, then the FAQ context.

    The context is produced by ``build_context(documents)``; the finished prompt has
    its leading and trailing whitespace stripped.
    """
    faq_context = build_context(documents)
    instructions = (
        "You're a course teaching assistant.\n"
        "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.\n"
        "Don't use other information outside of the provided CONTEXT.  \n"
    )
    prompt = f"{instructions}\nQUESTION: {user_question}\n\nCONTEXT:\n\n{faq_context}"
    return prompt.strip()

def ask_openai(prompt, model="llama3-8b-8192"):
    """Send ``prompt`` as a single user message and return the reply text.

    Despite the name, the request goes through the notebook's ``client`` object.

    Args:
        prompt: Full prompt text to send.
        model: Identifier of the chat model to use.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,  # honour the argument; it used to be ignored in favour of a hard-coded name
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def qa_bot(user_question):
    """Answer ``user_question``: retrieve FAQ entries, build a prompt from them, ask the model."""
    retrieved = retrieve_documents(user_question)
    return ask_openai(build_prompt(user_question, retrieved))

In [15]:
# End-to-end run: retrieve FAQ entries for this question and let the model answer from them.
qa_bot("how can I run kafka?")

'To run Kafka, you can run the producer/consumer/examples in the terminal using the following command:\n\n`java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n\nNote that you will need to replace `<jar_name>` with the actual name of your JAR file.'

In [16]:
# Same pipeline, this time fed an error message instead of a question.
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'I see you\'re getting an "invalid reference format: repository name must be lowercase" error!\n\nAccording to the context, I\'d say the problem is likely with the Docker volume mounting. Make sure you\'re using lowercase letters for the repository name. Try renaming your folder or replacing the "-v" part with one of the options provided in the text.\n\nFor example, you could try this:\n\n`-v /c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n\nOr this:\n\n`-v //c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n\nOr even this:\n\n`-v "//c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data"`\n\nRemember to check the quotes\' position and placement!'

In [17]:
# Same pipeline with a free-form problem description as input.
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

"It seems like you're experiencing an issue with connecting to Postgres port 5432, and the password is not working. \n\nIt could be that the port 5432 is taken by another PgSQL server running on your machine. You can try using a different port instead of the default 5432. \n\nAlso, if you have a service in Windows running Postgres, stopping that service should resolve the issue."