In [2]:
!pip install qdrant-client openai

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting qdrant-client
  Downloading qdrant_client-1.12.1-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.68.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant-client)
  Downloading grpcio-1.68.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)
  Downloading hyperfra

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct


In [4]:
def get_os_env(name, default=None):
    try:
        return os.environ[name] 
    except:
        return default 


## Initiate Qdrant Client 
You can use read only API on production app. Here we are using write-read access API key so we can create a sample collection. 

You can find the apik key and host url from Qdrant Screen on TIR. 


In [22]:
# get these details from TIR >> Vector Database >> Qudrant 

qdrant_api_key = get_os_env("QDRANT_API_KEY", "") 
qdrant_host_url = get_os_env("QDRANT_HOST_URL", "")


In [23]:
qdrant_client = QdrantClient(host=qdrant_host_url, port=6333, api_key=qdrant_api_key)


In [33]:
#qdrant_client.delete_collection('kb')

True

In [34]:
collection_name = "kb"
vectors_config=VectorParams(size=1024, distance=Distance.DOT)
qdrant_client.create_collection(
    collection_name=collection_name, vectors_config=vectors_config, shard_number=6, replication_factor=3
)

True

## Initiate Client for Vector Embedding
You can use genAI option in TIR to use embedding api.  Get the details like API URL, Key and Model name from TIR. 


In [10]:
from openai import OpenAI

In [12]:
embedding_api_url=get_os_env("EMBEDDING_API_URL", <EMBEDDING_API_URL>)
embedding_api_key=get_os_env("EMBEDDING_API_KEY", <TIR_AUTH_TOKEN>)
embedding_model=get_os_env("EMBEDDING_MODEL", 'bge-large-en-v1_5')


In [13]:
embedding_client =  OpenAI(
  base_url = embedding_api_url, 
  api_key = embedding_api_key
)

Lets add sample data

In [27]:
text_embedding = embedding_client.embeddings.create(input="The key to survival is to stay alert", model=embedding_model, encoding_format="float").data[0].embedding

In [28]:
text_embedding

[0.03436364,
 0.063594855,
 -0.0729748,
 0.03979103,
 -0.018966367,
 -0.016591886,
 -0.025263911,
 0.0160167,
 0.034127664,
 0.025131175,
 0.047283188,
 -0.0051324232,
 -0.0101616075,
 0.01895162,
 -0.011061257,
 -0.041324854,
 -0.03327226,
 -0.0013089154,
 0.0077576283,
 0.01370121,
 0.010758916,
 0.061884046,
 -0.03725431,
 -0.03194491,
 -0.04082341,
 0.033065785,
 -0.034216154,
 -0.012875303,
 0.061530083,
 0.09958081,
 0.035278033,
 -0.03610394,
 0.020352712,
 -0.050940774,
 -0.012189505,
 -0.008959618,
 0.041088883,
 -0.07279782,
 0.011452088,
 -0.03436364,
 0.0028740831,
 -0.01780125,
 0.05436239,
 0.0014139974,
 -0.056014206,
 -0.031738434,
 -0.008030472,
 -0.006256984,
 0.042947173,
 -0.029260712,
 -0.003956243,
 0.035749983,
 0.062120017,
 0.0028482736,
 0.00067796285,
 0.023154898,
 -0.01691635,
 0.018523918,
 -0.009048108,
 0.0043102033,
 0.034127664,
 -0.032977294,
 0.043006167,
 -0.02437901,
 -0.02091315,
 0.035749983,
 0.020662427,
 -0.014593485,
 -0.018376434,
 0.0116143

In [35]:
points = [
    PointStruct(id=1, vector=text_embedding, payload={"text": "The key to survival is to stay alert"})
]
qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Search and prepare context

In [40]:
query = "what is key to survival"
query_embedded = embedding_client.embeddings.create(input=query, model=embedding_model, encoding_format="float").data[0].embedding
search_results = qdrant_client.search(collection_name=collection_name, query_vector=query_embedded, limit=3)

In [41]:
search_results

[ScoredPoint(id=1, version=0, score=0.7728851, payload={'text': 'The key to survival is to stay alert'}, vector=None, shard_key=None, order_value=None)]

In [46]:
context = []
for ctx in search_results:
    context.append(search_results[0].payload['text'])
p


In [48]:
final_context = ' \n '.join(context[0:])

In [49]:
final_context

'The key to survival is to stay alert'

## Option 1: Call gen AI
Here we are using open AI client but in case you are using vllm, this can be done locally. 
You can use VLLM python library - https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py

In [None]:

# Get API URL and Key from Gen AI 

llm_api_url=get_os_env("LLM_API_URL", <GET_TIR_ENDPOINT>)
llm_api_key=get_os_env("LLM_API_KEY", <GET_AUTH_TOKEN_FROM_TIR>)

llm_client = OpenAI(
  base_url = llm_api_url, 
  api_key = llm_api_key
)

def generate_llm(query, context=None):
    prompt=query if context is None else f'{context}\n\n{query}'

    completion = llm_client.chat.completions.create(
    model='llama3_2_3b_instruct',
    messages=[{"role":"user","content":prompt}],
    temperature=0.5,
    max_tokens=1024,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=1,
    stream=True
  )
    
    for chunk in completion:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="")
            return chunk.choices[0].delta.content

In [None]:
query="what is the key to survival?"
generate_llm(query, prepare_context(query))

## Option 2: Use local VLLM 

In [52]:
from vllm import LLM, SamplingParams
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="meta-llama/Meta-Llama-3.1-70B-Instruct)

def generate_vllm(query, context=None):
    prompt=query if context is None else f'{context}\n\n{query}'

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate([prompt], sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        return generated_text

IndentationError: unexpected indent (2068080296.py, line 3)

In [None]:
query="what is the key to survival?"
generate_vllm(query, prepare_context(query))