In [1]:
import os
# os.environ["PINECONE_API_KEY"] = "..." # OR have this in your environment variables
import getpass

os.environ["PINECONE_API_KEY"] = getpass.getpass("Pinecone API Key:")
os.environ["OPENAI_API_KEY"] = getpass.getpass("Anyscale Endpoint API Key:")

from llama_index.vector_stores import PineconeVectorStore
from llama_index.vector_stores.types import VectorStoreQuery, VectorStoreQueryMode

vector_store = PineconeVectorStore(
    index_name="quickstart",
    environment="us-west1-gcp-free",
)



  from tqdm.autonotebook import tqdm


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [2]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
model_name="sentence-transformers/all-mpnet-base-v2", 

def retrieve(query, limit=3750):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    query_embedding = embeddings.embed_query(query)

    vectorStoreQuery = VectorStoreQuery(
        query_embedding=query_embedding,
        similarity_top_k=3,  
        mode=VectorStoreQueryMode.DEFAULT,
        query_str=None,  
        alpha=None,
        filters=None 
    )

    res = vector_store.query(vectorStoreQuery)
    contexts = []

    for node in res.nodes[:]:
        text = node.text
        context = {'text': text}
        contexts.append(context)

    prompt_start = "Answer the question based on the context below.\n\nContext:\n"
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"
    prompt = prompt_start
    for i in range(len(contexts)):
        text_to_add = "\n\n---\n\n".join([context['text'] for context in contexts[:i+1]])
        if len(prompt + text_to_add + prompt_end) >= limit:
            break
        prompt += text_to_add

    prompt += prompt_end
    return prompt

In [35]:

question ="""'Does 'Ray Serve' support streaming?"""
prompt = retrieve(question)


In [36]:
prompt

"Answer the question based on the context below.\n\nContext:\nAdvanced Guides#\n\nIf you’re new to Ray Serve, we recommend starting with the Ray Serve Quickstart.\n\nUse these advanced guides for more options and configurations:\n\nPass Arguments to Applications\n\nPerformance Tuning\n\nDynamic Request Batching\n\nIn-Place Updates for Serve\n\nDevelopment Workflow\n\nRay Serve Dashboard\n\nExperimental Java API\n\nMigrate from 1.x to 2.x\n\nExperimental gRPC SupportAdvanced Guides#\n\nIf you’re new to Ray Serve, we recommend starting with the Ray Serve Quickstart.\n\nUse these advanced guides for more options and configurations:\n\nPass Arguments to Applications\n\nPerformance Tuning\n\nDynamic Request Batching\n\nIn-Place Updates for Serve\n\nDevelopment Workflow\n\nRay Serve Dashboard\n\nExperimental Java API\n\nMigrate from 1.x to 2.x\n\nExperimental gRPC Support\n\n---\n\nAdvanced Guides#\n\nIf you’re new to Ray Serve, we recommend starting with the Ray Serve Quickstart.\n\nUse the

In [45]:
#Example using Python
import os
import requests

s = requests.Session()

#api_base = os.getenv("OPENAI_API_BASE")
api_base = "https://api.endpoints.anyscale.com/v1"
token = os.getenv("OPENAI_API_KEY")
url = f"{api_base}/chat/completions"
body = {
  "model": "meta-llama/Llama-2-70b-chat-hf",
  "messages": [{"role": "system", "content": prompt}, {"role": "user", "content": question}],
  "temperature": 0.7
}

with s.post(url, headers={"Authorization": f"Bearer {token}"}, json=body) as resp:
    print(resp.json())


{'id': 'meta-llama/Llama-2-70b-chat-hf-86e450c1-53fd-4b1a-acc3-b83046823873', 'object': 'text_completion', 'created': 1692031409, 'model': 'meta-llama/Llama-2-70b-chat-hf', 'choices': [{'message': {'role': 'assistant', 'content': 'Yes, Ray Serve supports streaming. One of the advanced guides listed is "Experimental gRPC Support," which suggests that Ray Serve may have the ability to stream data using gRPC. Additionally, the guide on "Dynamic Request Batching" may also be relevant for optimizing streaming requests.'}, 'index': 0, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 776, 'completion_tokens': 65, 'total_tokens': 841}}


In [37]:
#Examples using OpenAI SDK
import openai

# Make sure you have set the correct env vars
# You can also set the openai environment manually as shown
openai.api_base = "https://api.endpoints.anyscale.com/v1"


# Note: not all arguments are currently supported and will be ignored by the backend.
chat_completion = openai.ChatCompletion.create(
    model="meta-llama/Llama-2-70b-chat-hf",
    messages=[{"role": "system", "content": prompt}, {"role": "user", "content": question}],
    temperature=0.01
)
print(chat_completion)

{
  "id": "meta-llama/Llama-2-70b-chat-hf-15d6298f-a159-4e7f-9861-1f7b37e91111",
  "object": "text_completion",
  "created": 1692030385,
  "model": "meta-llama/Llama-2-70b-chat-hf",
  "choices": [
    {
      "message": {
        "role": "assistant",
        "content": "Yes, Ray Serve supports streaming. You can use the Experimental gRPC Support advanced guide to configure streaming for your applications."
      },
      "index": 0,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 776,
    "completion_tokens": 28,
    "total_tokens": 804
  }
}
