<a href="https://colab.research.google.com/github/salinator-hub/Dspy-/blob/main/Using_Assert_and_suggest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dspy-ai vllm

Collecting dspy-ai
  Downloading dspy_ai-2.1.9-py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.3.0-cp310-cp310-manylinux1_x86_64.whl (38.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting backoff~=2.2.1 (from dspy-ai)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
  Downloading openai-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas~=2.1.1 (from dspy-ai)
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[?25

In [2]:
# Option A: run the vLLM OpenAI-compatible API server in the foreground (the cell blocks while it runs).
# !python -m vllm.entrypoints.openai.api_server --model TheBloke/dolphin-2.6-mistral-7B-dpo-laser-AWQ --quantization awq

# Option B (used here): run the same server in the background so the notebook stays usable.
!nohup python -m vllm.entrypoints.openai.api_server --model TheBloke/dolphin-2.6-mistral-7B-dpo-laser-AWQ --quantization awq > server.log 2>&1 &
# `> server.log 2>&1` redirects both stdout and stderr to the file `server.log`.
# The model is a quantized checkpoint prepared with AWQ, hence `--quantization awq`.

In [3]:
# Re-run this cell to monitor the status of the server via the last lines of its log.
# The server can take a few minutes to start.
# Once the server has started, you will see log lines such as this:
# INFO 02-10 07:16:43 llm_engine.py:877] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%
!tail server.log

In [5]:
# Once the server is up and running, this should work
!curl http://localhost:8000/v1/models

curl: (7) Failed to connect to localhost port 8000 after 0 ms: Connection refused


In [6]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune

In [7]:
# Connection settings for the local vLLM server started earlier in this notebook.
# The model name must match the `--model` argument passed to the server.
VLLM_MODEL = "TheBloke/dolphin-2.6-mistral-7B-dpo-laser-AWQ"
VLLM_URL = "http://localhost"
VLLM_PORT = 8000
# Hosted ColBERTv2 endpoint used as the retrieval model
# (presumably an index of Wikipedia 2017 abstracts, judging by the path — confirm).
COLBERT_URL = 'http://20.102.90.50:2017/wiki17_abstracts'

lm = dspy.HFClientVLLM(model=VLLM_MODEL, port=VLLM_PORT, url=VLLM_URL)
colbertv2 = dspy.ColBERTv2(url=COLBERT_URL)

# # NOTE: After you finish this notebook, you can use GPT-3.5 like this if you like.
# turbo = dspy.OpenAI(model='gpt-3.5-turbo-instruct')
# # In that case, make sure to configure lm=turbo below if you choose to do that.

# Register the default language model and retrieval model in a single call.
dspy.settings.configure(lm=lm, rm=colbertv2)

In [9]:
# Build a predictor from the signature string 'question -> answer'.
predict = dspy.Predict('question -> answer')

# Call it with the signature's input field as a keyword argument; the returned
# Prediction is the cell's displayed output.
predict(question="What is the capital of  ancient India?")

Prediction(
    answer='The capital of ancient India was Pataliputra, also known as Patna, which was the capital of the Magadha Empire.'
)

In [15]:
question = "What's something great about the ColBERT retrieval model?"

# 1) Declare with a signature, and pass some config (here n=5 as a keyword argument).
# NOTE(review): the variable is named `classify`, but the signature is question -> answer.
classify = dspy.ChainOfThought('question -> answer', n=5)

# 2) Call with the input argument named in the signature.
response = classify(question=question)

# 3) Access the outputs: the `answer` field of the returned completions.
# NOTE(review): n=5 was passed, but the recorded output of this cell contains a single
# answer — confirm whether the vLLM client honours `n`.
response.completions.answer

["One great thing about the ColBERT retrieval model is that it's specifically designed to understand both the context of the query and the context of the passages that are returned in response to the query. Additionally, it uses a novel query-passage similarity metric which is based on the cosine similarity between the query and the passages, allowing it to"]

In [26]:
from typing import List
import numpy as np
import dsp

class KNN:
    """Nearest-neighbour lookup over a training set of examples.

    Each training example is rendered as a ``"key: value | key: value"`` string
    built from its input keys, embedded with ``dsp.SentenceTransformersVectorizer``,
    and scored against a query by dot product.
    """

    def __init__(self, k: int, trainset: List[dsp.Example]):
        """Embed ``trainset`` once so later lookups only need to embed the query.

        Args:
            k: Number of nearest training examples to return per call.
            trainset: Examples to search. Only fields listed in each example's
                ``_input_keys`` contribute to its text representation.
        """
        self.k = k
        self.trainset = trainset
        self.vectorizer = dsp.SentenceTransformersVectorizer()
        trainset_casted_to_vectorize = [
            " | ".join(
                f"{key}: {value}"
                for key, value in example.items()
                if key in example._input_keys
            )
            for example in self.trainset
        ]
        self.trainset_vectors = self.vectorizer(trainset_casted_to_vectorize).astype(np.float32)

    def __call__(self, **kwargs) -> List[dsp.Example]:
        """Return the ``k`` training examples most similar to the given fields.

        Args:
            **kwargs: Field name/value pairs describing the query; they are
                rendered with the same ``"key: value | ..."`` format as the trainset.

        Returns:
            Up to ``k`` training examples ordered from highest to lowest score.
            An empty list is returned when ``k`` is zero or negative.
        """
        with dsp.settings.context(vectorizer=self.vectorizer):
            query_text = " | ".join(f"{key}: {val}" for key, val in kwargs.items())
            input_example_vector = self.vectorizer([query_text])
            # reshape(-1) rather than squeeze(): the scores stay 1-D even when the
            # trainset holds a single example.
            scores = np.dot(self.trainset_vectors, input_example_vector.T).reshape(-1)
            # Reverse first, then take the head: unlike `[-k:]`, this yields an empty
            # selection for k == 0 instead of the whole trainset.
            top_k = max(self.k, 0)
            nearest_samples_idxs = scores.argsort()[::-1][:top_k]
            return [self.trainset[cur_idx] for cur_idx in nearest_samples_idxs]

## DSPy program with LM Assertions for a multi-hop question-answering task with a retriever

We introduce two soft assertions (suggestions):

(1) query to retriever should be less than 100 characters;

(2) query to retriever should differ from previous queries.

For instance, if the second suggestion fails, DSPy constructs a new prompt to retry the `generate_query` module with additional fields
that highlight the previously generated query and a user-defined error message, helping the LM refine its generation.


In [24]:
class MultiHopQAWithAssertions(dspy.Module):
  """Multi-hop question answering with two soft assertions on each search query.

  At every hop a search query is generated from the question and the context
  gathered so far, checked with ``dspy.Suggest`` (shorter than 100 characters,
  distinct from earlier queries), and used to retrieve more passages. The final
  answer is generated from the accumulated context.

  ``is_query_distinct`` is not defined in this cell; it is expected to be
  available at module level before ``forward`` is called.
  NOTE(review): a failing ``dspy.Suggest`` is presumably only retried when the
  module is wrapped with DSPy's assertion handler — confirm against the DSPy
  assertions documentation.
  """

  def __init__(self, passages_per_hop=3, max_hops=2):
    """Create the sub-modules that ``forward`` relies on.

    Args:
      passages_per_hop: Number of passages to retrieve for each query.
      max_hops: Number of query/retrieve rounds performed per question.
    """
    super().__init__()
    self.generate_query = dspy.ChainOfThought("context, question -> query")
    self.retrieve = dspy.Retrieve(k=passages_per_hop)
    self.generate_answer = dspy.ChainOfThought("context, question -> answer")
    self.max_hops = max_hops

  def forward(self, question):
    """Answer ``question`` by iteratively querying the retriever.

    Args:
      question: The question to answer.

    Returns:
      The result of ``self.generate_answer`` called with the collected context
      and the question.
    """
    # `queries` starts with the question itself so a generated query that merely
    # repeats the question is also compared against it.
    context, queries = [], [question]
    for _ in range(self.max_hops):
      query = self.generate_query(context=context, question=question).query

      dspy.Suggest(len(query) < 100,
          "Query should be less than 100 characters")

      dspy.Suggest(is_query_distinct(query, queries),
          f"Query should be distinct from {queries}")
      context += self.retrieve(query).passages
      queries.append(query)
    return self.generate_answer(context=context, question=question)
