# Retrieval Augmented Generation with Anthropic + Generated Q/A

In [None]:
%%capture
!pip install -U typing-extensions
!pip install anthropic
!pip install wikipedia

In [None]:
import re

import anthropic
import wikipedia

# Anthropic credentials, model selection and the client used by get_completion.
API_KEY = "fill me"  # anthropic api key; replace this placeholder before running
MODEL_NAME = "claude-2.1"  # model id sent with every request made by get_completion
# Client constructed once at import time with the key above.
client = anthropic.Anthropic(api_key=API_KEY)

def get_completion(prompt: str, temperature=0, system="You are an expert information retrieval system."):
    """Send ``prompt`` as a single user turn and return the text of the reply.

    Args:
        prompt: Text placed in the one user message of the request.
        temperature: Sampling temperature forwarded to the API.
        system: System prompt forwarded to the API.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    user_turn = {"role": "user", "content": prompt}
    request = {
        "model": MODEL_NAME,
        "max_tokens": 1024,
        "temperature": temperature,
        "messages": [user_turn],
        "system": system,
    }
    reply = client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text

def get_wikipedia_search_results(query: str, n_search_results_to_use=1):
    """Search Wikipedia for ``query`` and return the first pages that load.

    Args:
        query: Search string passed to ``wikipedia.search``.
        n_search_results_to_use: Maximum number of pages to return.

    Returns:
        A list of dicts with the keys ``"content"``, ``"title"`` and
        ``"source"`` (the page URL), in search-result order. Each
        ``"content"`` is cut to its first 5000 tokens. The list can be
        shorter than requested (or empty) when the search returns few hits
        or pages fail to load.
    """
    max_content_tokens = 5000

    search_results: list[dict] = []
    for result in wikipedia.search(query):
        if len(search_results) >= n_search_results_to_use:
            break
        try:
            page = wikipedia.page(result)
        except Exception:
            # The Wikipedia API is a little flaky, so pages that fail to load
            # are skipped. Catching Exception (not a bare except) keeps
            # Ctrl-C / SystemExit working during a long fetch loop.
            continue

        search_results.append(
            {
                "content": page.content,
                "title": page.title,
                "source": page.url,
            }
        )

    if not search_results:
        # Nothing to truncate, so do not bother loading the tokenizer.
        return search_results

    # Reuse the module-level client instead of constructing a throwaway
    # Anthropic() instance on every call.
    tokenizer = client.get_tokenizer()
    for search_result in search_results:
        ids = tokenizer.encode(search_result["content"]).ids[:max_content_tokens]
        search_result["content"] = tokenizer.decode(ids)
    return search_results

## Question 1: Answer Questions with Wikipedia
Please complete the function ask_clawd() below so that Claude can answer questions using Wikipedia data.

Focus on the *quality* of your generated answers over their *latency*.

In [None]:
def ask_clawd(query: str):
    """Answer ``query`` with Claude, using Wikipedia articles as context.

    The question is first rewritten by the model into a search query, up to
    20 Wikipedia pages are fetched for that query, and the model is then asked
    to answer the original question from those pages.

    Args:
        query: The user's question.

    Returns:
        The text between the ``<final_answer>`` tags of the model's response,
        or the whole response when the model did not emit those tags.
    """
    new_query = get_completion(f"Consider this prompt by user: {query}.  Rephrase the prompt for lookup in wikipedia, i.e. remove unnecessary words.", temperature=0.3, system='You are a helpful assistant.')
    # The model tends to wrap the rephrased query in double quotes; drop them
    # and surrounding whitespace before searching. Fall back to the original
    # question if nothing is left.
    new_query = new_query.strip().strip('"').strip() or query
    print(f"{query} -> {new_query}")
    search_results = get_wikipedia_search_results(new_query, n_search_results_to_use=20)

    search_results_concat = '\n'.join(['<title>\n%s\n</title>\n<source>\n%s</source>\n<content>%s</content>' % (d['title'], d['source'], d['content']) for d in search_results])
    prompt = f"{search_results_concat}\nYou will follow these steps: 1) Give your thoughts inside <thinking></thinking> xml tags.\n 2) According to the information provided, answer the following question: {query}.  3) Give quotes that validate your answer in <quotes> </quotes> xml tags.  4. Give your final answer in <final_answer> </final_answer> xml tags."
    response = get_completion(prompt)

    match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
    if match is None:
        # No tagged answer in the response: return everything the model said
        # rather than failing with a bare StopIteration.
        return response
    return match.group(1)

### Some Sample Uses

In [None]:
questions = [
    "What's the name of the latest material that was claimed to be a room temperature superconductor?",
    "Which team did the Denver Nuggets beat in the 2023 NBA finals?",
    "Where are the two biggest particle accelerators in the continental United States?",
]

# Print each question as a banner line, then the answer ask_clawd returns for it.
for question in questions:
    banner = f"-------------{question}-------------"
    print(banner)
    answer = ask_clawd(question)
    print(answer)

-------------What's the name of the latest material that was claimed to be a room temperature superconductor?-------------
%s -> %s ("What's the name of the latest material that was claimed to be a room temperature superconductor?", '"room temperature superconductor"')

The latest material claimed to be a room temperature superconductor is called LK-99 or copper-doped lead apatite.

-------------Which team did the Denver Nuggets beat in the 2023 NBA finals?-------------
%s -> %s ('Which team did the Denver Nuggets beat in the 2023 NBA finals?', '"2023 NBA finals Denver Nuggets opponent"')

Fictional 2023 NBA Finals opponent for the Denver Nuggets

-------------Where are the two biggest particle accelerators in the continental United States?-------------
%s -> %s ('Where are the two biggest particle accelerators in the continental United States?', 'The rephrased prompt for lookup in Wikipedia would be: two biggest particle accelerators in continental United States')

The two biggest par

## Make a multiple choice test

Generate a 3-question multiple-choice test about a given subject, where each question has four options, only one of which is correct.

In [None]:
import re
import random
def generate_test_with_clawd(subject: str, n_questions: int = 3):
    """Generate a multiple-choice test about ``subject`` from Wikipedia content.

    Up to three Wikipedia pages are fetched once for the subject. The model is
    then asked ``n_questions`` times for one new four-option question, each
    time being shown the questions generated so far so it can avoid repeats.

    Args:
        subject: Topic of the test; also used as the Wikipedia search query.
        n_questions: Number of questions to generate.

    Returns:
        The generated questions (the text found between ``<question>`` tags in
        each response), joined with newlines.

    Raises:
        ValueError: If a model response contains no ``<question>`` block.
    """
    template = '<question>Example Question\n<answers>\n(A) Example Answer 1\n(B) Example Answer 2\n(C) Example Answer 3\n(D) Example Answer 4\n</answers>\n</question>'
    question_pattern = re.compile(r"<question>(.*?)</question>", re.DOTALL)

    # The search depends only on the subject, so run it once rather than once
    # per question.
    search_results = get_wikipedia_search_results(subject, n_search_results_to_use=3)
    search_results_list = ['<title>\n%s\n</title>\n<source>\n%s</source>\n<content>%s</content>' % (d['title'], d['source'], d['content']) for d in search_results]

    questions = []
    for _ in range(n_questions):
        # Shuffle BEFORE joining so the article order in the prompt actually
        # varies between requests.
        random.shuffle(search_results_list)
        search_results_concat = '\n'.join(search_results_list)
        prompt = f"{search_results_concat}\n\nQuestions so far (used before):\n {questions}.\n End of questions so far.\n\n\n  \nGive a totally new question on the subject {subject}.  Be absolutely sure that the new question is not the same as any of the original questions so far.  Only one answer, choose A, B, C, or D randomly, is the correct answer.  The answers should be brief 1-2 words at most.  Follow this template:\n{template}.  Remember, ensure you make totally new questions compared to those already used before."
        response = get_completion(prompt, temperature=0)

        match = question_pattern.search(response)
        if match is None:
            raise ValueError(
                f"Model response for subject {subject!r} contained no "
                "<question>...</question> block"
            )
        questions.append(match.group(1))

    return '\n'.join(questions)

### Some Sample Uses (feel free to add more)

In [None]:
subjects = [
    "The French Revolution",
    "Covid-19 in 2023",
]

# Print each subject as a banner line, then the test generated for it.
for subject in subjects:
    banner = f"-------------{subject}-------------"
    print(banner)
    generated_test = generate_test_with_clawd(subject)
    print(generated_test)

-------------The French Revolution-------------
What event in July 1830 led to the overthrow of King Charles X?
<answers>  
(A) The July Revolution
(B) The June Rebellion
(C) The Reign of Terror
(D) The Storming of the Bastille
</answers>

What event in July 1830 led King Charles X to abdicate the throne? 
<answers>
(A) The June Rebellion
(B) The July Ordinances
(C) The Reign of Terror
(D) The Storming of the Bastille
</answers>

What event in 1789 led to the formation of the National Assembly in France?
<answers>  
(A) Storming of the Bastille
(B) Reign of Terror
(C) Fête de la Fédération 
(D) July Revolution
</answers>


What event in 1791 caused the royal family to flee Paris?
<answers>  
(A) The Flight to Varennes
(B) The Reign of Terror
(C) The Storming of the Bastille
(D) The July Revolution
</answers>

-------------Covid-19 in 2023-------------

What percent of the US population is estimated to have had COVID-19 by the end of 2022?
<answers>  
(A) 50% 
(B) 60%
(C) 70%  
(D) 80%


In [None]:
# Possible follow-ups:
# - The answers etc. can easily be extracted from the generated <answers> blocks.
# - The target subject can also be rephrased for Wikipedia search if required.