<a href="https://colab.research.google.com/github/seanlee10/gen-ai-playground/blob/main/notebooks/anthropic_prompt_caching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prompt Caching

Prompt caching is a powerful feature that optimizes your API usage by allowing resuming from specific prefixes in your prompts. This approach significantly reduces processing time and costs for repetitive tasks or prompts with consistent elements.

The cache has a 5 minute time to live (TTL). Currently, “ephemeral” is the only supported cache type, which corresponds to this 5-minute lifetime. Find out more at https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching

# Install Dependencies

In [None]:
%pip install -qU yt-dlp anthropic

# Retrieve Content from Notion

In [4]:
import requests
import json
import time
import os
import anthropic
from google.colab import userdata

auth_token = userdata.get('notion')

client = anthropic.Anthropic(
    api_key=userdata.get('anthropic')
)

def fetch_content(page_id):
  # URL of the API endpoint
  url = f"https://api.notion.com/v1/blocks/{page_id}/children?page_size=100"

  # Headers including the authorization
  headers = {
      "Authorization": f"Bearer {auth_token}",
      "Content-Type": "application/json",
      "Notion-Version": "2022-02-22"
  }

  try:
      # Make the GET request
      response = requests.get(url, headers=headers)
      content = ""

      # Check if the request was successful
      if response.status_code == 200:
          # Request was successful
          data = response.json()  # Assuming the response is in JSON format

          for block in data['results']:
            if block['type'] == 'paragraph':
              content += block['paragraph']['rich_text'][0]['text']['content']

          print("Request successful!")
          return content
      else:
          # Request failed
          print(f"Request failed with status code: {response.status_code}")
          print("Response content:", response.text)

  except requests.exceptions.RequestException as e:
      # Handle any exceptions that occurred during the request
      print(f"An error occurred: {e}")
      raise e

# Test Retrival of a Notion Page

In [None]:
# Replace the Page ID with your own
content = fetch_content("17d11dd8536b804f81c0e280dd688f8f")
print(content)

# Define an function for regular API Call

In [None]:
def make_api_call():
  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "text",
                  "text": "<transcript>" + content + "</transcript>",
              },
              {
                  "type": "text",
                  "text": "Who is the interviewee of this interview? Only output the name"
              }
          ]
      }
  ]

  start_time = time.time()
  response = client.messages.create(
      model="claude-3-5-sonnet-latest",
      max_tokens=300,
      messages=messages,
  )
  end_time = time.time()

  return response, end_time - start_time

# Define an function for Cached API Call

In [None]:
def make_cached_api_call():
  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "text",
                  "text": "<transcript>" + content + "</transcript>",
                  "cache_control": {"type": "ephemeral"}
              },
              {
                  "type": "text",
                  "text": "Who is the interviewee of this interview? Only output the name"
              }
          ]
      }
  ]

  start_time = time.time()
  response = client.messages.create(
      model="claude-3-5-sonnet-latest",
      max_tokens=300,
      messages=messages,
      extra_headers={"anthropic-beta":"prompt-caching-2024-07-31"}
  )
  end_time = time.time()

  return response, end_time - start_time


# Compare the difference

In [10]:
response, call_time = make_api_call()

print(f"API call time: {call_time:.2f} seconds")
print(f"API call input tokens: {response.usage.input_tokens}")
print(f"API call output tokens: {response.usage.output_tokens}")

print("\nResult:")
print(response.content)

cached_response, cached_time = make_cached_api_call()

print(f"Cached API call time: {cached_time:.2f} seconds")
print(f"Cached API call input tokens: {cached_response.usage.input_tokens}")
print(f"Cached API call output tokens: {cached_response.usage.output_tokens}")

print("\nResult:")
print(cached_response.content)

API call time: 2.43 seconds
API call input tokens: 18234
API call output tokens: 20

Result:
[TextBlock(text='The interviewee is Jensen Huang, who is the CEO of NVIDIA.', type='text')]
Cached API call time: 1.41 seconds
Cached API call input tokens: 18
Cached API call output tokens: 17

Result:
[TextBlock(text='Jensen Huang, the CEO and co-founder of NVIDIA.', type='text')]
