<a href="https://colab.research.google.com/github/ship9599/AI_TP/blob/main/llm_pricing_cost_quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Evaluation Dataset

In [None]:
import getpass
import os
import time

# Evaluation fixture: three statement types ("income_statements", "balance_sheets",
# "cash_flow_statements"), each a list of three fiscal-year records (FY2023, FY2022, FY2021).
# The whole dict is interpolated into the benchmark prompt; the tasks and the judge's
# ground-truth values only use the FY2023 records.
# NOTE(review): amounts are presumably in USD — confirm against the data source.
# NOTE(review): FY2021 "net_income" in the cash flow statement (-674339000) differs from
# FY2021 "net_income_loss" in the income statement (-352000000); FY2023 and FY2022 agree.
# NOTE(review): the key "shared_based_compensation" is kept as spelled; no task references it.
financial_statements = {
  "income_statements": [
    {
      "period": "FY2023",
      "revenue": 9917000000,
      "cost_of_revenue": 1703000000,
      "general_and_administrative_expense": 2025000000,
      "research_and_development_expense": 1722000000,
      "selling_and_marketing_expense": 1763000000,
      "operating_income_loss": 1518000000,
      "net_income_loss": 4792000000
    },
    {
      "period": "FY2022",
      "revenue": 8399000000,
      "cost_of_revenue": 1499000000,
      "general_and_administrative_expense": 950000000,
      "research_and_development_expense": 1502000000,
      "selling_and_marketing_expense": 1516000000,
      "operating_income_loss": 1802000000,
      "net_income_loss": 1893000000
    },
    {
      "period": "FY2021",
      "revenue": 5992000000,
      "cost_of_revenue": 1156000000,
      "general_and_administrative_expense": 836000000,
      "research_and_development_expense": 1425000000,
      "selling_and_marketing_expense": 1186000000,
      "operating_income_loss": 429000000,
      "net_income_loss": -352000000
    }
  ],
  "balance_sheets": [
    {
      "period": "FY2023",
      "cash_and_cash_equivalents": 7378000000,
      "short_term_investments": 2244000000,
      "total_current_assets": 14861000000,
      "goodwill": 650000000,
      "total_assets": 16038000000,
      "current_accrued_liabilities": 2654000000,
      "current_accounts_payable": 137000000,
      "long_term_debt": 1987000000,
      "operating_lease_liabilities": 295000000,
      "other_non_current_liabilities": 218000000,
      "total_liabilities": 10478000000,
      "stockholders_equity": 5560000000
    },
    {
      "period": "FY2022",
      "cash_and_cash_equivalents": 6067000000,
      "short_term_investments": 2255000000,
      "total_current_assets": 12386000000,
      "goodwill": 656000000,
      "total_assets": 13708000000,
      "current_accrued_liabilities": 6359000000,
      "current_accounts_payable": 118000000,
      "long_term_debt": 1983000000,
      "operating_lease_liabilities": 372000000,
      "other_non_current_liabilities": 219000000,
      "total_liabilities": 8933000000,
      "stockholders_equity": 4775000000
    },
    {
      "period": "FY2021",
      "cash_and_cash_equivalents": 2013547000,
      "short_term_investments": 910700000,
      "total_current_assets": 8916386000,
      "goodwill": 652088000,
      "total_assets": 10491499000,
      "current_accrued_liabilities": 5139779000,
      "current_accounts_payable": 79898000,
      "long_term_debt": 1815562000,
      "operating_lease_liabilities": 56586000,
      "other_non_current_liabilities": 203470000,
      "total_liabilities": 7589716000,
      "stockholders_equity": 2901783000
    }
  ],
  "cash_flow_statements": [
    {
      "period": "FY2023",
      "net_income": 4792000000,
      "depreciation_and_amortization": 138000000,
      "shared_based_compensation": 1120000000,
      "net_cash_from_operating_activities": 3884000000,
      "net_cash_from_investing_activities": -1042000000,
      "plant_property_and_equipment": 160000000,
      "net_cash_from_financing_activities": -2430000000
    },
    {
      "period": "FY2022",
      "net_income": 1893000000,
      "depreciation_and_amortization": 81000000,
      "shared_based_compensation": 899000000,
      "net_cash_from_operating_activities": 3430000000,
      "net_cash_from_investing_activities": -28000000,
      "plant_property_and_equipment": 25000000,
      "net_cash_from_financing_activities": -689000000
    },
    {
      "period": "FY2021",
      "net_income": -674339000,
      "depreciation_and_amortization": 125876000,
      "shared_based_compensation": 3001948000,
      "net_cash_from_operating_activities": 2189694000,
      "net_cash_from_investing_activities": -1351955000,
      "plant_property_and_equipment": 125452000,
      "net_cash_from_financing_activities": 1431159000
    }
  ]
}

# Implement LLM-as-Judge using Opus

In [None]:
!pip install -U -q instructor xmltodict anthropic

In [None]:
# Read the Anthropic key without echoing it and store it in the environment.
# The prompt names the provider because getpass's default prompt is just "Password: ".
os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Anthropic API key: ")

In [None]:
import anthropic
import instructor
from pydantic import BaseModel
from pydantic import Field
from enum import Enum
from typing import Optional, Union, List

# Wrap the Anthropic Messages `create` call with instructor (tool-calling mode) so that
# callers can pass `response_model=` and receive a parsed pydantic object.
# NOTE(review): Anthropic() is built without arguments here; presumably it picks up
# ANTHROPIC_API_KEY from the environment — confirm.
anthropic_client = instructor.patch(
    create=anthropic.Anthropic().messages.create,
    mode=instructor.Mode.ANTHROPIC_TOOLS
)

# Structured verdict requested from the judge model (passed as `response_model`).
# NOTE(review): documented with comments rather than a docstring on purpose; presumably a
# class docstring would be embedded in the schema sent to the model — confirm before adding one.
class EvaluationResult(BaseModel):
    # Judge's verdict per the judge prompt: True = values match the ground truth,
    # False = they do not. The annotation also admits None.
    is_correct: Optional[bool]

def evaluate_result(task: str, extracted_value: str) -> EvaluationResult:
    """Ask the Opus judge whether a model's answer matches the hard-coded ground truth.

    Args:
        task: The task description that was given to the model under test.
        extracted_value: The answer produced by the model under test.

    Returns:
        The judge's verdict, parsed into an ``EvaluationResult``.
    """
    # System prompt for the judge; it embeds the expected FY2023 values for all nine tasks.
    judge_instructions = """
    You are an expert at evaluating whether an extracted value is correct or incorrect for a given task.
    The task that you are evaluating is JSON key-value extraction and financial calculations.
    The JSON contains financial statements, including income statements, balance sheets, and cash flow statements.
    Your job is to determine (True or False) if the extracted value or calculated result is correct for a given task.
    For example, if the task is to extract net_income from the JSON and the correct value is 10000, but the extracted value is 1000, then your output is False. However, if the extracted value is 10000, then your output is True.
    Note: The extracted values and calculated results may include both text and numbers/metrics. Your job is to focus only on the numbers/metrics and ensure that they match the ground truth values.
    Here are the ground truth values for the requested tasks:
    1. Revenue (FY2023): 9917000000
    2. Net Income Loss (FY2023): 4792000000
    3. Net Profit Margin (FY2023): 48.32%
    4. Total Assets (FY2023): 16038000000
    5. Total Liabilities (FY2023): 10478000000
    6. Debt-to-Assets Ratio (FY2023): 0.65
    7. Net Income (FY2023): 4792000000
    8. Plant, Property, and Equipment (FY2023): 160000000
    9. Free Cash Flow (FY2023): 4632000000
    """

    # Single user turn carrying both the task and the candidate answer.
    user_turn = {
        "role": "user",
        "content": f"Evaluate whether the output is correct or incorrect. Task: {task} Extracted value: {extracted_value}",
    }

    # temperature=0.0 and max_retries=0: one sampling-free attempt, no automatic retry.
    return anthropic_client(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        max_retries=0,
        temperature=0.0,
        system=judge_instructions,
        messages=[user_turn],
        response_model=EvaluationResult,
    )

# Experimentation setup

In [None]:
def calculate_cost_per_token(cost_per_million_tokens):
    """Convert a price quoted per one million tokens into a price per single token."""
    return cost_per_million_tokens / 1_000_000

In [None]:
import json

# System prompt describing the role of the model under test.
system_prompt = """
You are an expert at extracting key-value pairs from JSON and performing financial calculations.
The JSON contains financial statements, including income statements, balance sheets, and cash flow statements.
Your task is to extract specific values from the most recent fiscal year (FY2023) and perform the requested calculations.
Be concise, accurate, and correct in your answer. Present the results in a clear and structured format.
"""

# User prompt: the financial data followed by the nine extraction/calculation tasks.
# The data is serialized with json.dumps because interpolating the dict directly would embed
# Python's repr (single-quoted keys), which is not JSON even though the prompt calls it JSON.
message = f"""
Given the following JSON financial data: {json.dumps(financial_statements)}

Please complete the following tasks for the most recent fiscal year (FY2023):

1. Extract the "revenue" value from the "income_statements".
2. Extract the "net_income_loss" value from the "income_statements".
3. Calculate the net profit margin by dividing the "net_income_loss" by the "revenue" and express the result as a percentage rounded to two decimal places.
4. Extract the "total_assets" value from the "balance_sheets".
5. Extract the "total_liabilities" value from the "balance_sheets".
6. Calculate the debt-to-assets ratio by dividing the "total_liabilities" by the "total_assets" and round the result to two decimal places.
7. Extract the "net_income" value from the "cash_flow_statements".
8. Extract the "plant_property_and_equipment" value from the "cash_flow_statements".
9. Calculate the free cash flow by subtracting the "plant_property_and_equipment" from the "net_income".

Present the extracted values and the calculated results in a clear and structured format.
"""

# Groq

In [None]:
!pip install -U -q groq

In [None]:
import getpass
import os

# Set your Groq API key (read without echo; the prompt names the provider because
# getpass's default prompt is just "Password: ").
os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

In [None]:
import os
import time

from groq import Groq

# NOTE(review): later cells rebind the module-level name `client` to other providers'
# SDK clients, so call_groq only works while this assignment is the most recent one.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# NOTE(review): nothing visible in this notebook reads start_time; the benchmark loop
# takes its own timestamps around each call.
start_time = time.time()

def call_groq(query: str, model_name: str):
  """Send `query` to a Groq-hosted chat model and return the raw chat-completion response.

  Args:
      query: User message to send. (Previously ignored in favor of the global `message`.)
      model_name: Groq model identifier.

  Returns:
      The SDK response object; callers read `.choices[0].message.content` and `.usage`.
  """
  return client.chat.completions.create(
      messages=[
          {
              "role": "system",
              "content": system_prompt,
          },
          {
              "role": "user",
              # Use the argument, not the module-level `message`, so the parameter is honored.
              "content": query,
          },
      ],
      max_tokens=1000,
      temperature=0.0,
      model=model_name,
  )

In [None]:
# Groq prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "mixtral-8x7b-32768": (0.27, 0.27),
    }.items()
}

In [None]:
# Benchmark each Groq model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # Per-token prices depend only on the model, so convert them once per model.
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for _ in range(num_iterations):
        # Only the model call is timed; judging and bookkeeping are excluded.
        started_at = time.time()
        response = call_groq(message, model_name=model_name)
        iteration_time = time.time() - started_at

        # Grade the answer with the LLM judge.
        evaluation_results.append(
            evaluate_result(
                task=message,
                extracted_value=response.choices[0].message.content,
            )
        )

        # Price this call from the token usage reported in the response.
        usage = response.usage
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Accumulate totals for the per-model summary.
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()

# Anthropic

## Define code to call model

In [None]:
from anthropic import Anthropic

# NOTE(review): this rebinds the module-level name `client`, which call_groq also reads.
client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

def call_anthropic(query: str, model_name: str):
  """Send `query` as a single user message to an Anthropic model.

  Args:
      query: User message to send. (Previously ignored in favor of the global `message`.)
      model_name: Anthropic model identifier.

  Returns:
      The raw Messages API response (not a str); callers read `.content` and `.usage`.
  """
  # NOTE(review): unlike call_groq, no system prompt is sent here.
  return client.messages.create(
      temperature=0.0,
      max_tokens=1000,
      messages=[
          {
              "role": "user",
              "content": query,
          },
      ],
      model=model_name,
  )

## Calculate cost and speed

In [None]:
# Anthropic prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "claude-3-haiku-20240307": (0.25, 1.25),
        "claude-3-sonnet-20240229": (3.00, 15.00),
        "claude-3-opus-20240229": (15.00, 75.00),
    }.items()
}

In [None]:
# Benchmark each Anthropic model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # Per-token prices depend only on the model, so convert them once per model.
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for i in range(num_iterations):
        # Call the model (only this call is timed)
        start = time.time()
        response = call_anthropic(message, model_name=model_name)
        end = time.time()

        # `response.content` is a list of content blocks, not a string. Join the text
        # blocks so the judge sees the answer text rather than the repr of a list.
        answer_text = "".join(
            block.text for block in response.content if block.type == "text"
        )

        # Evaluate the result
        evaluation_result = evaluate_result(
            task=message,
            extracted_value=answer_text,
        )
        evaluation_results.append(evaluation_result)

        # Calculate costs from the usage reported in the response
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Calculate time
        iteration_time = end - start

        # Calculate totals
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()

# Cohere

In [None]:
!pip install -U -q cohere

In [None]:
# Set your Cohere API key (read without echo; the prompt names the provider because
# getpass's default prompt is just "Password: ").
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API key: ")

In [None]:
import cohere

# Get your cohere API key on: www.cohere.com
co = cohere.Client(os.environ["COHERE_API_KEY"])

def call_cohere(query: str, model_name: str):
  """Send `query` to the given Cohere chat model and return the raw chat response.

  Args:
      query: User message to send.
      model_name: Cohere model identifier. (Previously never forwarded, so every
          benchmarked "model" was actually served by the API's default model.)

  Returns:
      The SDK response object (not a str); callers read `.text` and `.meta`.
  """
  # NOTE(review): unlike call_groq, no system prompt is sent here.
  return co.chat(
      model=model_name,
      message=query,
      max_tokens=1000,
      temperature=0.0,
  )

In [None]:
# Cohere prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "command-light": (0.30, 0.60),
        "command-r": (0.50, 1.50),
        "command-r-plus": (3.00, 15.00),
    }.items()
}

In [None]:
# Benchmark each Cohere model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # Per-token prices depend only on the model, so convert them once per model.
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for _ in range(num_iterations):
        # Only the model call is timed; judging and bookkeeping are excluded.
        started_at = time.time()
        response = call_cohere(message, model_name=model_name)
        iteration_time = time.time() - started_at

        # Grade the answer with the LLM judge.
        evaluation_results.append(
            evaluate_result(
                task=message,
                extracted_value=response.text,
            )
        )

        # Price this call from the billed units reported in the response metadata.
        billed_units = response.meta["billed_units"]
        input_tokens = billed_units["input_tokens"]
        output_tokens = billed_units["output_tokens"]
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Accumulate totals for the per-model summary.
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()

# Mistral

In [None]:
!pip install -U -q mistralai

In [None]:
# Set your Mistral API key (read without echo; the prompt names the provider because
# getpass's default prompt is just "Password: ").
os.environ["MISTRAL_API_KEY"] = getpass.getpass("Mistral API key: ")

In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# NOTE(review): this rebinds the module-level name `client`, which call_groq and
# call_anthropic also read.
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

def call_mistral(query: str, model_name: str):
  """Send `query` as a single user message to a Mistral model.

  Args:
      query: User message to send.
      model_name: Mistral model identifier.

  Returns:
      The SDK chat response (not a str); callers read `.choices[0].message.content`
      and `.usage`.
  """
  # NOTE(review): unlike call_groq, no system prompt is sent here.
  return client.chat(
      model=model_name,
      max_tokens=1000,
      temperature=0.0,
      messages=[
        ChatMessage(role="user", content=query)
      ]
  )

In [None]:
# Mistral prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "mistral-small-2312": (0.70, 0.70),
        "mistral-large-2402": (8.00, 24.00),
    }.items()
}

In [None]:
# Benchmark each Mistral model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    # Reset the output-token accumulator for every model. The original initialized an
    # unused `total_tokens_per_second` instead, so the `+=` below either raised NameError
    # or kept adding to a stale total left behind by an earlier cell or model.
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # Per-token prices depend only on the model, so convert them once per model.
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for i in range(num_iterations):
        # Call the model (only this call is timed)
        start = time.time()
        response = call_mistral(message, model_name=model_name)
        end = time.time()

        # Evaluate the result
        evaluation_result = evaluate_result(
            task=message,
            extracted_value=response.choices[0].message.content,
        )
        evaluation_results.append(evaluation_result)

        # Calculate costs from the usage reported in the response
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Calculate time
        iteration_time = end - start

        # Calculate totals
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()

# OpenAI

In [None]:
!pip install -U -q openai

In [None]:
# Read the OpenAI key without echoing it and store it in the environment.
# The prompt names the provider because getpass's default prompt is just "Password: ".
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
from openai import OpenAI

# NOTE(review): this rebinds the module-level name `client`, which the Groq, Anthropic
# and Mistral helpers also read.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def call_openai(query: str, model_name: str):
  """Send `query` as a single user message to an OpenAI chat model.

  Args:
      query: User message to send.
      model_name: OpenAI model identifier.

  Returns:
      The SDK chat-completion response (not a str); callers read
      `.choices[0].message.content` and `.usage`.
  """
  # NOTE(review): unlike call_groq, no system prompt is sent here.
  return client.chat.completions.create(
      model=model_name,
      temperature=0,
      max_tokens=1000,
      messages=[
          {"role": "user", "content": query},
      ]
  )

In [None]:
# OpenAI prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "gpt-3.5-turbo-0125": (0.50, 1.50),
        "gpt-4-0125-preview": (10.00, 30.00),
    }.items()
}

In [None]:
# Benchmark each OpenAI model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # Per-token prices depend only on the model, so convert them once per model.
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for _ in range(num_iterations):
        # Only the model call is timed; printing, judging and bookkeeping are excluded.
        started_at = time.time()
        response = call_openai(message, model_name=model_name)
        iteration_time = time.time() - started_at

        # Echo the raw answer for manual inspection.
        print(response.choices[0].message.content)

        # Grade the answer with the LLM judge.
        evaluation_results.append(
            evaluate_result(
                task=message,
                extracted_value=response.choices[0].message.content,
            )
        )

        # Price this call from the token usage reported in the response.
        usage = response.usage
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Accumulate totals for the per-model summary.
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()

# Gemini

In [None]:
!pip install -U -q google-generativeai google-cloud-aiplatform

In [None]:
# Read the Gemini key without echoing it and store it in the environment.
# The prompt names the provider because getpass's default prompt is just "Password: ".
os.environ["GEMINI_API_KEY"] = getpass.getpass("Gemini API key: ")

In [None]:
import google.generativeai as genai

genai.configure(api_key=os.environ['GEMINI_API_KEY'])

def call_gemini(query: str, model: genai.GenerativeModel):
  """Send `query` to the given Gemini model and return the raw generation response.

  Args:
      query: Prompt to send. (Previously ignored in favor of the global `message`.)
      model: An already constructed `genai.GenerativeModel`.

  Returns:
      The SDK response object (not a str).
  """
  # NOTE(review): unlike call_groq, no system prompt is sent here.
  return model.generate_content(query)

In [None]:
# Gemini prices in USD per one million tokens, written as model id -> (input, output).
model_costs = {
    name: {"input_tokens": input_price, "output_tokens": output_price}
    for name, (input_price, output_price) in {
        "gemini-1.0-pro": (0.50, 1.50),
        "gemini-1.5-pro-latest": (7.00, 21.00),
    }.items()
}

In [None]:
# Benchmark each Gemini model: time the completions, price them, and have the judge grade them.
for model_name, costs in model_costs.items():
    num_iterations = 10
    total_time = 0
    total_output_tokens = 0
    total_cost = 0
    evaluation_results = []

    # model_name is fixed for all iterations, so build the model handle and convert
    # the per-token prices once per model rather than once per iteration.
    gemini = genai.GenerativeModel(model_name)
    input_cost = calculate_cost_per_token(costs["input_tokens"])
    output_cost = calculate_cost_per_token(costs["output_tokens"])

    for i in range(num_iterations):
        # Call the model (only this call is timed)
        start = time.time()
        response = call_gemini(message, model=gemini)
        end = time.time()

        # Use the generated text. `response.candidates[0].content` is a Content object,
        # so formatting it into the judge prompt would embed its repr, not the answer.
        result = response.text

        print(result)

        # Evaluate the result
        evaluation_result = evaluate_result(
            task=message,
            extracted_value=result,
        )
        evaluation_results.append(evaluation_result)

        # Calculate costs. Token counts come from separate count_tokens calls on the
        # prompt and on the answer text.
        input_tokens = float(gemini.count_tokens(message).total_tokens)
        output_tokens = float(gemini.count_tokens(result).total_tokens)
        iteration_cost = (input_tokens * input_cost) + (output_tokens * output_cost)

        # Calculate time
        iteration_time = end - start

        # Calculate totals
        total_time += iteration_time
        total_output_tokens += output_tokens
        total_cost += iteration_cost

        # Prevent rate limiting
        time.sleep(1)

    # Throughput = all output tokens over all timed seconds; cost is averaged per call.
    avg_tokens_per_second = total_output_tokens / total_time
    avg_cost = total_cost / num_iterations

    print(f"Model: {model_name}")
    print(f"Average tokens per second: {avg_tokens_per_second:.2f}")
    print(f"Average total cost: ${avg_cost:.5f}")
    print(f"Evaluation results: {evaluation_results}")
    print()