In [1]:
# Setup and Imports
from typing import List, Dict, Any
from dotenv import load_dotenv
from langsmith import Client
from openai import OpenAI
from tqdm import tqdm
import json
from langsmith.wrappers import wrap_openai

load_dotenv()
base_client = OpenAI()
openai_client = wrap_openai(base_client)
langsmith_client = Client()

In [2]:
# Split the document into large, overlapping chunks. We're not using
# the chunks for retrieval, so they don't need to be small.

# In fact, since we're using the chunks to generate evaluation
# questions, it's better if they have more context.

# Note: we don't use the markdown parser because it splits the
# chunks by section, and we want to split by chunk size.

from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import FlatReader

docs = SimpleDirectoryReader(
    input_files=["./data/Tesla Cybertruck Owners Manual.md"],
    filename_as_id=True,
    file_extractor={
        ".md": FlatReader()
    }
).load_data()

from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(
    chunk_size=2000,
    chunk_overlap=400
)

nodes = parser.get_nodes_from_documents(docs)

In [3]:
# Inspect a few of the chunks created by the parser

print(f"\nTotal number of chunks: {len(nodes)}\n\n")

print("\nFirst 3 chunks:")
for i in range(min(3, len(nodes))):
    print(f"\n--- Chunk {i+1} ---")
    print(nodes[i].text)
    print("\n" + "="*80)


Total number of chunks: 112



First 3 chunks:

--- Chunk 1 ---
Tesla logo

# OWNER'S MANUAL

Image of Tesla Cybertruck

Software version: 2024.33.15
North America
---
# YOUR OWNER'S MANUAL

For the latest and greatest information that is customized to your vehicle, view the Owner's Manual on your vehicle's touchscreen by touching the app launcher and then selecting the Manual app. The information is specific to your vehicle depending on the features you purchased, vehicle configuration, market region, and software version. In contrast, owner information that is provided by Tesla elsewhere is updated as necessary and may not contain information unique to your vehicle.

## RELEASE NOTES

Information about new features is displayed on the touchscreen after a software update, and can be viewed at any time by choosing the Release Notes tab in the Manual app, or by touching Controls > Software > Release Notes. If the content in the Owner's Manual on how to use your vehicle conflicts with i

In [4]:
# Leverage multiple prompts to generate a diverse set of questions

FACTUAL_PROMPT = """Generate 2-3 questions that real Cybertruck owners would actually type into a search bar or ask in an owners' forum. These should feel completely natural and conversational.

Write questions as if they were being typed into a search bar or asked in a forum. For example:

Instead of:
- "How do I activate the climate control system?"
- "What should I do if the touchscreen becomes unresponsive?"
- "How does one optimize range in cold weather?"

Write:
- "how to turn on AC in cybertruck"
- "screen frozen - what now?"
- "battery draining fast in cold weather"

Make questions feel real by:
1. Using natural search patterns
   - "how to..."
   - "why is my..."
   - "help with..."
2. Including context and emotion
   - "stuck at supercharger"
   - "help! frunk won't open"
   - "confused about ride height settings"
3. Writing like real people
   - Use contractions (I'm, won't, can't)
   - OK to use incomplete sentences
   - Include emotional context ("Help!", "Confused about...", "Worried about...")
4. Adding situational details
   - "in rain"
   - "with kids"
   - "while camping"

For each question, evaluate its real-world relevance:
- "common": Everyday, urgent needs:
  * "trunk won't close"
  * "phone key not working"
  * "what's this warning light mean"

- "rare": Occasional situations:
  * "winterizing cybertruck"
  * "car wash settings?"
  * "towing setup help"

- "unlikely": Technical/administrative:
  * Manual details
  * Specifications
  * Legal info

Text: {text}

Provide your response in the following JSON format:
{{
    "questions": [
        {{
            "question": "Natural, search-like question",
            "answer": "Clear, helpful answer",
            "supporting_text": "Relevant excerpt from source text",
            "question_type": "factual",
            "relevance_level": "common|rare|unlikely",
            "relevance_reasoning": "Brief explanation of why this question fits the chosen relevance level"
        }}
    ]
}}"""

REASONING_PROMPT = """Generate 2-3 questions that real Cybertruck owners would ask when trying to understand how features work together or make decisions about using their vehicle. These should feel like real forum posts or search queries.

Write questions as if they were being posted in an owners' forum. For example:

Instead of:
- "What is the optimal charging strategy?"
- "How does ambient temperature affect range?"
- "What are the considerations for child safety?"

Write:
- "best way to charge for long road trip?"
- "losing tons of range in cold - what helps?"
- "safest seats for car seats?"

Make questions feel real by:
1. Using natural patterns
   - "better to..."
   - "best way to..."
   - "tips for..."
2. Including context and emotion
   - "worried about range"
   - "confused about charging"
   - "need advice on settings"
3. Writing like real people
   - Use contractions (I'm, won't, can't)
   - OK to use incomplete sentences
   - Include emotional context ("Help!", "Confused about...", "Worried about...")
4. Adding situational details
   - "for camping"
   - "in winter"
   - "with full family"

For each question, evaluate its real-world relevance:
- "common": Everyday decisions:
  * "faster charging vs battery life?"
  * "seat heaters or cabin heat?"
  * "best settings for commute"

- "rare": Occasional planning:
  * "road trip planning help"
  * "winter driving tips"
  * "towing affects on range?"

- "unlikely": Technical/theoretical:
  * System details
  * Technical specs
  * Legal considerations

Text: {text}

Provide your response in the following JSON format:
{{
    "questions": [
        {{
            "question": "Natural, forum-style question",
            "answer": "Practical, helpful answer",
            "supporting_text": "Relevant excerpt from source text",
            "question_type": "reasoning",
            "relevance_level": "common|rare|unlikely",
            "relevance_reasoning": "Brief explanation of why this question fits the chosen relevance level"
        }}
    ]
}}"""

In [5]:
# Generate a small number of questions to test the prompts
# After inspecting the generated questions, we can adjust the prompts,
# if necessary.

response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a skilled question generator."},
        {"role": "user", "content": FACTUAL_PROMPT.format(text=nodes[0].text)}
    ],
    response_format={"type": "json_object"}
)

factual_questions = json.loads(response.choices[0].message.content)
print("Factual Questions Generated:")
print(json.dumps(factual_questions, indent=2))

# Test reasoning questions
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a skilled question generator."},
        {"role": "user", "content": REASONING_PROMPT.format(text=nodes[0].text)}
    ],
    response_format={"type": "json_object"}
)

reasoning_questions = json.loads(response.choices[0].message.content)
print("\nReasoning Questions Generated:")
print(json.dumps(reasoning_questions, indent=2))

Factual Questions Generated:
{
  "questions": [
    {
      "question": "how to open frunk on cybertruck if battery is dead",
      "answer": "To open the powered frunk with no power, refer to page 236 of your Owner's Manual for the specific steps required.",
      "supporting_text": "Opening the Powered Frunk with No Power...................... 236",
      "question_type": "factual",
      "relevance_level": "common",
      "relevance_reasoning": "The question is common as it's important and urgent for users who may experience a dead battery and need access to their vehicle's frunk for emergency equipment or charging cables."
    },
    {
      "question": "cybertruck ac not working, what should i check?",
      "answer": "Check the climate controls operation by navigating to page 154 in the Owner's Manual. Ensure all settings are correct and see if there\u2019s any indicator of a malfunction.",
      "supporting_text": "Operating Climate Controls......................................

In [6]:
# Sample random chunks of the document to generate evaluation
# questions.

import random
NUM_CHUNKS = 10

random_chunks = random.sample(nodes, NUM_CHUNKS)
print(f"Selected {len(random_chunks)} random chunks")

# 2. Generate Questions for Random Chunks
candidate_examples = []

for node in tqdm(random_chunks):
    # Generate factual questions
    factual_response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a skilled question generator."},
            {"role": "user", "content": FACTUAL_PROMPT.format(text=node.text)}
        ],
        response_format={"type": "json_object"}
    )
    
    # Generate reasoning questions
    reasoning_response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a skilled question generator."},
            {"role": "user", "content": REASONING_PROMPT.format(text=node.text)}
        ],
        response_format={"type": "json_object"}
    )
    
    # Parse responses
    factual_questions = json.loads(factual_response.choices[0].message.content)["questions"]
    reasoning_questions = json.loads(reasoning_response.choices[0].message.content)["questions"]
    
    # Format and store
    for question in factual_questions + reasoning_questions:
        example = {
            "question": question["question"],
            "answer": question["answer"],
            "metadata": {
                "chunk_id": node.node_id,
                "question_type": question["question_type"],
                "supporting_text": question["supporting_text"],
                "relevance_level": question["relevance_level"],
                "source_position": node.start_char_idx if hasattr(node, 'start_char_idx') else None,
                "filename": node.metadata.get("filename", "unknown")
            }
        }
        candidate_examples.append(example)

# 3. Filter for Common Questions
common_examples = [ex for ex in candidate_examples 
                  if ex["metadata"]["relevance_level"] == "common"]
print(f"\nFound {len(common_examples)} common questions")

Selected 10 random chunks


100%|██████████| 10/10 [02:49<00:00, 16.91s/it]


Found 39 common questions





In [7]:
# Print selected examples for inspection

import textwrap

print("\Generated common examples:")
print("=" * 80)

for i, example in enumerate(common_examples, 1):
    print(f"\nExample {i}:")
    print("-" * 40)
    print(f"Question: {example['question']}")
    print(f"Answer: {example['answer']}")
    print("\nMetadata:")
    print(f"  Question Type: {example['metadata']['question_type']}")
    print(f"  Relevance Level: {example['metadata']['relevance_level']}")
    print(f"  Source File: {example['metadata']['filename']}")
    print(f"  Chunk ID: {example['metadata']['chunk_id']}")
    print("\nSupporting Text:")
    print(textwrap.fill(example['metadata']['supporting_text'], width=70))
    print("=" * 80)


\Generated common examples:

Example 1:
----------------------------------------
Question: how to fix stiff brake pedal cybertruck
Answer: If your brake pedal feels stiffer than usual, it could be due to a brake booster failure. Apply steady force to the brake pedal without releasing or pumping, and drive cautiously while maintaining a safe distance from other vehicles.

Metadata:
  Question Type: factual
  Relevance Level: common
  Source File: Tesla Cybertruck Owners Manual.md
  Chunk ID: 1451c3f2-dc0b-419e-ba71-44c71ecb53b6

Supporting Text:
Hydraulic boost compensation provides mechanical assistance if the
brake booster fails. If a brake booster failure is detected, the brake
pedal feels stiffer to press.

Example 2:
----------------------------------------
Question: confused about regenerative braking in cybertruck
Answer: Regenerative braking helps slow down the Cybertruck by feeding power back to the battery when your foot is off the accelerator. The deceleration might vary depe

  print("\Generated common examples:")


In [8]:
# Add the generated questions to the Langsmith dataset. Note:
# first, create the dataset in Langsmith UI, and apply the chat schema.

dataset_name = "rag_evaluation_dataset"

for example in tqdm(common_examples):
    # Format as chat input/output
    input_data = {
        "messages": [
            {"role": "user", "content": example["question"]}
        ]
    }
    
    output_data = {
        "message": {
            "role": "assistant", 
            "content": example["answer"]
        }
    }

    langsmith_client.create_example(
        dataset_name=dataset_name,
        inputs=input_data,
        outputs=output_data,
        metadata=example["metadata"]
    )

print("Complete!")

100%|██████████| 39/39 [00:26<00:00,  1.48it/s]

Complete!





In [9]:
from openai import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable, evaluate

from dotenv import load_dotenv
load_dotenv()

client = wrap_openai(OpenAI())

In [12]:
import os

from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
import weaviate
from weaviate.classes.init import Auth

wcd_url = os.environ["WCD_URL"]
wcd_api_key = os.environ["WCD_API_KEY"]

weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                                    
    auth_credentials=Auth.api_key(wcd_api_key),
)

vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, index_name="Tesla")
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()
retriever = index.as_retriever()

In [34]:
@traceable
def agent(inputs: dict) -> dict:
    chunks = retriever.retrieve(inputs["messages"][0]["content"])
    context = ""
    for chunk in chunks:
        context += chunk.text

    messages = [
        {"role": "user", "content": context},
        *inputs["messages"]
    ]
    print(messages)
    result = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0.2
    )
    print(result)
    return {
        "message": {
            "role": "assistant",
            "content": result.choices[0].message.content
        }
    }

In [17]:
def correctness_evaluator(run, example) -> dict:
    """
    Evaluates the correctness of generated unit tests
    
    Args:
        run: Contains the run information including inputs and outputs
        example: Contains the reference example if available
    
    Returns:
        Dictionary with score (0-1) and explanation
    """

    question = run.inputs["inputs"]["messages"][-1]["content"]
    
    # Extract the model's generated tests
    answer = run.outputs["message"]["content"]
    
    # Rest of the evaluation logic remains the same
    evaluation_prompt = f"""
    Given this question from an owner of a Tesla Cyber Truck
    {question}

    And this answer from the owner's manual
    {answer}
    
    Score from 0-4:
    4 = the answer matches perfectly and varies only in choice of language and/or grammaer
    3 = the answer is functionally correct but is missing important data, or includes extraneous or slightly incorrect data
    2 = the answer is on the right track and bears some similarity to the right answer, but is wrong in important aspects or missing data
    1 = the answer is wrong or does not answer the question
    0 = the answer is misleading and not only doesn't answer the question, but confuses the reader
    
    Return only the number (0-4).
    """
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a test evaluation assistant. Respond only with a number 0-4."},
            {"role": "user", "content": evaluation_prompt}
        ],
        temperature=0
    )
    
    try:
        score = int(response.choices[0].message.content.strip())
        return {
            "key": "correctness score",
            "score": score / 4,  # Normalize to 0-1
            "explanation": f"Test correctness score: {score}/4"
        }
    except ValueError:
        return {
            "key": "correctness score",
            "score": 0,
            "explanation": "Failed to parse score"
        }


In [35]:
# Evaluate the target task
results = evaluate(
    agent,
    data="rag_evaluation_dataset",
    evaluators=[correctness_evaluator],
    experiment_prefix="w2m5"
)

View the evaluation results for experiment: 'w2m5-0c0a15d0' at:
https://smith.langchain.com/o/86d4db5d-f48f-4b03-bccd-c0b77209468a/datasets/5e00640d-3b08-4b2d-93b3-c1f0c8a737bd/compare?selectedSessions=b84d55e7-126c-4603-88c8-af8cf4eb4d51




0it [00:00, ?it/s]

HERE worried about scratches on stainless steel - how to fix?
[{'role': 'user', 'content': 'Dents and Scratches\n\nThe stainless steel exterior of Cybertruck is more resistant to dents and dings than most other vehicles. However, Cybertruck does not have a clear coat on the surface of the exterior body panels, meaning any scratches that appear are in the stainless steel panels themselves. Anyone performing scratch repair should refer to the applicable "Exterior Stainless Steel Panel Refinishing" procedure within the Collision Repair Manual on service.tesla.com. In addition, do not use, and/or immediately remove, chemical, corrosive, or non-pH neutral substances (including but not limited to: acidic liquids or materials, grease, oil, tree resin, dead insects, tar spots, road salt, industrial fallout, etc.) as they can cause corrosion on the vehicle\'s exterior.\n\n> CAUTION: Tesla is not liable for any damage caused by failing to refer to official guidance.Dashboard and Plastic Surfaces