In [12]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
#print(openai_api_key)
llm = ChatOpenAI(api_key = openai_api_key,
                model = "gpt-4o", 
                 temperature = 0, 
                 max_tokens = None,
                timeout = None,
                max_retries = 2)

messages = [
    (
        "system",
        "You are a Question answering machine",
    ),
    (
        "user",
        "How LLM agent for robotics works?"
    ),
]

print(llm.invoke(messages))

content="A Large Language Model (LLM) agent for robotics integrates natural language processing capabilities with robotic systems to enable more intuitive and flexible interactions between humans and robots. Here's a general overview of how such an agent might work:\n\n1. **Natural Language Understanding (NLU):** The LLM is used to process and understand human language inputs. This involves parsing commands, questions, or descriptions provided by a user. The model interprets the intent behind the language, extracting relevant information and context.\n\n2. **Task Planning and Decision Making:** Once the LLM understands the user's intent, it translates this into actionable tasks for the robot. This involves decision-making processes where the agent determines the sequence of actions required to achieve the desired outcome. This step may involve integrating with other AI models or systems that specialize in planning and decision-making.\n\n3. **Robotic Control and Execution:** The LLM ag

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

loader = PyPDFLoader("data/Agent AI_Surveying the horizons of multimodal interaction.pdf")
document = loader.load()


In [6]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
texts = text_splitter.split_documents(document)
print(f"created {len(texts)} chunks")

created 80 chunks


In [None]:
texts[:3]

In [8]:
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002")
persist_directory = "chroma_db"

db = Chroma.from_documents(documents = texts, embedding = embeddings, persist_directory = persist_directory)

In [None]:
db.get()['documents']

In [11]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

qa = RetrievalQA.from_chain_type(llm = llm, chain_type= "stuff", retriever = db.as_retriever())

response = qa.invoke("How LLM agent for robotics works")

print(response)

{'query': 'How LLM agent for robotics works', 'result': 'LLM agents for robotics work by leveraging the advanced language processing capabilities of large language models (LLMs) to interpret instructions and decompose them into actionable steps for robots. These agents are integrated into robotic systems to enhance various aspects of robot interaction and task execution. Here are some key components and functionalities of LLM agents in robotics:\n\n1. **Multimodal Systems**: LLMs are used in conjunction with visual language models (VLMs) to process both linguistic instructions and visual cues. This integration allows robots to understand and act upon complex instructions that involve both language and visual information.\n\n2. **Task Planning and Skill Training**: LLMs assist in high-level task planning by interpreting instructions and breaking them down into specific robot actions. They are also used in designing reward functions and generating data to facilitate policy learning, cont

In [13]:
query = "How LLM agent for robotics works?"
docs = db.similarity_search(query, k = 5)
print(docs[0].page_content)

Agent AI:
Surveying the Horizons of Multimodal Interaction A PREPRINT
While these elements—precise contact points and arm posture—are intuitive for humans, articulating them through
language is challenging. Despite advances in internet-scale VLMs, capturing these nuanced indirect cues from scenes
and translating them effectively into robotic skills remains a significant challenge. In response, the robotics community
is increasingly focusing on collecting enhanced datasets(e.g., (Wang et al., 2023d; Padalkar et al., 2023)) or developing
methodologies for direct skill acquisition from human demonstrations (Wake et al., 2021a). Frameworks including
Learning-from-Demonstration and Imitation Learning are leading these developments, playing a crucial role in the
optimization of physical skills.
6.2.1 LLM/VLM Agent for Robotics.
Recent research has demonstrated the potential of LLM/VLMs for robotic agents that involve interactions with humans
in an environment. Research topics that aim to lev