In [3]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
#print(openai_api_key)
llm = ChatOpenAI(api_key = openai_api_key,
                model = "gpt-4o", 
                 temperature = 0, 
                 max_tokens = None,
                timeout = None,
                max_retries = 2)

messages = [
    (
        "system",
        "You are a Question answering machine",
    ),
    (
        "user",
        "tell me something about Beijing Guoan soccer club?"
    ),
]

print(llm.invoke(messages))

content="Beijing Guoan Football Club is a professional soccer team based in Beijing, China. Founded in 1992, the club is one of the most prominent teams in the Chinese Super League (CSL). Beijing Guoan is known for its passionate fan base and has a strong presence in Chinese football.\n\nThe team plays its home matches at the Workers' Stadium, which is located in the Chaoyang District of Beijing. The stadium has a significant capacity and is one of the iconic sports venues in the city.\n\nBeijing Guoan has enjoyed success in domestic competitions, having won the Chinese Super League title and several domestic cups. The club is also known for its competitive performances in the AFC Champions League, representing Chinese football on the continental stage.\n\nThe team's colors are traditionally green and white, and its mascot is a lion, symbolizing strength and courage. Over the years, Beijing Guoan has attracted several high-profile players and coaches, both domestic and international, c

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

loader = PyPDFLoader("data/Agent AI_Surveying the horizons of multimodal interaction.pdf")
document = loader.load()


In [5]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
texts = text_splitter.split_documents(document)
print(f"created {len(texts)} chunks")

created 80 chunks


In [None]:
texts[:3]

In [6]:
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002")
persist_directory = "chroma_db"

db = Chroma.from_documents(documents = texts, embedding = embeddings, persist_directory = persist_directory)

In [None]:
db.get()['documents']

In [7]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

qa = RetrievalQA.from_chain_type(llm = llm, chain_type= "stuff", retriever = db.as_retriever())

response = qa.invoke("How LLM agent for robotics works")

print(response)

{'query': 'How LLM agent for robotics works', 'result': "LLM (Large Language Model) agents for robotics work by leveraging advanced language processing capabilities to interpret instructions and decompose them into actionable steps for robots. These agents are integrated into various aspects of robotic systems to enhance their functionality and interaction with humans. Here's how they work in different contexts:\n\n1. **Multimodal Systems**: LLMs are used as encoders to process both linguistic instructions and visual cues, guiding robotic actions effectively. This integration allows robots to understand and act upon complex instructions that involve multiple types of input.\n\n2. **Task Planning and Skill Training**: LLMs help in high-level task planning by interpreting instructions and breaking them down into specific robot actions. They are also used in designing reward functions and generating data for policy learning, which aids in training efficient robot controllers.\n\n3. **On-s

In [13]:
query = "How LLM agent for robotics works?"
docs = db.similarity_search(query, k = 5)
print(docs[0].page_content)

Agent AI:
Surveying the Horizons of Multimodal Interaction A PREPRINT
While these elements—precise contact points and arm posture—are intuitive for humans, articulating them through
language is challenging. Despite advances in internet-scale VLMs, capturing these nuanced indirect cues from scenes
and translating them effectively into robotic skills remains a significant challenge. In response, the robotics community
is increasingly focusing on collecting enhanced datasets(e.g., (Wang et al., 2023d; Padalkar et al., 2023)) or developing
methodologies for direct skill acquisition from human demonstrations (Wake et al., 2021a). Frameworks including
Learning-from-Demonstration and Imitation Learning are leading these developments, playing a crucial role in the
optimization of physical skills.
6.2.1 LLM/VLM Agent for Robotics.
Recent research has demonstrated the potential of LLM/VLMs for robotic agents that involve interactions with humans
in an environment. Research topics that aim to lev