In [2]:
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Workaround for Chroma rejecting the system's sqlite3 version (see the linked report):
# import pysqlite3 and register it under the name 'sqlite3' in sys.modules. This must run
# before Chroma is imported further down.
# https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

# Import chromadb after updating sqlite3 https://stackoverflow.com/questions/76921252/attributeerror-module-chromadb-has-no-attribute-config
from langchain.vectorstores import Chroma

In [3]:
# Load settings from a local .env file (python-dotenv); presumably this is where
# OPENAI_API_KEY, read in the next cell, comes from.
load_dotenv()

True

In [4]:
# Read the OpenAI key from the environment. .get() returns None instead of raising
# when the variable is unset, so a missing key only surfaces later, at the first API call.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define our LLM

In [5]:
# Chat model used for every prompt and chain in the rest of the notebook
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Optional smoke test (uncomment to check that the key and model work):
# llm.invoke("Tell me a joke about cats")

## 1. Process PDF document

### Load PDF document

In [6]:
# Load the PDF into a list of Document objects; as the output below shows, each one
# carries its text in `page_content` plus 'source' and 'page' metadata.
loader = PyPDFLoader("data/ICRA25_0661_MS.pdf")
pages = loader.load()
pages

[Document(metadata={'source': 'data/ICRA25_0661_MS.pdf', 'page': 0}, page_content='Self-Supervised Multimodal NeRF for Autonomous Driving\nGaurav Sharma1, Ravi Kothari 2, Dr. Josef Schmid 3\nAbstract— In this paper, we propose a Neural Radiance Fields\n(NeRF) based framework, referred to as Novel View Synthesis\nFramework (NVSF). It jointly learns the implicit neural rep-\nresentation of space and time-varying scene for both LiDAR\nand Camera. We test this on a real-world autonomous driving\nscenario containing both static and dynamic scenes. Compared\nto existing multimodal dynamic NeRFs, our framework is self-\nsupervised, thus eliminating the need for 3D labels. For efficient\ntraining and faster convergence, we introduce heuristic based\nimage pixel sampling to focus on pixels with rich information.\nTo preserve the local features of LiDAR points, a Double\nGradient based mask is employed. Extensive experiments on\nthe KITTI-360 dataset show that, compared to the baseline\nmodels, 

### Split document

In [7]:
# Break the pages into overlapping chunks for embedding. Sizes are measured with len(),
# i.e. in characters; the separators are listed from coarsest (blank line) to finest (space).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

In [8]:
# Sanity check: look at the text of the first chunk
chunks[0].page_content

'Self-Supervised Multimodal NeRF for Autonomous Driving\nGaurav Sharma1, Ravi Kothari 2, Dr. Josef Schmid 3\nAbstract— In this paper, we propose a Neural Radiance Fields\n(NeRF) based framework, referred to as Novel View Synthesis\nFramework (NVSF). It jointly learns the implicit neural rep-\nresentation of space and time-varying scene for both LiDAR\nand Camera. We test this on a real-world autonomous driving\nscenario containing both static and dynamic scenes. Compared\nto existing multimodal dynamic NeRFs, our framework is self-\nsupervised, thus eliminating the need for 3D labels. For efficient\ntraining and faster convergence, we introduce heuristic based\nimage pixel sampling to focus on pixels with rich information.\nTo preserve the local features of LiDAR points, a Double\nGradient based mask is employed. Extensive experiments on\nthe KITTI-360 dataset show that, compared to the baseline\nmodels, our Framework has reported best performance on both\nLiDAR and Camera domain. Code

### Create embeddings

In [9]:
def get_embedding_function():
    """Build the OpenAI embeddings object used by the notebook.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        "text-embedding-ada-002" model and authenticated with the
        module-level ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Single embeddings object reused for the vector store and the evaluator below
embedding_function = get_embedding_function()
# Optional smoke test (uncomment to embed one word):
# test_vector = embedding_function.embed_query("cat")

In [None]:
from langchain.evaluation import load_evaluator

# Build an "embedding_distance" evaluator on top of the same embeddings object
evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

# Compare a closely related pair of words (contrast with the pair in the next cell)
evaluator.evaluate_strings(prediction="Amsterdam", reference="Netherlands")

In [None]:
# Same reference, but with a city from a different country, for comparison
evaluator.evaluate_strings(prediction="Paris", reference="Netherlands")

### Create vector database

In [21]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a persisted Chroma vector store from document chunks, skipping duplicates.

    Every chunk gets a deterministic id: the UUIDv5 (DNS namespace) of its
    ``page_content``. Chunks with identical text therefore share an id, and
    only the first occurrence is stored.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embeddings object forwarded to Chroma as ``embedding``.
        vectorstore_path: Directory forwarded to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # Map id -> chunk in an insertion-ordered dict so that the ids and the documents
    # handed to Chroma are guaranteed to line up position by position. (Collecting the
    # ids in a set and passing list(set) yields an arbitrary order that does not match
    # the document list, which stores documents under the wrong ids.)
    unique_chunks_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        if chunk_id in unique_chunks_by_id:
            print('Dropping duplicate chunk')
        else:
            unique_chunks_by_id[chunk_id] = chunk

    # Create a new Chroma database from the de-duplicated documents
    vectorstore = Chroma.from_documents(documents=list(unique_chunks_by_id.values()),
                                        ids=list(unique_chunks_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # Call persist() only where the store still offers it; some Chroma integrations
    # write to persist_directory automatically and have dropped the method.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    return vectorstore

In [14]:
# Create the vectorstore from the PDF chunks and persist it under "vectorstore_nerf"
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_nerf")

## 2. Query for relevant data

In [15]:
# Re-open the vectorstore from its directory on disk. The embedding function is passed
# again so that queries are embedded with the same model that embedded the chunks.
vectorstore = Chroma(persist_directory="vectorstore_nerf", embedding_function=embedding_function)

In [23]:
# Create a similarity-search retriever over the vectorstore and fetch the chunks
# that best match a sample question
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the novelty of the paper?")
relevant_chunks

[Document(metadata={'page': 0, 'source': 'data/ICRA25_0661_MS.pdf'}, page_content='and rigid objects such as pedestrians, cars, trucks, etc. The\nproposed framework empowers the novel view synthesis\nof LiDAR and camera in real-world driving scenarios of\nA Vs, supporting diverse applications such as minimizing\ndomain gaps, domain invariant training, experimenting with\ndifferent sensor specifications, sensor configurations on A V\nand revival of corrupted data.\nOur contributions are threefold:\n1) We propose a multimodal self-supervised NVS frame-\nwork which jointly learns the implicit Neural represen-\ntation of a 4D spatio-temporal scene for LiDAR and\nCamera\n2) We introduce a Heuristic pixel sampling method using\nMultinomial Probability Distribution based on recon-\nstruction error of image pixels during training\n3) To improve feature alignment in synthesised LiDAR\nCONFIDENTIAL. Limited circulation. For review only.\nManuscript 661 submitted to 2025 IEEE International Confer

In [24]:
# Prompt template with two placeholders: {context} receives the retrieved chunk text,
# {question} receives the user's question. The text itself is sent to the model verbatim.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [25]:
# Join the retrieved chunks into one string, separated by a '---' rule.
# NOTE(review): the next cell starts with this same assignment, so this cell is redundant.
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

In [26]:
# Merge the retrieved chunks into a single context string, with a '---' rule between chunks
context_text = "\n\n---\n\n".join(chunk.page_content for chunk in relevant_chunks)

# Fill the template's {context} and {question} slots, then show the final prompt text
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text,
    question="What is the novelty of the paper?",
)
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

and rigid objects such as pedestrians, cars, trucks, etc. The
proposed framework empowers the novel view synthesis
of LiDAR and camera in real-world driving scenarios of
A Vs, supporting diverse applications such as minimizing
domain gaps, domain invariant training, experimenting with
different sensor specifications, sensor configurations on A V
and revival of corrupted data.
Our contributions are threefold:
1) We propose a multimodal self-supervised NVS frame-
work which jointly learns the implicit Neural represen-
tation of a 4D spatio-temporal scene for LiDAR and
Camera
2) We introduce a Heuristic pixel sampling method using
Multinomial Probability Distribution based on recon-
struction error of image pixels during training
3) To improve feature alignment in synthesised LiDAR
C

In [27]:
# Send the fully formatted prompt to the chat model; the result is an AIMessage (see output)
llm.invoke(prompt)

AIMessage(content='The novelty of the paper lies in the proposed multimodal self-supervised Novel View Synthesis (NVS) framework, which jointly learns the implicit neural representation of a 4D spatio-temporal scene for both LiDAR and camera data. The key contributions include:\n\n1. A multimodal self-supervised NVS framework that integrates data from LiDAR and camera to enhance scene representation.\n2. A heuristic pixel sampling method based on reconstruction error to improve training efficiency.\n3. Improved feature alignment in synthesized LiDAR data, demonstrating that multimodal NVS can effectively share a Neural Radiance Field (NeRF) without compromising individual performance.\n\nThe framework shows significant improvements in feature representation, consistency, and fidelity of synthesized point clouds compared to baseline models, addressing the challenges of accurately mapping the physics of active sensors and minimizing the sim-to-real domain gap.', additional_kwargs={'refus

### Using Langchain Expression Language

In [30]:
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in order, separated by a blank line
        (an empty string when ``docs`` is empty).
    """
    separator = "\n\n"
    texts = [document.page_content for document in docs]
    return separator.join(texts)

# LCEL pipeline: the input question is sent to the retriever (whose documents are merged
# by format_docs) to fill "context", and is passed through unchanged as "question";
# the resulting dict fills prompt_template, and the prompt goes to the chat model.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What's the title of this paper?")

AIMessage(content="The title of the paper is not provided in the retrieved context. Therefore, I don't know the title of the paper.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 1220, 'total_tokens': 1244, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0705bf87c0', 'finish_reason': 'stop', 'logprobs': None}, id='run-b137c24a-5a54-4b47-95bd-fd20fe5c92d9-0', usage_metadata={'input_tokens': 1220, 'output_tokens': 24, 'total_tokens': 1244})

### Generate structured responses

In [28]:
# Schema for one extracted item. It is nested in ExtractedInfo, which is handed to
# llm.with_structured_output, so the docstring and Field descriptions below are
# presumably sent to the model as part of the schema -- reword them with care.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    # The answer itself
    answer: str = Field(description="Answer to question")
    # Verbatim supporting text taken from the retrieved context
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    # Why the sources support the answer
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
# Top-level structured output: one AnswerWithSources (answer, sources, reasoning)
# for each piece of paper metadata requested from the model.
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [31]:
# Same retrieval + prompt pipeline as before, but the model is wrapped with
# with_structured_output so the chain returns an ExtractedInfo instance (see output)
# instead of a free-text AIMessage. Note that this rebinds the name `rag_chain`.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Multimodal Self-Supervised Neural View Synthesis Framework for LiDAR and Camera', sources='The proposed framework empowers the novel view synthesis of LiDAR and camera in real-world driving scenarios of A Vs.', reasoning="The title is derived from the context that describes the framework's capabilities in synthesizing views from LiDAR and camera."), paper_summary=AnswerWithSources(answer='The paper proposes a multimodal self-supervised neural view synthesis (NVS) framework that jointly learns the implicit neural representation of a 4D spatio-temporal scene for LiDAR and camera data. It introduces a heuristic pixel sampling method based on reconstruction error and improves feature alignment in synthesized LiDAR data, demonstrating significant improvements in performance metrics.', sources='Our contributions are threefold: 1) We propose a multimodal self-supervised NVS framework which jointly learns the implicit Neural representation of

### Transform response into a dataframe

In [32]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
df = pd.DataFrame([structured_response.dict()])

# `df` has a single row; each of its cells is a dict with 'answer', 'sources' and 'reasoning' keys
extracted_row = df.iloc[0]

# Pull each key out into its own list, one entry per extracted field (column of `df`)
answer_row = [extracted_row[col]['answer'] for col in df.columns]
source_row = [extracted_row[col]['sources'] for col in df.columns]
reasoning_row = [extracted_row[col]['reasoning'] for col in df.columns]

# Reassemble as a table with three rows: 'answer', 'source' and 'reasoning'
structured_response_df = pd.DataFrame(
    [answer_row, source_row, reasoning_row],
    columns=df.columns,
    index=['answer', 'source', 'reasoning'],
)
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Multimodal Self-Supervised Neural View Synthes...,The paper proposes a multimodal self-supervise...,2025,"X. Li, J. Kaesemodel Pontes, and S. Lucey"
source,The proposed framework empowers the novel view...,Our contributions are threefold: 1) We propose...,Manuscript 661 submitted to 2025 IEEE Internat...,"[30] X. Li, J. Kaesemodel Pontes, and S. Lucey..."
reasoning,The title is inferred from the context discuss...,The summary encapsulates the main contribution...,The publication year is stated directly in the...,The authors are mentioned in the context relat...
