# Joint Tabular/Semantic QA over Tesla 10K

In this example, we show how to ask questions over a 10-K filing with an understanding of both the unstructured text and the embedded tables.

We use Unstructured to parse out the tables, and use LlamaIndex recursive retrieval to index/retrieve tables if necessary given the user question.

In [50]:
from pydantic import BaseModel
from unstructured.partition.html import partition_html
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

## Perform Data Extraction

In these sections we use Unstructured to parse out the table and non-table elements.

### Extract Elements

We use Unstructured to extract table and non-table elements from the 10-K filing.

In [None]:
!wget "https://www.dropbox.com/scl/fi/mlaymdy1ni1ovyeykhhuk/tesla_2021_10k.htm?rlkey=qf9k4zn0ejrbm716j0gg7r802&dl=1" -O tesla_2021_10k.htm
!wget "https://www.dropbox.com/scl/fi/rkw0u959yb4w8vlzz76sa/tesla_2020_10k.htm?rlkey=tfkdshswpoupav5tqigwz1mp7&dl=1" -O tesla_2020_10k.htm

In [3]:
from typing import Any, Optional
import pandas as pd


class Element(BaseModel):
    """One parsed piece of a filing: either a text element or a table."""

    # Identifier assigned by the extractor, of the form "id_<position>".
    id: str
    # Element kind; the rest of this notebook uses "table" or "text".
    type: str
    # The raw parsed element object produced by the partitioner.
    element: Any
    # Table summary; stays None until the summarization step fills it in.
    summary: Optional[str] = None
    # Parsed table contents; only populated for table elements.
    table: Optional[pd.DataFrame] = None

    class Config:
        # pd.DataFrame is not a pydantic-native type, so arbitrary types
        # have to be allowed for the `table` field.
        arbitrary_types_allowed = True

In [4]:
from lxml import html
import pandas as pd


def html_to_df(html_str):
    """Convert the first ``<table>`` in an HTML string into a DataFrame.

    The first ``<tr>`` supplies the column names; every following ``<tr>``
    becomes one data row. Cell values are the stripped text of each ``<td>``.

    Args:
        html_str: HTML markup expected to contain a ``<table>`` element.

    Returns:
        A ``pd.DataFrame`` of strings. An empty DataFrame is returned when
        the markup contains no table or the table has no rows.
    """
    tree = html.fromstring(html_str)
    tables = tree.xpath("//table")
    if not tables:
        # Nothing to parse; an empty frame lets callers treat it as "no data".
        return pd.DataFrame()

    data = []
    for row in tables[0].xpath(".//tr"):
        # text_content() also collects text nested inside child tags
        # (<b>, <span>, ...) which the plain ``.text`` attribute would drop.
        data.append([cell.text_content().strip() for cell in row.xpath(".//td")])

    if not data:
        return pd.DataFrame()

    return pd.DataFrame(data[1:], columns=data[0])

In [5]:
# simple heuristic to filter the table (if there's only one row or one column)
def filter_table(table_element):
    """Decide whether a parsed table element is worth keeping.

    Returns True only when the table has more than one data row and more
    than one column; otherwise False.
    """
    parsed = html_to_df(table_element.metadata.text_as_html)
    row_count = len(parsed)
    column_count = len(parsed.columns)
    return row_count > 1 and column_count > 1

In [6]:
def extract_elements(filename, table_filters=None):
    """Partition an HTML file into text and table ``Element`` objects.

    Args:
        filename: Path of the HTML file to partition.
        table_filters: Optional iterable of callables. Each receives a table
            element and returns a truthy value to keep it. A table is kept
            only if every filter accepts it. ``None`` means no filtering.

    Returns:
        A list of ``Element`` objects in document order. Ids come from the
        element's position in the partitioned output, so rejected tables
        leave gaps in the id sequence.
    """
    # None (rather than a mutable [] default) avoids sharing one list
    # object across calls.
    filters = [] if table_filters is None else table_filters

    elements = partition_html(filename=filename)
    output_els = []
    for idx, element in enumerate(elements):
        element_id = f"id_{idx}"

        if "unstructured.documents.html.HTMLTable" not in str(type(element)):
            output_els.append(Element(id=element_id, type="text", element=element))
            continue

        # A generator lets all() stop at the first filter that rejects.
        if not all(tf(element) for tf in filters):
            continue

        table_df = html_to_df(str(element.metadata.text_as_html))
        output_els.append(
            Element(id=element_id, type="table", element=element, table=table_df)
        )
    return output_els

In [7]:
def get_table_elements(elements):
    """Return the elements whose ``type`` is ``"table"``, in input order."""
    tables = []
    for candidate in elements:
        if candidate.type == "table":
            tables.append(candidate)
    return tables


def get_text_elements(elements):
    """Return the elements whose ``type`` is ``"text"``, in input order."""
    texts = []
    for candidate in elements:
        if candidate.type == "text":
            texts.append(candidate)
    return texts

In [8]:
elements = extract_elements("tesla_2021_10k.htm", table_filters=[filter_table])

In [9]:
table_elements = get_table_elements(elements)
text_elements = get_text_elements(elements)

In [10]:
len(table_elements)

105

### Summarize Tables

We specifically go through tables and use LlamaIndex to help extract a summary.

In [11]:
from llama_index import SummaryIndex, Document
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from tqdm.notebook import tqdm

llm = OpenAI(model="gpt-4")

system_prompt = """\
You are an assistant designed to extract insights from messy tables in a financial report.

You are also designed to filter out "tables" that are not useful to keep. For instance, if the table \
is a wrongfully extracted piece of text, or does not contain any useful information.
"""

service_context = ServiceContext.from_defaults(system_prompt=system_prompt, llm=llm)

In [12]:
class TableOutput(BaseModel):
    """Output from analyzing a table."""

    # Concise, caption-like description of the table.
    summary: str
    # The model's verdict on whether the table is worth keeping.
    # NOTE(review): the summarization code visible here only reads `summary`;
    # this flag is not acted upon.
    should_keep: bool


def extract_table_summaries(elements):
    """Fill in ``summary`` on every table element, in place.

    Each table element is wrapped in a single-document summary index and
    queried with ``TableOutput`` as the structured output class. Elements
    whose ``type`` is not ``"table"`` are left untouched. Returns None.
    """
    # The prompt is the same for every table, so build it once.
    query_str = """\
What is this table about? Give a very concise summary (imagine you are adding a caption), \
and also output whether or not the table should be kept.
"""
    for item in tqdm(elements):
        if item.type == "table":
            table_doc = Document(text=str(item.element))
            summary_index = SummaryIndex.from_documents([table_doc])
            engine = summary_index.as_query_engine(output_cls=TableOutput)
            answer = engine.query(query_str)
            # Only the summary is stored; should_keep is not consulted here.
            item.summary = answer.response.summary

In [13]:
extract_table_summaries(table_elements)

  0%|          | 0/105 [00:00<?, ?it/s]

In [14]:
table_elements[0].summary

'Delaware 91-2197729'

In [16]:
# [optional] save
import pickle

In [17]:
# Use a context manager so the file is flushed and closed before it is read back.
with open("elements.pkl", "wb") as f:
    pickle.dump(elements, f)

In [18]:
# [optional] load

# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open("elements.pkl", "rb") as f:
    elements = pickle.load(f)

## Setup Recursive Retriever

Now that we've extracted tables and their summaries, we can set up a recursive retriever in LlamaIndex to query these tables.

### Create Nodes

In [19]:
from llama_index.schema import TextNode, IndexNode
from llama_index.node_parser import SimpleNodeParser

In [20]:
# join all non-table elements into documents, which are then split into text nodes

In [68]:
def _get_nodes_from_buffer(buffer, node_parser):
    """Merge buffered text snippets into one document and split it into nodes.

    The snippets are joined with a blank line between them, wrapped in a
    single ``Document``, and handed to ``node_parser``.
    """
    joined_text = "\n\n".join(buffer)
    return node_parser.get_nodes_from_documents([Document(text=joined_text)])


def get_nodes_and_mappings(elements):
    """Turn parsed elements into index nodes plus table lookup mappings.

    Consecutive non-table elements are buffered and converted to text nodes
    whenever a table is reached (and once more at the end). Each table
    contributes an ``IndexNode`` holding its summary and pointing at the key
    ``"<element id>_table"``.

    Returns:
        A tuple ``(nodes, node_mappings, other_mappings)`` where
        ``node_mappings`` maps each table key to a ``TextNode`` containing
        the table rendered as a string, and ``other_mappings`` maps the same
        key to ``(table DataFrame, summary string)``.
    """
    pd.options.display.max_columns = None
    parser = SimpleNodeParser.from_defaults()

    nodes = []
    node_mappings = {}
    other_mappings = {}
    pending_text = []

    def flush_pending_text():
        # Convert whatever text has accumulated into nodes, then reset.
        if pending_text:
            nodes.extend(_get_nodes_from_buffer(pending_text, parser))
            pending_text.clear()

    for element in elements:
        if element.type != "table":
            pending_text.append(str(element.element))
            continue

        # Text seen before this table must precede it in the node list.
        flush_pending_text()

        table_key = element.id + "_table"
        summary_text = str(element.summary)
        index_node = IndexNode(text=summary_text, index_id=table_key)
        node_mappings[table_key] = TextNode(text=element.table.to_string())
        other_mappings[table_key] = (element.table, summary_text)
        nodes.append(index_node)

    # Trailing text after the last table.
    flush_pending_text()

    return nodes, node_mappings, other_mappings

In [22]:
nodes, node_mappings, other_mappings = get_nodes_and_mappings(elements)

In [23]:
print(other_mappings["id_1715_table"])

(                                                                               \
0                               Year Ended December 31,                         
1                         2021                                         2020     
2    United States           $                           23,973                 
3            China                                       13,844                 
4            Other                                       16,006                 
5            Total           $                           53,823                 

                                                 
0                                                
1                 2019                           
2  $    15,207                $    12,653        
3        6,662                      2,979        
4        9,667                      8,946        
5  $    31,536                $    24,578        , 'Revenue by country for the years 2019, 2020, and 2021')


### Construct Retrievers

In [24]:
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import VectorStoreIndex

In [25]:
# construct top-level vector index + query engine
# `nodes` holds both the plain text nodes and the table IndexNodes.
vector_index = VectorStoreIndex(nodes)
# Retriever over the index, limited to the single best match.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)
# Query engine built directly on the index; the cells below use it as the
# baseline to compare against the recursive retriever.
vector_query_engine = vector_index.as_query_engine(similarity_top_k=1)

In [26]:
from llama_index.retrievers import RecursiveRetriever

# Wrap the vector retriever so that table IndexNodes can be followed to the
# full table text stored in node_mappings.
recursive_retriever = RecursiveRetriever(
    # Id of the starting retriever; same string as the key in retriever_dict.
    "vector",
    retriever_dict={"vector": vector_retriever},
    # Table key -> TextNode containing the rendered table.
    node_dict=node_mappings,
    # Print the retrieval trace (see the coloured output of the query cells).
    verbose=True,
)
query_engine = RetrieverQueryEngine.from_args(recursive_retriever)

### Run some Queries

In [27]:
response = query_engine.query("What was the revenue in 2020?")
print(str(response))

[1;3;34mRetrieving with query id None: What was the revenue in 2020?
[0m[1;3;38;5;200mRetrieved node with id, entering: id_1715_table
[0m[1;3;34mRetrieving with query id id_1715_table: What was the revenue in 2020?
[0mThe revenue in 2020 was $31,536.


In [28]:
# compare against the baseline retriever
response = vector_query_engine.query("What was the revenue in 2020?")
print(str(response))

The revenue in 2020 was not provided in the given context information.


In [29]:
response = query_engine.query("What were the total cash flows in 2021?")

[1;3;34mRetrieving with query id None: What were the total cash flows in 2021?
[0m[1;3;38;5;200mRetrieved node with id, entering: id_558_table
[0m[1;3;34mRetrieving with query id id_558_table: What were the total cash flows in 2021?
[0m

In [30]:
print(str(response))

The total cash flows in 2021 were $11,497 million.


In [31]:
response = vector_query_engine.query("What were the total cash flows in 2021?")
print(str(response))

The total cash flows in 2021 were not provided in the given context information.


In [None]:
response = query_engine.query("What are the risk factors for Tesla?")
print(str(response))

In [51]:
response = vector_query_engine.query("What are the risk factors for Tesla?")
print(str(response))

The risk factors for Tesla include strong competition for skilled individuals in the labor market, negative publicity, potential impacts from reductions in force and departure of senior personnel, competition from companies with greater financial resources, dependence on the services of Elon Musk, potential cyber-attacks or security incidents, and reliance on service providers who may be vulnerable to security breaches. These factors could harm Tesla's ability to retain and hire qualified personnel, disrupt its business, harm its reputation, result in legal and financial exposure, and cause other adverse consequences.


## Try Table Comparisons

In this setting we load in both the 2021 and 2020 10K filings, parse each into a hierarchy of tables/text objects, define a recursive retriever over each, and then compose both with a SubQuestionQueryEngine.

This allows us to execute document comparisons against both.

### Define E2E Recursive Retriever Function

In [119]:
import pickle
import os


def create_recursive_retriever_over_doc(file_name, elements_save_path=None):
    """Go from a document path to a recursive-retriever query engine.

    Args:
        file_name: Path of the HTML filing to parse.
        elements_save_path: Optional pickle cache path. If the file exists,
            the parsed elements are loaded from it and the document is not
            re-parsed or re-summarized. Otherwise the elements are extracted
            and summarized, then written to this path when it is given.

    Returns:
        A tuple ``(query_engine, nodes)``: the query engine built on the
        recursive retriever, and the nodes the vector index was built from.
    """
    ### Load Data
    if elements_save_path is not None and os.path.exists(elements_save_path):
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files you created yourself.
        with open(elements_save_path, "rb") as f:
            elements = pickle.load(f)
    else:
        elements = extract_elements(file_name, table_filters=[filter_table])
        # Summaries are written onto the table elements in place, so
        # `elements` sees them too.
        extract_table_summaries(get_table_elements(elements))
        if elements_save_path is not None:
            with open(elements_save_path, "wb") as f:
                pickle.dump(elements, f)

    # convert into nodes (the DataFrame/summary mapping is not needed here)
    nodes, node_mappings, _ = get_nodes_and_mappings(elements)

    ### Construct Retrievers
    # construct top-level vector index + recursive retriever on top of it
    vector_index = VectorStoreIndex(nodes)
    vector_retriever = vector_index.as_retriever(similarity_top_k=2)
    recursive_retriever = RecursiveRetriever(
        "vector",
        retriever_dict={"vector": vector_retriever},
        node_dict=node_mappings,
        verbose=True,
    )
    query_engine = RetrieverQueryEngine.from_args(recursive_retriever)
    return query_engine, nodes

### Create Sub Question Query Engine

In [120]:
import nest_asyncio

nest_asyncio.apply()

In [121]:
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine

In [122]:
query_engine_2021, nodes_2021 = create_recursive_retriever_over_doc(
    "tesla_2021_10k.htm", elements_save_path="2021_elements.pkl"
)
query_engine_2020, nodes_2020 = create_recursive_retriever_over_doc(
    "tesla_2020_10k.htm", elements_save_path="2020_elements.pkl"
)

In [123]:
# setup base query engine as tool
# One tool per filing year, each backed by that year's recursive-retriever
# query engine.
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine_2021,
        metadata=ToolMetadata(
            name="tesla_2021_10k",
            description="Provides information about Tesla financials for year 2021",
        ),
    ),
    QueryEngineTool(
        query_engine=query_engine_2020,
        metadata=ToolMetadata(
            name="tesla_2020_10k",
            description="Provides information about Tesla financials for year 2020",
        ),
    ),
]

# Combines both tools behind a single engine; reuses the service_context
# (GPT-4 + table-analysis system prompt) defined earlier in the notebook.
sub_query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    use_async=True,
)

### Try out some Comparisons

In [None]:
response = sub_query_engine.query(
    "Can you compare and contrast the cash flow in 2021 with 2020?"
)

In [127]:
print(str(response))

In 2021, Tesla's cash flow was more detailed, with $11.5 billion provided by operating activities, $7.9 billion used in investing activities, and $5.2 billion used in financing activities. In contrast, in 2020, the cash flow was reported as a single figure of $5.94 billion. It's clear that the cash flow in 2021 was higher than in 2020, but the 2021 data provides a more granular view of how the cash flow was distributed across different activities.


In [None]:
response = sub_query_engine.query(
    "Can you compare and contrast the R&D expenditures in 2021 vs. 2020?"
)

In [None]:
print(str(response))

In [None]:
response = sub_query_engine.query(
    "Can you compare and contrast the risk factors in 2021 vs. 2020?"
)
print(str(response))

#### Try Comparing against Baseline

In [128]:
# Baseline: plain vector indexes over the same nodes, without the recursive
# retriever on top.
vector_index_2021 = VectorStoreIndex(nodes_2021)
vector_query_engine_2021 = vector_index_2021.as_query_engine(similarity_top_k=2)
vector_index_2020 = VectorStoreIndex(nodes_2020)
vector_query_engine_2020 = vector_index_2020.as_query_engine(similarity_top_k=2)
# setup base query engine as tool
# NOTE: this rebinds `query_engine_tools`, replacing the list defined for the
# recursive-retriever engines in the earlier cell.
query_engine_tools = [
    QueryEngineTool(
        query_engine=vector_query_engine_2021,
        metadata=ToolMetadata(
            name="tesla_2021_10k",
            description="Provides information about Tesla financials for year 2021",
        ),
    ),
    QueryEngineTool(
        query_engine=vector_query_engine_2020,
        metadata=ToolMetadata(
            name="tesla_2020_10k",
            description="Provides information about Tesla financials for year 2020",
        ),
    ),
]

# Same configuration as sub_query_engine, but over the baseline tools.
base_sub_query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    use_async=True,
)

In [130]:
response = base_sub_query_engine.query(
    "Can you compare and contrast the cash flow in 2021 with 2020?"
)
print(str(response))

Generated 2 sub questions.
[1;3;38;2;237;90;200m[tesla_2021_10k] Q: What was the cash flow of Tesla in 2021?
[0m[1;3;38;2;90;149;237m[tesla_2020_10k] Q: What was the cash flow of Tesla in 2020?
[0m[1;3;38;2;237;90;200m[tesla_2021_10k] A: The cash flow of Tesla in 2021 was not directly mentioned in the provided context information.
[0m[1;3;38;2;90;149;237m[tesla_2020_10k] A: The cash flow of Tesla in 2020 was $5.94 billion.
[0mThe cash flow of Tesla in 2020 was $5.94 billion. However, there is no information available about the cash flow of Tesla in 2021.
