In [11]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

In [12]:
financial_facts = pd.read_csv("../data/processed/financial_facts_clean.csv")

In [13]:
def financial_row_to_doc(row):
    content = (
        f"In fiscal year {row['fy']}, {row['entityName']} reported {row['companyFact']} "
        f"of {row['val']} {row['units']}, as disclosed in its {row['form']} filing."
    )
    metadata = {
        "cik": row["cik"],
        "company": row["entityName"],
        "line_item": row["companyFact"],
        "fiscal_year": row["fy"],
        "form": row["form"],
        "filing_date": row["filed"]
    }
    return Document(page_content=content, metadata=metadata)

documents = [financial_row_to_doc(row) for _, row in financial_facts.iterrows()]

In [14]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents, embeddings)

In [15]:
def retrieve_docs(vectorstore, query, company=None, fiscal_year=None, k=5):
    docs = vectorstore.similarity_search(query, k=k)
    if company:
        docs = [d for d in docs if d.metadata["company"] == company]
    if fiscal_year:
        docs = [d for d in docs if d.metadata["fiscal_year"] == fiscal_year]
    return docs

In [16]:
query = "What were Morgan Stanley's total assets in 2011?"
company = "MORGAN STANLEY"
fiscal_year = 2011

retrieved_docs = retrieve_docs(vectorstore, query, company=company, fiscal_year=fiscal_year, k=5)

for d in retrieved_docs:
    print(d.page_content)

In fiscal year 2011, MORGAN STANLEY reported Assets of 749898000000 USD, as disclosed in its 10-K filing.
In fiscal year 2011, MORGAN STANLEY reported Assets of 807698000000 USD, as disclosed in its 10-K filing.
