In [1]:
!pip install sec-api edgartools transformers faiss-cpu langchain openai


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [4]:
import os
from sec_api import QueryApi, PdfGeneratorApi
from edgar import set_identity, Company
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI



In [3]:
!pip install -U langchain langchain-community openai faiss-cpu transformers sec-api edgartools


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting openai
  Downloading openai-1.99.1-py3-none-any.whl.metadata (29 kB)
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.meta

In [2]:
import os


In [4]:
!pip install -U langchain langchain-community langchain-openai openai faiss-cpu transformers sec-api edgartools


Collecting langchain-openai
  Downloading langchain_openai-0.3.28-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.28-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.28


In [5]:
# ✅ STEP 2: Imports and API key setup

# Core Python
import os

# SEC API
from sec_api import QueryApi, PdfGeneratorApi

# Edgar tools for direct SEC filing access
from edgar import Company, set_identity

# Transformers for chunking
from transformers import AutoTokenizer

# LangChain (modularized)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI  # For LLM (GPT-4/GPT-3.5)

# ✅ API Keys
os.environ["SEC_API_KEY"] = "b9f76665f53682b7da6bff4cb468be38705151d832310eb94f9a86deb3a6bd11"
api_key = os.environ["SEC_API_KEY"]

# ✅ Email identity for edgartools (SEC.gov requires this)
set_identity("Your Name <ranismriti435@gmail.com>")


In [6]:
query_api = QueryApi(api_key=api_key)

def fetch_filing_urls(form_type, start_date, end_date, ticker=None):
    query_parts = [f'formType:"{form_type}"', f'filedAt:[{start_date} TO {end_date}]']
    if ticker:
        query_parts.append(f'ticker:"{ticker}"')

    query = " AND ".join(query_parts)
    filings = []
    from_idx = 0
    size = 50

    while True:
        params = {"query": query, "from": str(from_idx), "size": str(size)}
        resp = query_api.get_filings(params)
        batch = resp.get("filings", [])
        if not batch:
            break
        filings.extend(batch)
        from_idx += size

    return [f["linkToFilingDetails"] for f in filings]


In [7]:
urls = fetch_filing_urls("10-K", "2022-01-01", "2023-12-31", ticker="AAPL")
print("Filing URLs:", urls[:3])  # Show top 3


Filing URLs: ['https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm', 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm']


In [10]:
# ✅ STEP 5 (Corrected): Get text from latest 10-K

c = Company("AAPL")
filing = c.latest("10-K")

# Use .text() instead of .view()
text = filing.text()

# Check if text was retrieved
if text:
    print(text[:2000])  # Preview
else:
    print("❌ Filing text could not be loaded.")


                                                                                                                   
                                                   UNITED STATES                                                   
                                                                                                                   
                                                                                                                   
                                        SECURITIES AND EXCHANGE COMMISSION                                         
                                                                                                                   
                                                                                                                   
                                              Washington, D.C. 20549                                               
                                                                        

In [14]:
# Get latest 10-K filing for Apple
from edgar import Company

c = Company("AAPL")
filing = c.latest("10-K")

# This returns a list of section texts
sections = filing.sections()

# Loop and print first 500 characters of each section
for idx, section_text in enumerate(sections):
    print(f"Section {idx + 1}")
    print(section_text[:500])  # Show start of section
    print("-" * 80)


Section 1
aapl-20240928
--------------------------------------------------------------------------------
Section 2

--------------------------------------------------------------------------------
Section 3
UNITED STATES
--------------------------------------------------------------------------------
Section 4
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
--------------------------------------------------------------------------------
Section 5

--------------------------------------------------------------------------------
Section 6
FORM 10-K
--------------------------------------------------------------------------------
Section 7

--------------------------------------------------------------------------------
Section 8
(Mark One)
--------------------------------------------------------------------------------
Section 9
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended September 28, 2024
or
-----------------

In [15]:
# Search for section containing 'Risk Factors'
risk_section = next((s for s in sections if "risk factors" in s.lower()), None)

if risk_section:
    print("📘 Found Risk Factors section:")
    print(risk_section[:1500])
else:
    print("❌ Risk Factors section not found.")


📘 Found Risk Factors section:
Page
Part I                                                                                                                                       
Item 1.        Business                                                                                                                     1
Item 1A.       Risk Factors                                                                                                                 5
Item 1B.       Unresolved Staff Comments                                                                                                   17
Item 1C.       Cybersecurity                                                                                                               17
Item 2.        Properties                                                                                                                  18
Item 3.        Legal Proceedings                                                                                 

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
max_tokens = 512

chunks = []

# Suppose paragraphs is a list that may contain non-strings
paragraphs = [...]  # your extracted paragraphs

for para in paragraphs:
    # Check the type first:
    if not isinstance(para, (str, list)):
        print(f"Skipping non-string paragraph of type {type(para)}")
        continue

    # If para is list of strings, join into one string
    if isinstance(para, list):
        para = " ".join(para)

    # Now para is a string, safe to tokenize
    toks = tokenizer(para, return_length=True, truncation=False)
    length = toks["length"]  # int

    if length <= max_tokens:
        chunks.append({"text": para, "meta": {"ticker": "AAPL", "form": "10-K", "year": "2022"}})
    else:
        # Handle longer text by further splitting, if needed
        pass


Skipping non-string paragraph of type <class 'ellipsis'>


In [24]:
# Example raw_text (from your SEC filing scraping/parsing)
raw_text = """Your SEC filing raw text goes here..."""

# Simple paragraph splitting (split by double newlines or other delimiters)
paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]

print(f"Total paragraphs extracted: {len(paragraphs)}")

# Chunking paragraphs with metadata
chunks = []
max_chunk_size = 500  # characters or tokens, adjust as needed

for para in paragraphs:
    if len(para) <= max_chunk_size:
        chunks.append({
            "text": para,
            "meta": {
                "ticker": "AAPL",
                "form": "10-K",
                "year": "2022"
            }
        })
    else:
        # Further split long paragraphs if needed (simple split by sentences or fixed window)
        sentences = para.split('. ')
        temp_chunk = ""
        for sentence in sentences:
            if len(temp_chunk) + len(sentence) < max_chunk_size:
                temp_chunk += sentence + ". "
            else:
                chunks.append({
                    "text": temp_chunk.strip(),
                    "meta": {
                        "ticker": "AAPL",
                        "form": "10-K",
                        "year": "2022"
                    }
                })
                temp_chunk = sentence + ". "
        if temp_chunk:
            chunks.append({
                "text": temp_chunk.strip(),
                "meta": {
                    "ticker": "AAPL",
                    "form": "10-K",
                    "year": "2022"
                }
            })

print(f"Total chunks created: {len(chunks)}")


Total paragraphs extracted: 1
Total chunks created: 1


In [28]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [29]:
from transformers import AutoTokenizer, AutoModel

model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print("Model downloaded and cached.")


Model downloaded and cached.


In [31]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

if not chunks:
    raise ValueError("Chunks list is empty! Cannot create embeddings.")

# Try a faster or cached model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

texts = [chunk["text"] for chunk in chunks]
metadatas = [chunk["meta"] for chunk in chunks]

print(f"Number of chunks to embed: {len(texts)}")

embeddings = embedding_model.embed_documents(texts)
print(f"Generated {len(embeddings)} embeddings.")

text_embedding_pairs = list(zip(texts, embeddings))

vectorstore = FAISS.from_embeddings(text_embedding_pairs, embedding_model, metadatas=metadatas)

print("FAISS vectorstore created successfully!")


Number of chunks to embed: 1
Generated 1 embeddings.
FAISS vectorstore created successfully!


In [35]:
import os
os.environ["OPENAI_API_KEY"] = "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"  # your actual key here


In [36]:
import os
print(os.getenv("OPENAI_API_KEY"))


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


In [39]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})


In [40]:
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA

llm = OpenAI(temperature=0)
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})  # smaller k to reduce calls

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

question = "What are Apple's main risk factors in its 2022 10-K filing?"
answer = qa_chain.invoke({"query": question})

print("Answer:", answer)


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [41]:
#The RateLimitError 429 means your OpenAI API quota is exhausted or your usage limit is reached. This is not a code error but a usage/billing issue.