# Step 1: Install Required Libraries

In [None]:
!pip install -q sec-edgar-downloader sentence-transformers faiss-cpu langchain gradio requests

# Step 2: Import Dependencies

In [21]:

from sec_edgar_downloader import Downloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
import requests
import json
from openai import OpenAI
from google.colab import userdata

# Step 3: Get OPENAI API KEY

In [37]:
# The client will auto-read the from google collab secrets
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# Step 4: Create Helper Functions

In [114]:
# Download SEC Filings
def download_sec_filings(comp):
    dl = Downloader("My Company", "your-email@example.com")
    dl.get("10-K", comp, limit=2)  # Apple filings
    return [open(f"sec-edgar-filings/AAPL/10-K/0000320193-23-000106/full-submission.txt").read()]

# Preprocess Text into Chunks
def chunk_text(docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " "]
    )
    return text_splitter.split_text(docs[0])

# Create Embeddings & Vector DB
def create_vector_db(texts):
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(texts)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, model

# QA System with DeepSeek with RAG
# def ask_question_deepseek(question, index, texts):
#     # Retrieve relevant chunks
#     query_embedding = model.encode([question])[0]
#     _, indices = index.search(np.array([query_embedding]), 3)
#     context = "\n".join([texts[i] for i in indices[0]])

#     # Construct DeepSeek API prompt
#     messages = [{
#         "role": "user",
#         "content": f"Answer this question based on SEC filings context:\n\nContext: {context}\n\nQuestion: {question}"
#     }]

#     # Call DeepSeek API
#     data = {
#         "model": "deepseek-chat",
#         "messages": messages,
#         "temperature": 0.3,
#         "max_tokens": 150
#     }

#     response = requests.post(DEEPSEEK_API_URL, headers=HEADERS, json=data)
#     if response.status_code == 200:
#         return response.json()["choices"][0]["message"]["content"]
#     else:
#         return f"Error: {response.text}"


# ========== QA SYSTEM with openAI ==========
def ask_question_openai(question, max_tokens):
    """Retrieve context and generate answer"""
    # Encode question
    query_embedding = model.encode([question])[0]

    # Generate answer
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Question: {question}"
            }],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"API Error: {str(e)}"


# ========== QA SYSTEM with openAI with Rag ==========
def ask_question_openai_rag(question, max_tokens, index, texts, top_n=3):
    """Retrieve context and generate answer"""
    # Encode question
    query_embedding = model.encode([question])[0]

    # Retrieve top 3 chunks
    _, indices = index.search(np.array([query_embedding]), top_n)
    context = "\n".join([texts[i] for i in indices[0]])

    # Generate answer
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Answer based on SEC context:\nContext: {context}\nQuestion: {question}"
            }],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"API Error: {str(e)}"


# Step 5: Download SEC Filings
- For this project I chose Apple

In [115]:
sec_docs = download_sec_filings("AAPL")

# Step 6: Split text into chunks
- Useful to retrieve the most relevant documents based on similarity using RAG
- Pirticularly useful if you are dealing pages worth of documents

In [116]:
text_chunks = chunk_text(sec_docs)

In [10]:
print(text_chunks[0])

<SEC-DOCUMENT>0000320193-23-000106.txt : 20231103
<SEC-HEADER>0000320193-23-000106.hdr.sgml : 20231103
<ACCEPTANCE-DATETIME>20231102180827
ACCESSION NUMBER:		0000320193-23-000106
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		96
CONFORMED PERIOD OF REPORT:	20230930
FILED AS OF DATE:		20231103
DATE AS OF CHANGE:		20231102

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			Apple Inc.
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0930

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		231373899

	BUSINESS ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014


# Step 7: Creating a Vector Database

In [11]:

faiss_index, model = create_vector_db(text_chunks)


🔧 Creating vector database...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Step 8: Builiding Pipeline Functions

In [112]:
#### Step 8 build Pipeline Functions

def test_qa_pipeline(max_tokens=1000, rag=False):
    test_questions = [
        "What was Apple's total revenue in 2023?",
        "What are the main risk factors mentioned?",
        "Who is Apple's CEO?",
        "What does the company say about iPhone sales?",
        "How much cash reserves does Apple have?"
    ]

    print("\n🔍 Running Test Cases...")
    for i, question in enumerate(test_questions, 1):
        print(f"\nTest Case {i}: {question}")

        if rag:
            # RAG approach
            try:
              answer = ask_question_openai_rag(question, max_tokens, faiss_index, text_chunks)
              print(f"Answer: {answer}\n")
              print("-" * 80)
            except Exception as e:
              print(f"Error: {str(e)}")


        else:
            # Default
            try:
              answer = ask_question_openai(question, max_tokens)
              print(f"Answer: {answer}\n")
              print("-" * 80)
            except Exception as e:
              print(f"Error: {str(e)}")



def test_qa_user_input(question, max_tokens=1000, top_n=3, rag=False, return_ans=False):

    if rag:
        # RAG approach
        try:
          answer = ask_question_openai_rag(question, max_tokens, faiss_index, text_chunks, top_n)
          print(f"Answer: {answer}\n")
          print("-" * 80)
        except Exception as e:
          print(f"Error: {str(e)}")


    else:
        # Default
        try:
          answer = ask_question_openai(question, max_tokens)
          print(f"Answer: {answer}\n")
          print("-" * 80)
        except Exception as e:
          print(f"Error: {str(e)}")


    if return_ans:
      return answer


# Step 9: Testing of Sample Questions

## Without Context from RAG

In [53]:
test_qa_pipeline()


🔍 Running Test Cases...

Test Case 1: What was Apple's total revenue in 2023?
Answer: I'm sorry, but I don't have access to the most recent financial data, including Apple's total revenue for 2023. I would recommend checking the latest financial statements on Apple's investor relations website or reputable financial news sources for this information.

--------------------------------------------------------------------------------

Test Case 2: What are the main risk factors mentioned?
Answer: I'm sorry, but your question is vague and does not specify the context or subject for which you are asking about "main risk factors." Could you please provide more details or specify the context, such as a particular disease, investment, environmental situation, or any other area of interest?

--------------------------------------------------------------------------------

Test Case 3: Who is Apple's CEO?
Answer: As of October 2023, Apple's CEO is Tim Cook.

------------------------------------

## With Context from RAG

Top 3 text chunks

In [51]:
# Run automated tests
test_qa_pipeline(rag=True)


🔍 Running Test Cases...

Test Case 1: What was Apple's total revenue in 2023?
Answer: The context provided does not include the specific total revenue figure for Apple in 2023. It discusses various aspects of Apple's financial performance, such as changes in sales of different product lines and deferred revenue figures, but it does not state the total revenue for the year 2023. To find the exact total revenue for 2023, you would need to refer to Apple's complete financial statements or annual report for that year.

--------------------------------------------------------------------------------

Test Case 2: What are the main risk factors mentioned?
Answer: The main risk factors mentioned in the context are:

1. **Industrial Accidents**: These can lead to serious injuries, loss of life, disruption of the company's business, and harm to its reputation.

2. **Major Public Health Issues**: Pandemics, such as COVID-19, can adversely affect the company due to their impact on the global eco

# Step 10: Testing on User Question

## Without Context from RAG

In [89]:
test_qa_user_input('Can you summarize the SEC fillings for APPLE in the year 2023')

Answer: As of my last update, I can't provide real-time or detailed summaries of specific SEC filings for companies like Apple in 2023. However, I can guide you on how to access and interpret these filings.

1. **Accessing SEC Filings:**
   - Visit the [SEC's EDGAR database](https://www.sec.gov/edgar/searchedgar/companysearch.html).
   - Enter "Apple Inc." or its ticker symbol "AAPL" in the company search field.
   - You will be presented with a list of Apple's filings, including annual reports (10-K), quarterly reports (10-Q), and other relevant documents.

2. **Types of Key Filings:**
   - **10-K:** The annual report that provides a comprehensive overview of the company's financial performance.
   - **10-Q:** Quarterly reports that include unaudited financial statements and provides a continuing view of the company's financial position.
   - **8-K:** Current reports filed to announce major events that shareholders should know about.
   - **Proxy Statements (DEF 14A):** Information re

In [91]:
test_qa_user_input('How did APPLE perform in the year 2023?')

Answer: I'm unable to provide real-time data or details about Apple's performance in 2023, as my information is up-to-date only until October 2023 and I don't have access to real-time databases. For the most current and accurate information regarding Apple's performance in that year, including financial reports, stock performance, and other relevant metrics, it's best to consult financial news websites, the company's official investor relations page, or market analysis platforms.

--------------------------------------------------------------------------------


## Wit Context from RAG

In [93]:
test_qa_user_input('Can you summarize the SEC fillings for APPLE in the year 2023', rag=True)

Answer: The SEC filings for Apple Inc. for the year 2023 indicate that the company provided comprehensive financial statements, including balance sheets as of September 30, 2023, and September 24, 2022. These filings also included consolidated statements of operations, comprehensive income, shareholders' equity, and cash flows for each of the three years ending September 30, 2023. 

The financial statements are stated to fairly present, in all material respects, the financial position of Apple Inc. for the dates mentioned, and the results of its operations and cash flows, in line with U.S. generally accepted accounting principles (GAAP). The filings confirm compliance with the Sarbanes-Oxley Act of 2002 and ensure the information in the Annual Report on Form 10-K for the fiscal year ending September 30, 2023, aligns with Sections 13(a) or 15(d) of the Securities Exchange Act of 1934.

Additionally, Apple's fiscal year consists of 52 or 53 weeks, ending on the last Saturday of September

In [92]:
test_qa_user_input('How did APPLE perform in the year 2023?', rag=True)

Answer: In the year 2023, Apple experienced a mixed financial performance:

1. **Total Net Sales:** The company's total net sales decreased by 3% or $11.0 billion compared to 2022. This decline was primarily due to lower net sales of Mac and iPhone models, although it was somewhat offset by increased sales of Pro iPhone models and higher net sales in Services.

2. **iPhone Sales:** There was a decline in net sales of non-Pro iPhone models, but this was partially offset by higher net sales of Pro iPhone models, resulting in a net sales decrease of $4.9 billion in this category.

3. **Mac Sales:** Mac net sales saw a significant decrease of 27%, amounting to a $10.8 billion reduction, primarily due to lower sales of laptops.

4. **Currency Impact:** The overall decrease in net sales was affected by the weakness of foreign currencies relative to the U.S. dollar, which contributed more than the entire year-over-year drop.

5. **Services:** While specific figures weren't provided, it was no

# Step 11 : Changing Generating Parameters

In [94]:
test_qa_user_input('Can you summarize the SEC fillings for APPLE in the year 2023', max_tokens=1000, top_n=10, rag=True)

Answer: In 2023, Apple Inc. filed an Annual Report on Form 10-K for the fiscal year ending September 30, 2023. This report was filed with the SEC on November 3, 2023, following the requirements of the Securities Exchange Act of 1934. The document includes Apple's consolidated financial statements for the periods ended September 30, 2023, and September 24, 2022. These filings contain balance sheets, statements of operations, comprehensive income, shareholders' equity, cash flows, and the accompanying notes.

The audit of Apple's financial statements and internal controls over financial reporting was conducted in accordance with the standards of the Public Company Accounting Oversight Board (PCAOB). Ernst & Young LLP, the registered public accounting firm, expressed an unqualified opinion, stating that the financial statements present fairly, in all material respects, Apple's financial position, results of operations, and cash flows in conformity with U.S. generally accepted accounting p

In [84]:
test_qa_user_input('How did APPLE perform in the year 2023? can you give some solid financial numbers in bullet points', max_tokens=1000, top_n=10, rag=True)

Answer: Based on the provided context, here are some of the key financial highlights for Apple Inc. for the year 2023:

- Total net sales decreased by 3% or $11.0 billion compared to 2022. 
- iPhone net sales decline is primarily attributed to lower net sales of non-Pro iPhone models, partially offset by higher sales of Pro models.
- Mac net sales decreased by 27%, amounting to a $10.8 billion decline, primarily due to lower laptop sales.
- Currency exchange rate fluctuations accounted for more than the entire year-over-year decrease in total net sales.
- As of September 30, 2023, the Company reported $12.1 billion in total deferred revenue, with expectations of realization as follows:
  - 67% in less than a year.
  - 25% within one to two years.
  - 7% within two to three years.
  - 1% in more than three years.

These financial figures indicate some declines in specific areas such as Mac and iPhone sales, with some challenges posed by unfavorable currency exchange rates.

------------