In [64]:
# Imports for this cell, gathered in one place.
from tqdm.autonotebook import tqdm
import os

import pinecone
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Load environment variables (python-dotenv) before reading any keys.
load_dotenv()

# Initialise the Pinecone client and open the "edgar" index.
pinecone_key = os.environ["PINECONE_API_KEY"]
pinecone_env = os.environ["PINECONE_ENV"]
pinecone.init(api_key=pinecone_key, environment=pinecone_env)
index = pinecone.Index("edgar")

# OpenAI embedding client; its embed_query method is handed to the vector store.
openai_key = os.environ["OPENAI_API_KEY"]
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

# Wrap the index as a LangChain vector store. Arguments are positional:
# the index, the query-embedding callable, and the key 'text'
# (presumably the metadata field holding each chunk's text -- confirm).
vectorstore = Pinecone(index, embeddings.embed_query, 'text')
# vectorstore = Pinecone.from_existing_index(index, embeddings)

# Fetch the two nearest documents for the question and print them.
question = "how was tesla's revenue in 2023 Q1 compared with that in 2022 Q1?"
docs = vectorstore.similarity_search(question, k=2)
print(docs)

[Document(page_content='33\n\nAutomotive sales revenue increased $11.48 billion, or 65%, in the six months ended June 30, 2022 as compared to the six months ended June 30, 2021, primarily due to an increase of 156,279 Model 3 and Model Y cash deliveries, and an increase of 22,963 Model S and Model X cash deliveries year over year. This was achieved from production ramping of Model Y at Gigafactory Shanghai and the Fremont Factory as well as the start of production at Gigafactory Berlin-Brandenburg and Gigafactory Texas in 2022, at a higher combined average selling price from a higher proportion of Model Y sales offset by regional sales mix. There was also an increase in production and an increase in the average selling price of Model S and Model X with a higher proportion of Model X sales, compared to the prior period as deliveries of the new versions of Model S and Model X only began ramping in the second and fourth quarters of 2021, respectively.\n\nAutomotive regulatory credits reve

In [66]:
# Display the metadata dict of the second document (index 1) in `docs`,
# the list returned by the k=2 similarity search.
docs[1].metadata

{'form_type': '10-Q',
 'report_date': 20160331.0,
 'source': 'https://www.sec.gov/Archives/edgar/data/1318605/000156459016018886/tsla-10q_20160331.htm',
 'symbol': 'TSLA'}

In [28]:
import re

# Sample EDGAR submission header used to exercise the patterns below.
txt = """
<SEC-DOCUMENT>0001564590-22-026876.txt : 20220728
<SEC-HEADER>0001564590-22-026876.hdr.sgml : 20220728
<ACCEPTANCE-DATETIME>20220728160619
ACCESSION NUMBER: 0001564590-22-026876
CONFORMED SUBMISSION TYPE: 10-K
PUBLIC DOCUMENT COUNT: 134
CONFORMED PERIOD OF REPORT: 20220630
FILED AS OF DATE: 20220728
DATE AS OF CHANGE: 20220728
FILER:
COMPANY DATA:
COMPANY CONFORMED NAME: MICROSOFT CORP
CENTRAL INDEX KEY: 0000789019
STANDARD INDUSTRIAL CLASSIFICATION: SERVICES-PREPACKAGED SOFTWARE [7372]
IRS NUMBER: 911144442
STATE OF INCORPORATION: WA
FISCAL YEAR END: 0630
FILING VALUES:
FORM TYPE: 10-K
SEC ACT: 1934 Act
SEC FILE NUMBER: 001-37845
FILM NUMBER: 221115247
BUSINESS ADDRESS:
STREET 1: ONE MICROSOFT WAY
CITY: REDMOND
STATE: WA
ZIP: 98052-6399
BUSINESS PHONE: 425-882-8080
MAIL ADDRESS:
STREET 1: ONE MICROSOFT WAY
CITY: REDMOND
STATE: WA
ZIP: 98052-6399
</SEC-HEADER>
<DOCUMENT>
<TYPE>10-K
<SEQUENCE>1
<FILENAME>msft-10k_20220630.htm
<DESCRIPTION>10-K
<TEXT>
"""


def extract_field(pattern, text):
    """Return capture group 1 of the first match of ``pattern`` in ``text``.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.

    Returns:
        The text captured by group 1 of the first match, or ``None`` when
        the pattern does not match anywhere in ``text``.
    """
    match = re.search(pattern, text)
    return match.group(1) if match else None


# Search for CIK (leading zeros are dropped by the 0* outside the group)
cik_pattern = r"CENTRAL INDEX KEY:\s*0*(\d+)"
cik = extract_field(cik_pattern, txt)

# Search for ACCESSION NUMBER (dashes are stripped after matching)
accession_pattern = r"ACCESSION NUMBER:\s*([\d-]+)"
accession = extract_field(accession_pattern, txt)
accession = accession.replace('-', '') if accession else None

# Search for FILENAME
# `html?` keeps a trailing "l" so "x.html" is not silently cut to "x.htm";
# \w already covers digits and underscore, so only "-" is added to the class.
filename_pattern = r"<FILENAME>([\w-]+\.html?)"
filename = extract_field(filename_pattern, txt)

# NOTE: any field that was not found is rendered as the literal "None" here.
updated_filename = f"{cik},{accession},{filename}"
print(updated_filename)

# Search for REPORT DATE
report_date_pattern = r"CONFORMED PERIOD OF REPORT:\s*(\d+)"
report_date = extract_field(report_date_pattern, txt)
print(report_date)


789019,000156459022026876,msft-10k_20220630.htm
20220630


In [29]:
from updaters.edgar_updater import process_reports

# The two list arguments are named for readability (presumably filing form
# types and ticker symbols -- confirm against process_reports); all four
# arguments are still passed positionally in the original order.
form_types = ["10-K", "10-Q"]
symbols = ["AAPL"]
process_reports(form_types, symbols, "2022-01-01", "./docs/history")



In [31]:
from tqdm.autonotebook import tqdm
import os

import pinecone
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings

# Load environment variables (python-dotenv), then read the Pinecone settings.
load_dotenv()
pinecone_key, pinecone_env = os.environ["PINECONE_API_KEY"], os.environ["PINECONE_ENV"]

pinecone.init(api_key=pinecone_key, environment=pinecone_env)

# WARNING: destructive -- sends a deleteAll request against the "edgar" index.
# Kept as the cell's last expression so the response is displayed.
index = pinecone.Index("edgar")
index.delete(deleteAll='true')

{}

In [53]:
import os

import chromadb
from chromadb.config import Settings

# Example setup of the client to connect to your chroma server.
# The server address can be overridden with the CHROMA_SERVER_HOST and
# CHROMA_SERVER_HTTP_PORT environment variables so it does not have to be
# edited in the notebook; when they are unset, the original values are used.
chroma_host = os.environ.get("CHROMA_SERVER_HOST", "54.241.40.222")
chroma_port = os.environ.get("CHROMA_SERVER_HTTP_PORT", 8000)
client = chromadb.Client(
    Settings(
        chroma_api_impl="rest",
        chroma_server_host=chroma_host,
        chroma_server_http_port=chroma_port,
    )
)

In [59]:
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

load_dotenv()

# Embedding function backed by OpenAI's "text-embedding-ada-002" model;
# the API key is read from the environment.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-ada-002",
)

# Create the "edgar" collection on the Chroma server, attaching the
# embedding function and the {"hnsw:space": "cosine"} metadata.
collection = client.create_collection(
    name="edgar",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},
)

In [60]:
# Look up the existing "edgar" collection, passing the same embedding
# function (`openai_ef`) that is used elsewhere in this notebook.
collection = client.get_collection(
    name="edgar",
    embedding_function=openai_ef,
)

In [62]:
# Print the count explicitly: only a cell's last expression is displayed,
# so a bare `collection.count()` above the delete call would be discarded.
print(collection.count())

# Drop the "edgar" collection from the Chroma server.
client.delete_collection(name="edgar")