In [5]:
import numpy as np
from bs4 import BeautifulSoup, Comment
import requests
import re
import copy
import webscraper
header = {"User-Agent": "bzhang14@umd.edu"}

In [33]:
def json_submission_scraper(cik:str, form_type:str):
    
    """
    Scrape The JSON File That Has All The Information About Files That Have Been Submitted To EDGAR.
    """

    header = {"User-Agent": "bzhang14@umd.edu"}
    url_to_json = generate_json_submission_url(cik)

    req = requests.get(url_to_json, headers=header)
    submissions_json = req.json()

    recent_filings = submissions_json["filings"]["recent"]  # recent filings

    # the non-recent filings are not stored in the same JSON file so we need to
    # get those from another JSON file that is stored in the one that was just requested
    nonrecent_filings_name = submissions_json["filings"]["files"][0]["name"]
    nonrecent_filings_url = generate_json_submission_url(is_recent_submission=False,
                                                          name=nonrecent_filings_name)
    
    req = requests.get(nonrecent_filings_url, headers=header)
    non_recent_submissions_json = req.json()
    non_recent_filings = non_recent_submissions_json

    recent_filing_dates, recent_path_to_documents = find_forms(recent_filings, form_type)
    nonrecent_filing_dates, nonrecent_path_to_documents = find_forms(non_recent_filings, form_type)

    print(type(recent_path_to_documents))
    print(type(nonrecent_path_to_documents))

    path_to_documents = np.concatenate((recent_path_to_documents, nonrecent_path_to_documents))
    filing_dates = np.concatenate((recent_filing_dates, nonrecent_filing_dates))

    urls = generate_EDGAR_directory_listing_url(cik, path_to_documents)
    print(urls)

    return urls

def find_forms(filings: list[str], form_type:str):
    
    forms = np.array(filings["form"])
    accession_numbers = np.array([filing + "/" for filing in filings["accessionNumber"]])
    document_names = np.array(filings["primaryDocument"])
    filing_dates = np.array(filings["filingDate"])

    indexes = np.where(forms == form_type)
    form_accession_numbers = accession_numbers[indexes]
    form_document_names = document_names[indexes]

    form_accession_numbers = np.char.replace(form_accession_numbers, "-", "")
    path_to_documents = np.char.add(form_accession_numbers, form_document_names)

    return filing_dates, path_to_documents

    
def generate_json_submission_url(cik:str = "", is_recent_submission:bool = True, 
                                    name:str = None):
    
    base_url = "https://data.sec.gov/submissions/"
    
    cik_length = len(cik)
    num_leading_zeros = 10 - cik_length
    leading_zeros = num_leading_zeros * "0"
    leading_zeros_cik = leading_zeros + cik

    if is_recent_submission:                                    # the url is different for submisisons within 10 years
        url_to_json = f"{base_url}CIK{leading_zeros_cik}.json"  # and submissions over 10 years
    else:
        url_to_json = f"{base_url}/{name}"

    return url_to_json
    

def generate_EDGAR_directory_listing_url(cik: str, path_to_documents: np.ndarray):
    base_url = "https://www.sec.gov/Archives/edgar/data/"
    base_url_cik = f"{base_url}{cik}/"
    base_url_cik_list = np.array(base_url_cik)
    url_to_files = np.char.add(base_url_cik_list, path_to_documents)

    return url_to_files


In [34]:
test = json_submission_scraper("1397187", "10-K")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
['https://www.sec.gov/Archives/edgar/data/1397187/000139718723000012/lulu-20230129.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718722000014/lulu-20220130.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718721000009/lulu-20210131.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718720000012/lulu-20200202x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718719000011/lulu-20190203x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718718000013/lulu-20180128x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718717000008/lulu-20170129x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718716000089/lulu-20160131x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718715000016/lulu-20150201x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/1397187/000139718714000021/lulu-20140202x10k.htm'
 'https://www.sec.gov/Archives/edgar/data/13

In [10]:
test = requests.get("https://data.sec.gov/submissions/CIK0001397187.json", headers=header)
json_object = test.json()
filings = json_object["filings"]
recent = filings["recent"]
files = filings["files"]
print(recent.keys())



dict_keys(['accessionNumber', 'filingDate', 'reportDate', 'acceptanceDateTime', 'act', 'form', 'fileNumber', 'filmNumber', 'items', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocument', 'primaryDocDescription'])


In [16]:
forms = np.array(recent["form"])
accessionNumber = recent["accessionNumber"]
list_of_indicies = np.where(forms == "10-K")[0]

for index in list_of_indicies:
    print(accessionNumber[index])
    

0001397187-23-000012
0001397187-22-000014
0001397187-21-000009
0001397187-20-000012
0001397187-19-000011
0001397187-18-000013
0001397187-17-000008
0001397187-16-000089
0001397187-15-000016
0001397187-14-000021
0001193125-13-118393
0001193125-12-126444
0000950123-11-026220
0000950123-10-028033


In [2]:
str = "0001397187-20-000014"
test = str.replace("-","")
print(test)

000139718720000014
