In [306]:
import re
import time
import requests
from bs4 import BeautifulSoup
import pdfkit
import sqlite3
import os 
from datetime import datetime
from sec_cik_mapper import StockMapper


def get_all_filings(soup):
    """Collect the table rows of the listing page that describe a filing.

    A row is treated as a filing row when its text contains the literal
    string "Accession Number".

    Args:
        soup: Parsed listing page; must expose ``findAll``.

    Returns:
        list: The matching ``tr`` elements, in the order ``findAll`` yields them.
    """
    return [
        table_row
        for table_row in soup.findAll("tr")
        if "Accession Number" in table_row.text
    ]


def get_acc_no(filing):
    """Pull the accession number out of a filing row.

    The number is searched for in the text of the row's third ``td`` cell,
    after the label "Accession Number: ", in the 10-2-6 digit format.

    Args:
        filing: A filing row exposing ``findAll``.

    Returns:
        str | None: The accession number, or ``None`` when the cell text
        does not contain one.
    """
    description_cell = filing.findAll('td')[2]
    found = re.search(r"Accession Number: (\d{10}-\d{2}-\d{6})", description_cell.text)
    return found.group(1) if found else None


def get_filing_time(filing):
    """Convert the acceptance timestamp of a filing row to Unix time.

    The timestamp is read from the row's fourth ``td`` cell: child node 0
    holds the date and child node 2 the time (node 1 is skipped, presumably
    a ``<br/>`` separator — confirm against the page markup).

    Args:
        filing: A filing row exposing ``findAll``.

    Returns:
        int: Seconds since the epoch. The parsed datetime is naive, so
        ``timestamp()`` interprets it in the local time zone of the machine
        running the scraper.

    Raises:
        ValueError: If the text does not match ``%Y-%m-%d %H:%M:%S``.
        IndexError: If the row has fewer than four cells or the cell has
            fewer than three child nodes.
    """
    timestamp_cell = filing.findAll('td')[3]

    # Local names avoid shadowing the imported `time` module; stripping
    # tolerates whitespace/newlines surrounding the text nodes.
    date_text = str(timestamp_cell.contents[0]).strip()
    time_text = str(timestamp_cell.contents[2]).strip()

    filed_at = datetime.strptime(date_text + " " + time_text, '%Y-%m-%d %H:%M:%S')
    return int(filed_at.timestamp())


def get_filing_detail_link(filing):
    """Return the absolute URL of the filing's detail page.

    Scans the anchors of *this* row (the previous version always looked at
    the global ``filing_rows[0]``, so every filing got the first row's link)
    and returns the first whose ``href`` contains ".htm".

    Args:
        filing: A filing row exposing ``findAll``.

    Returns:
        str | None: ``"http://sec.gov"`` + href of the first matching
        anchor, or ``None`` when the row has no such anchor.
    """
    for anchor in filing.findAll('a'):
        # Anchors without an href are skipped instead of raising KeyError.
        href = anchor.get('href', '')

        if ".htm" in href:
            return r"http://sec.gov" + href

    return None

        
def get_filing_file_links(filing_div, main_file_type):
    """Collect document links from a filing detail table.

    Walks every non-header row of the first ``table`` inside ``filing_div``
    and reads the document type from the row's fourth ``td`` cell.

    Args:
        filing_div: Element containing the documents table.
        main_file_type: Document type identifying the main file
            (e.g. ``"4"``).

    Returns:
        dict: ``{"main": str, "supporting": list[str]}``. ``"main"`` is the
        link of the first row whose type equals ``main_file_type`` (``""``
        if none); ``"supporting"`` holds the links of all rows with a
        different type. Further rows of the main type are ignored.
    """
    links = {
        "main": "",
        "supporting": [],
    }

    for row in filing_div.find('table').findAll('tr'):
        # Rows with <th> cells are headers and describe no document.
        if row.find('th') is not None:
            continue

        try:
            file_type = row.findAll('td')[3].text
            # A non-breaking space means the row has no document type.
            if file_type == '\xa0':
                continue

            link = "http://sec.gov" + row.find('a')['href']
        except (IndexError, KeyError, TypeError):
            # Best effort: skip rows with fewer than four cells (IndexError),
            # without an anchor (None is not subscriptable -> TypeError) or
            # with an anchor lacking href (KeyError). Anything else is a real
            # bug and is no longer silently swallowed.
            continue

        if file_type != main_file_type:
            links["supporting"].append(link)
        elif links["main"] == "":
            links["main"] = link

    return links
        

def get_filers_data(filers_div):
    """Extract CIK, name, ticker and role for every filer on a detail page.

    Each ``div`` with id ``filerDiv`` is expected to contain a
    ``span.companyName`` whose lower-cased text looks like
    ``"<name> (<role>)\\n cik: <digits> ..."``.

    Args:
        filers_div: Element containing the ``filerDiv`` blocks.

    Returns:
        list[dict]: One dict per filer with the keys ``"cik"``,
        ``"company_name"``, ``"filer_ticker"`` and ``"filer_type"``.
        ``"filer_ticker"`` is ``"FUND"`` when the module-level ``mapper``
        knows no ticker for the CIK; ``"filer_type"`` is ``"unknown"`` when
        the role is not one of reporting/issuer/filer.

    Raises:
        AttributeError: If a filer block has no ``span.companyName`` or its
            text lacks the CIK or the ``"<name> ("`` prefix.
    """
    filers_info = []

    for filer in filers_div.findAll('div', id='filerDiv'):
        # Lower-cased once so the patterns below need no case handling.
        filer_text = filer.find("span", "companyName").text.lower()

        filer_cik = re.search(r"cik:\s+(\d+)", filer_text).group(1)
        filer_name = re.search(r'^(.*?)\s*\(', filer_text).group(1)

        # An unrecognised role used to raise IndexError and abort the whole
        # page; record it as "unknown" instead.
        role_match = re.search(r'\((reporting|issuer|filer)\)', filer_text)
        filer_type = role_match.group(1) if role_match else "unknown"

        try:
            tickers = mapper.cik_to_tickers[filer_cik]
        except KeyError:
            ticker = "FUND"
        else:
            # Take the alphabetically first ticker so the choice is
            # reproducible (the mapped collection is presumably an unordered
            # set — confirm against sec_cik_mapper); empty -> "FUND".
            ticker = min(tickers, default="FUND")

        filers_info.append({
            "cik": filer_cik,
            "company_name": filer_name,
            "filer_ticker": ticker,
            "filer_type": filer_type,
        })

    return filers_info
        
        
        
def get_filing_metadata(filing_detail_link, filing_type):
    """Fetch a filing detail page and extract filers and document links.

    Args:
        filing_detail_link: URL of the filing detail page.
        filing_type: Form type of the filing; used to tell the main
            document from supporting ones.

    Returns:
        tuple: ``(filers_data, filing_file_links)`` as produced by
        ``get_filers_data`` and ``get_filing_file_links``.

    Raises:
        requests.RequestException: On connection problems, timeout, or a
            non-2xx HTTP status.
        IndexError: If the page has fewer than two ``formDiv`` blocks.
    """
    # A timeout keeps the scraper from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors (e.g. 403/429)
    # here rather than as an opaque parsing failure further down.
    response = requests.get(filing_detail_link, headers=headers, timeout=30)
    response.raise_for_status()
    filing_detail_soup = BeautifulSoup(response.text, "html.parser")

    # Container holding both the filers' info and the document tables.
    filing_detail_data = filing_detail_soup.find('div', id='contentDiv')

    # People/entities filing the form or report.
    filers_data = get_filers_data(filing_detail_data)

    # The second formDiv is where the main file and supporting docs are listed.
    filing_forms = filing_detail_data.findAll('div', id='formDiv')[1]
    filing_file_links = get_filing_file_links(filing_forms, filing_type)

    return filers_data, filing_file_links

        
        
        
def handle_form4(form4_filing):
    """Placeholder for Form 4 specific handling; currently does nothing.

    A Form 4 has multiple CIKs associated with it because an individual
    reports for a company, so both the individual's CIK and the company's
    CIK appear on it.

    Args:
        form4_filing: The Form 4 filing (unused for now).

    Returns:
        None
    """
    return None



# Browser-like User-agent header sent with every request made by this notebook.
headers = {
"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

# EDGAR "current filings" listing (count=100 entries per page).
latest_filings_url = \
r"https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=&owner=include&count=100&action=getcurrent"

# Unix time of the newest filing already scraped. It starts at 0 and is not
# updated anywhere in the code below yet.
max_unix = 0
# CIK -> ticker lookup used by get_filers_data.
mapper = StockMapper()

# TODO: the polling loop ("while True") is meant to start here.

latest_filings = requests.get(latest_filings_url, headers=headers).text
filings_soup = BeautifulSoup(latest_filings, "html.parser")

# Each filing is an individual 'tr' tag with metadata such as the accession
# number, the file type and the timestamp. This variable stores all filing
# rows, found by checking whether "Accession Number" is in the row's text.
filing_rows = get_all_filings(filings_soup)

for filing in filing_rows:
    # Convert the filing timestamp (Y-m-d H:M:S) to Unix time for easy
    # comparison. Filings whose Unix time is not greater than max_unix are
    # skipped because they have already been scraped.
    filing_unix_time = get_filing_time(filing)
    
    
    # The filing is not newer than the most recently scraped filing, so it
    # has already been scraped.
    if max_unix >= filing_unix_time:
        continue
        
    # Unique identifier of each filing (Form 4s share one between reporter
    # and issuer).
    filing_acc_no = get_acc_no(filing)
    
    # Read the filing type here since it is easier than on the filing
    # detail page.
    filing_type = filing.findAll('td')[0].text
    
    
    # Each filing on the latest-filings page links to a detail page with
    # the file, its supporting documents and metadata. filing_detail_link
    # points to that page.
    filing_detail_link = get_filing_detail_link(filing)
    
    
    metadata = {
        "unix": filing_unix_time,
        "filing_type": filing_type,
        "accession_no": filing_acc_no
    }
    
    
    # TODO: the steps above could be wrapped in a helper function.
    
    # Request the filing detail page, from which the filers, the supporting
    # docs and the main filing link are scraped.
    metadata["filers"], metadata["filing_links"] = get_filing_metadata(filing_detail_link, filing_type)
    
    
    # TODO: save to the database and download the PDFs.
    # Edge case: when there is both a reporter and an issuer, should the
    # files be combined and the reporter (usually a person) ignored?
    
    # Stops after the first filing that was not skipped.
    break
    
    
    
    # Planned layout: for looking up one company, all files are stored in
    # folders named after the accession number; in the database those
    # accession numbers are tied to the company's CIK as issuer, reporter
    # or something else.
    

In [307]:
metadata

{'unix': 1686621468,
 'filing_type': '4',
 'accession_no': '0000902664-23-003537',
 'filers': [{'cik': '0001510589',
   'company_name': 'hillhouse investment management, ltd.',
   'filer_ticker': 'FUND',
   'filer_type': 'reporting'},
  {'cik': '0001651308',
   'company_name': 'beigene, ltd.',
   'filer_ticker': 'BGNE',
   'filer_type': 'issuer'},
  {'cik': '0001762304',
   'company_name': 'hhlr advisors, ltd.',
   'filer_ticker': 'FUND',
   'filer_type': 'reporting'}],
 'filing_links': {'main': 'http://sec.gov/Archives/edgar/data/1510589/000090266423003537/xslF345X04/ownership.xml',
  'supporting': []}}

In [270]:
def get_filing_file_links(filing_div, main_file_type):
    """Collect document links from a filing detail table.

    Walks every non-header row of the first ``table`` inside ``filing_div``
    and reads the document type from the row's fourth ``td`` cell. Each
    link found is also printed (debugging aid).

    Args:
        filing_div: Element containing the documents table.
        main_file_type: Document type identifying the main file
            (e.g. ``"4"``).

    Returns:
        dict: ``{"main": str, "supporting": list[str]}``. ``"main"`` is the
        link of the first row whose type equals ``main_file_type`` (``""``
        if none); ``"supporting"`` holds the links of all rows with a
        different type.
    """
    links = {
        "main": "",
        "supporting": [],
    }

    for row in filing_div.find('table').findAll('tr'):
        # Rows with <th> cells are headers and describe no document.
        if row.find('th') is not None:
            continue

        try:
            file_type = row.findAll('td')[3].text
            # A non-breaking space means the row has no document type.
            if file_type == '\xa0':
                continue

            link = "http://sec.gov" + row.find('a')['href']
        except (IndexError, KeyError, TypeError):
            # Best effort: skip rows with fewer than four cells, without an
            # anchor, or with an anchor lacking href.
            continue

        print(link)

        if file_type != main_file_type:
            links["supporting"].append(link)
        elif links["main"] == "":
            links["main"] = link

    # Returned only after ALL rows were visited; the return used to sit
    # inside the loop, which ended the scan after the first row.
    return links

In [269]:
links

{'main': 'http://sec.gov/Archives/edgar/data/1510589/000090266423003537/xslF345X04/ownership.xml',
 'supporting': []}

In [259]:
file_type

'\xa0'

In [96]:
# NOTE(review): scratch call from an earlier cell run. It passes a single
# argument, while get_filing_metadata as defined in this notebook takes
# (filing_detail_link, filing_type) and returns a tuple; the following cell
# calls pewee.findAll(...), so the function presumably returned the parsed
# page at that time.
pewee = get_filing_metadata(filing_detail_link)

In [110]:
pewee.findAll('div', id='filerDiv')[1]

<div id="filerDiv">
<div class="mailer">Mailing Address
      <span class="mailerAddress">C/O MOURANT GOVERNANCE SERVICES (CAYMAN)</span>
<span class="mailerAddress">94 SOLARIS AVENUE, CAMANA BAY</span>
<span class="mailerAddress">
GRAND CAYMAN E9 KY1-1108      </span>
</div>
<div class="mailer">Business Address
      <span class="mailerAddress">C/O MOURANT GOVERNANCE SERVICES (CAYMAN)</span>
<span class="mailerAddress">94 SOLARIS AVENUE, CAMANA BAY</span>
<span class="mailerAddress">
GRAND CAYMAN E9 KY1-1108      </span>
<span class="mailerAddress">13459494123</span>
</div>
<div class="companyInfo">
<span class="companyName">BeiGene, Ltd. (<a href="/cgi-bin/own-disp?CIK=0001651308&amp;action=getissuer">Issuer</a>)
 <acronym title="Central Index Key">CIK</acronym>: <a href="/cgi-bin/browse-edgar?CIK=0001651308&amp;action=getcompany">0001651308 (see all company filings)</a></span>
<p class="identInfo"><acronym title="Internal Revenue Service Number">IRS No.</acronym>: <strong>981209416<

In [None]:
class companyName

In [177]:
from sec_cik_mapper import StockMapper


def get_filers_data(filing):
    """Extract CIK, name, ticker and role for every filer on a detail page.

    Each ``div`` with id ``filerDiv`` is expected to contain a
    ``span.companyName`` whose lower-cased text looks like
    ``"<name> (<role>)\\n cik: <digits> ..."``.

    Args:
        filing: Element containing the ``filerDiv`` blocks.

    Returns:
        list[dict]: One dict per filer with the keys ``"cik"``,
        ``"company_name"``, ``"filer_ticker"`` and ``"filer_type"``.
        ``"filer_ticker"`` is ``"FUND"`` when the module-level ``mapper``
        knows no ticker for the CIK; ``"filer_type"`` is ``"unknown"`` when
        the role is not one of reporting/issuer/filer.

    Raises:
        AttributeError: If a filer block has no ``span.companyName`` or its
            text lacks the CIK or the ``"<name> ("`` prefix.
    """
    filers_info = []

    for filer in filing.findAll('div', id='filerDiv'):
        # Lower-cased once so the patterns below need no case handling.
        filer_text = filer.find("span", "companyName").text.lower()

        filer_cik = re.search(r"cik:\s+(\d+)", filer_text).group(1)
        filer_name = re.search(r'^(.*?)\s*\(', filer_text).group(1)

        # Store the role as a single string rather than the list that
        # re.findall returns.
        role_match = re.search(r'\((reporting|issuer|filer)\)', filer_text)
        filer_type = role_match.group(1) if role_match else "unknown"

        try:
            tickers = mapper.cik_to_tickers[filer_cik]
        except KeyError:
            ticker = "FUND"
        else:
            # Alphabetically first ticker for a reproducible choice (the
            # mapped collection is presumably an unordered set — confirm
            # against sec_cik_mapper); empty -> "FUND".
            ticker = min(tickers, default="FUND")

        # The record used to be built but never stored, so the function
        # always returned an empty list.
        filers_info.append({
            "cik": filer_cik,
            "company_name": filer_name,
            "filer_ticker": ticker,
            "filer_type": filer_type,
        })

    return filers_info

In [None]:
get_filers_data(filing)

In [164]:
re.search(r'^(.*?)\s*\(', 'HHLR ADVISORS, LTD. (Reporting)\n CIK: 0001762304 (see all company filings)').group(1)

'HHLR ADVISORS, LTD.'

In [132]:
filer_info

'hillhouse investment management, ltd. (reporting)\n cik: 0001510589 (see all company filings)'

In [123]:
filer.find("span", "companyName").text

'HHLR ADVISORS, LTD. (Reporting)\n CIK: 0001762304 (see all company filings)'

In [127]:
re.search(r"CIK:\s+(\d+)", filer.find("span", "companyName").text).group(1)

'0001762304'

In [130]:
filer.find("span", "companyName").text

'HHLR ADVISORS, LTD. (Reporting)\n CIK: 0001762304 (see all company filings)'

In [150]:
from sec_cik_mapper import StockMapper

In [159]:
StockMapper().cik_to_company_name["0001510589"]

KeyError: '0001510589'

In [147]:
from secedgar.cik_lookup import CIKLookup

# Scratch experiment: build a secedgar CIK lookup for a single CIK. The
# user_agent value looks like a placeholder ("Name (email)") — replace it
# with real contact details before relying on this.
lookups = CIKLookup(['0001510589'],
                    user_agent="Name (email)")

In [148]:
lookups.lookup_dict 

{'0001510589': '0001510589'}

In [None]:
#Forms 3 and 4 are filed separately for the issuer and the reporter.
#However, both filings share the same accession number and files; only the name of the filer differs.
#So we can perhaps group them as one filing.

In [81]:
metadata

{'unix': 1686621468,
 'filing_type': '4',
 'accession_no': '0000902664-23-003537'}