In [30]:
import re
import time
import requests
from bs4 import BeautifulSoup
import openai
import pdfkit
import sqlite3
import os 
from datetime import datetime

def get_all_filings(soup):
    #Go through all filings with html text and accerssion
    filings = []
    for row in soup.findAll("tr"):
        if "Accession Number" in row.text:
            filings.append(row)
            
    return filings


def get_acc_no(text):
    # Extract the accession number using regex
    match = re.search(r"Accession Number: (\d{10}-\d{2}-\d{6})", text)
    if match:
        accession_number = match.group(1)
        return (accession_number)
    
    
def get_filing_metadata(filing):
    for links in filing.findAll('a'):
        href = links['href']

        if ".htm" in href:
            #"click" on link (just request that link)
            x = requests.get(r"http://sec.gov" + href, headers=headers)

            
    soup = BeautifulSoup(x.text, "html.parser")
    cik = re.search(r"CIK:\s+(\d+)", soup.text).group(1)
    
    for _ in soup.findAll('a'):
        if "ix?doc=" in _['href']:
            partial_link = _['href'].split("/ix?doc=")[-1]

            filing_link = "http://sec.gov" + partial_link
            
            return filing_link, cik
        
        
def get_filing_time(filing):
    time_data = filing.findAll('td')[3]
    date = time_data.contents[0]
    time = time_data.contents[2]
    
    datetime_obj = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
    unix_time = int(datetime_obj.timestamp())
    
    return unix_time
        
    
def get_filing(filing_link):
    raw_filing = BeautifulSoup(requests.get(filing_link, headers=headers).text, "html.parser").find("body").text
    filing = clean_filing(raw_filing)
    
    return filing
        
        
def clean_filing(raw_filing):
    filing = raw_filing.replace("\n", " ").replace("\xa0", " ").strip()
    filing = " ".join(filing.split())


    filing = "UNITED STATES SECURITIES AND EXCHANGE COMMISSION" + \
            filing.split("UNITED STATES SECURITIES AND EXCHANGE COMMISSION")[-1]
    
    return filing.lower()


def is_filing_merger(filing_text):
    #Need to make more solid determination
    if "merger" not in filing_text:
        return False
    
    if "item 1.01".lower() in filing_text:
        return True
    
    if "item 7.01".lower() in filing_text:
        return True
    
    return False

    
headers = {
"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

base_url = r"https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=8-K&owner=include&count=100&action=getcurrent"


current_dir = os.getcwd() + "\\" 


path_wkhtmltopdf = current_dir.split("scraper")[0] + "wkhtmltopdf\\bin\\wkhtmltopdf.exe"
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

DB_PATH = current_dir.split("scraper")[0] + "database\\filing_data.sqlite3"

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

latest_8k_filings = requests.get(base_url, headers=headers).text
soup = BeautifulSoup(latest_8k_filings, "html.parser")

#Iterate through all filings on page
filings = get_all_filings(soup)

x = time.time()


cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
seen = cursor.fetchall()
all_accession_numbers = {row[0] for row in seen}
max_unix_number = max({row[1] for row in seen})

for filing in filings:
    filing_acc_no = get_acc_no(filing.findAll('td')[2].text)
    
    if filing_acc_no in all_accession_numbers:
        continue
    
    filing_link, company_cik = get_filing_metadata(filing)
    filing_time = get_filing_time(filing)
    
    
    if max_unix_number > filing_time:
        continue
    
    
    filing_text = get_filing(filing_link)
    
    if is_filing_merger(filing_text.lower()):
        print(True, filing_link)

        
        try:
            #Store metadata to db
            cursor.execute("INSERT INTO data (accession_no,cik, unix_number) VALUES (?, ?, ?)",
                           (filing_acc_no, company_cik, filing_time))

            # Commit the changes to the database
            conn.commit()
            
        except:
            continue
        
        
        #Save as pdf to 8k folder
        filings_path = current_dir.split("scraper")[0] + "8ks"
        filing_path = filings_path + filing_acc_no + ".pdf"
        pdfkit.from_url(filing_link, filing_path, configuration=config)
        
        
    try:
        cursor.execute("INSERT INTO seen_filings (accession_no, unix_number) VALUES (?, ?)",
                   (filing_acc_no, filing_time))

        conn.commit()
        
    except:
        continue
        

cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
seen = cursor.fetchall()
all_accession_numbers = {row[0] for row in seen}
max_unix_number = max({row[1] for row in seen})
    
conn.close()
    
time.time() - x

True http://sec.gov/Archives/edgar/data/878828/000149315223020861/form8-k.htm
True http://sec.gov/Archives/edgar/data/1826667/000110465923070148/tm2318362d1_8k.htm
True http://sec.gov/Archives/edgar/data/1435049/000119312523164891/d511815d8k.htm
True http://sec.gov/Archives/edgar/data/1120193/000119312523164839/d476077d8k.htm
True http://sec.gov/Archives/edgar/data/1854583/000121390023047889/ea180116-8k425_abrispac1.htm
True http://sec.gov/Archives/edgar/data/1281845/000149315223020769/form8-k.htm


163.22192072868347

In [21]:
requests.get(r"http://127.0.0.1:8000/data/").text

'{"item_0": {"accession_no": "text1", "cik": "text2", "unix_number": 123}, "item_1": {"accession_no": "12412", "cik": "214", "unix_number": 643}}'

In [15]:
import os
os.getcwd().split("scraper")[0] + "8ks"

'C:\\Users\\sbuca\\Desktop\\code_post_grad\\merger_arb\\8ks'

In [31]:
max_unix_number

1686581368

In [26]:

cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
seen = cursor.fetchall()
all_accession_numbers = {row[0] for row in seen}
max_unix_number = max({row[1] for row in seen})

In [27]:
seen

[('12412', 643)]

In [28]:
max_unix_number

643

In [38]:
import re
import time
import requests
from bs4 import BeautifulSoup
import openai
import pdfkit
import sqlite3
import os 
from datetime import datetime

def get_all_filings(soup):
    #Go through all filings with html text and accerssion
    filings = []
    for row in soup.findAll("tr"):
        if "Accession Number" in row.text:
            filings.append(row)
            
    return filings


def get_acc_no(text):
    # Extract the accession number using regex
    match = re.search(r"Accession Number: (\d{10}-\d{2}-\d{6})", text)
    if match:
        accession_number = match.group(1)
        return (accession_number)
    
    
def get_filing_metadata(filing):
    for links in filing.findAll('a'):
        href = links['href']

        if ".htm" in href:
            #"click" on link (just request that link)
            x = requests.get(r"http://sec.gov" + href, headers=headers)

            
    soup = BeautifulSoup(x.text, "html.parser")
    cik = re.search(r"CIK:\s+(\d+)", soup.text).group(1)
    
    for _ in soup.findAll('a'):
        if "ix?doc=" in _['href']:
            partial_link = _['href'].split("/ix?doc=")[-1]

            filing_link = "http://sec.gov" + partial_link
            
            return filing_link, cik
        
        
def get_filing_time(filing):
    time_data = filing.findAll('td')[3]
    date = time_data.contents[0]
    time = time_data.contents[2]
    
    datetime_obj = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
    unix_time = int(datetime_obj.timestamp())
    
    return unix_time
        
    
def get_filing(filing_link):
    raw_filing = BeautifulSoup(requests.get(filing_link, headers=headers).text, "html.parser").find("body").text
    filing = clean_filing(raw_filing)
    
    return filing
        
        
def clean_filing(raw_filing):
    filing = raw_filing.replace("\n", " ").replace("\xa0", " ").strip()
    filing = " ".join(filing.split())


    filing = "UNITED STATES SECURITIES AND EXCHANGE COMMISSION" + \
            filing.split("UNITED STATES SECURITIES AND EXCHANGE COMMISSION")[-1]
    
    return filing.lower()


def is_filing_merger(filing_text):
    #Need to make more solid determination
    if "merger" not in filing_text:
        return False
    
    if "item 1.01".lower() in filing_text:
        return True
    
    if "item 7.01".lower() in filing_text:
        return True
    
    return False

    
headers = {
"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

base_url = r"https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=8-K&owner=include&count=100&action=getcurrent"


current_dir = os.getcwd() + "\\" 


path_wkhtmltopdf = current_dir.split("scraper")[0] + "wkhtmltopdf\\bin\\wkhtmltopdf.exe"
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

DB_PATH = current_dir.split("scraper")[0] + "database\\filing_data.sqlite3"

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()


while True:
    latest_8k_filings = requests.get(base_url, headers=headers).text
    soup = BeautifulSoup(latest_8k_filings, "html.parser")

    #Iterate through all filings on page
    filings = get_all_filings(soup)

    x = time.time()


    cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
    seen = cursor.fetchall()
    all_accession_numbers = {row[0] for row in seen}
    max_unix_number = max({row[1] for row in seen})

    for filing in filings:
        filing_acc_no = get_acc_no(filing.findAll('td')[2].text)

        if filing_acc_no in all_accession_numbers:
            continue

        filing_link, company_cik = get_filing_metadata(filing)
        filing_time = get_filing_time(filing)


        if max_unix_number > filing_time:
            continue


        filing_text = get_filing(filing_link)

        if is_filing_merger(filing_text.lower()):
            print(True, filing_link)


            try:
                #Store metadata to db
                cursor.execute("INSERT INTO data (accession_no,cik, unix_number) VALUES (?, ?, ?)",
                               (filing_acc_no, company_cik, filing_time))

                # Commit the changes to the database
                conn.commit()

            except:
                continue


            #Save as pdf to 8k folder
            filings_path = current_dir.split("scraper")[0] + "8ks\\"
            filing_path = filings_path + filing_acc_no + ".pdf"
            pdfkit.from_url(filing_link, filing_path, configuration=config)


        try:
            cursor.execute("INSERT INTO seen_filings (accession_no, unix_number) VALUES (?, ?)",
                       (filing_acc_no, filing_time))

            conn.commit()

        except:
            continue


    cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
    seen = cursor.fetchall()
    all_accession_numbers = {row[0] for row in seen}
    max_unix_number = max({row[1] for row in seen})

KeyboardInterrupt: 

In [36]:
max_unix_number

1686584304

In [39]:
filings_path

'C:\\Users\\sbuca\\Desktop\\code_post_grad\\merger_arb\\8ks'

In [40]:
filing_path

'C:\\Users\\sbuca\\Desktop\\code_post_grad\\merger_arb\\8ks0001493152-23-020769.pdf'

In [41]:

    cursor.execute("SELECT accession_no, unix_number FROM seen_filings")
    seen = cursor.fetchall()
    all_accession_numbers = {row[0] for row in seen}
    max_unix_number = max({row[1] for row in seen})

TypeError: '>' not supported between instances of 'int' and 'str'

In [43]:
conn.close()