## Step 1: Get Index, Scrape Data Since Last Time We Scraped

In [1]:
import edgar
import pandas as pd 

In [2]:
# Never truncate cell text when a DataFrame is displayed (None = no width limit).
pd.options.display.max_colwidth = None

In [3]:

# `ua` is passed as the third argument of edgar.download_index below; the
# placeholder shows what it is meant to contain.
# NOTE(review): `ua` is never assigned in the visible notebook (this line is
# commented out), so the call below raises NameError on a fresh kernel.
#ua = [First Name, Last Name, email]

# Download the index files into the local StockBot_Indices folder.
# NOTE(review): 2023 is presumably the first year to fetch, and
# skip_all_present_except_last presumably re-downloads only the newest file;
# confirm both against the python-edgar documentation.
edgar.download_index('/Users/rileybitterli/Desktop/StockBot_Data/StockBot_Indices', 2023, ua, skip_all_present_except_last=True)



In [4]:
#Get Current Year and Quarter for daily scraping purposes
from datetime import datetime

def get_current_year_and_quarter(current_date=None):
    """Return the calendar year and quarter label for a date.

    Args:
        current_date: Optional ``datetime``/``date`` to evaluate. Defaults to
            ``datetime.now()``, so existing zero-argument calls behave as
            before. Passing a date makes the function deterministic.

    Returns:
        tuple: ``(year, quarter)`` where ``year`` is an int and ``quarter`` is
        one of ``"QTR1"``, ``"QTR2"``, ``"QTR3"`` or ``"QTR4"``.
    """
    if current_date is None:
        current_date = datetime.now()

    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    quarter_number = (current_date.month - 1) // 3 + 1
    return current_date.year, f"QTR{quarter_number}"

# Resolve today's year and quarter once, then echo both for the run log.
year, quarter = get_current_year_and_quarter()
print(f"year = {year}", f"quarter = {quarter}", sep="\n")


year = 2024
quarter = QTR3


In [5]:
import re

# Index file for the current year/quarter inside the StockBot_Indices folder.
file_path = f'/Users/rileybitterli/Desktop/StockBot_Data/StockBot_Indices/{year}-{quarter}.tsv'

# Recover the filing year and quarter digit from the "<year>-QTR<n>" part of the path.
match = re.search(r'(\d{4})-QTR(\d)', file_path)

if match is None:
    # Stop here instead of only printing: continuing would leave filing_year /
    # filing_qtr undefined (or stale from an earlier run of this cell).
    raise ValueError("Pattern not found in the provided string")

filing_year = int(match.group(1))
filing_qtr = 'Q' + match.group(2)  # prepend 'Q' to the quarter number, e.g. 'Q3'

print(file_path)

/Users/rileybitterli/Desktop/StockBot_Data/StockBot_Indices/2024-QTR3.tsv


In [6]:
# The index file is pipe-delimited; column names are supplied here rather than
# read from the file.
index_df = pd.read_csv(file_path, delimiter='|', names=('CIK', 'Name', 'Form_Type', 'Date', 'URL_TXT', 'URL_HTML'))

# Keep only Form 4 rows. .copy() makes the result an independent frame so the
# column assignments in later cells do not raise SettingWithCopyWarning.
index_df = index_df[index_df['Form_Type'] == '4'].copy()

In [7]:
# Add the absolute filing URL, then renumber the rows from 0. reset_index()
# keeps the previous row labels in a new leading 'index' column.
index_df = index_df.assign(
    FileURL='https://sec.gov/Archives/' + index_df['URL_HTML']
).reset_index()

In [8]:
def read_last_date(file_path):
    """Return the first line of ``file_path`` with surrounding whitespace removed.

    Args:
        file_path: Path of the text file holding the last run date.

    Returns:
        The stripped first line as a string (``''`` for an empty file), or
        ``None`` when the file does not exist.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        # No file yet; the caller receives None.
        return None

    with handle:
        first_line = handle.readline()
    return first_line.strip()

def write_last_date(file_path, date):
    """Overwrite ``file_path`` so that it contains exactly ``date``.

    Args:
        file_path: Path of the text file to create or truncate.
        date: String to store; it is written as-is, without a trailing newline.
    """
    with open(file_path, mode='w') as date_file:
        date_file.write(date)


In [14]:
# Text file holding the date of the previous run; it is overwritten each day the script runs.
date_file_path = '/Users/rileybitterli/Documents/GitHub/SB_VIII_Streamlined/Daily_Workflow/Temp_Files/last_run_date.txt'

# Ensure the 'Date' column is datetime so it can be compared with the stored date.
index_df['Date'] = pd.to_datetime(index_df['Date'])

# Read the last stored date (None when the file does not exist) and convert it to datetime.
last_stored_date = pd.to_datetime(read_last_date(date_file_path))

# First run or empty file: the value is None / NaT, which would either raise in
# the comparison below or silently match nothing, and would break the later
# strftime call. Fall back to the earliest date in the index so every listed
# filing is kept.
if pd.isna(last_stored_date):
    last_stored_date = index_df['Date'].min()

# Keep only the filings dated on or after the last stored date.
since_last_run = index_df[index_df['Date'] >= last_stored_date]

# Now, since_last_run contains only the rows with Date >= last_stored_date


In [15]:
# Inspect the cutoff date the filter above was applied with.
last_stored_date

Timestamp('2024-07-29 00:00:00')

In [16]:
# `since_last_run` now holds the filings (and their URLs) dated on or after the last stored date.
# The stored date itself is not rewritten yet; that happens after scraping, via write_last_date.

In [17]:
from selenium import webdriver
import concurrent.futures
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup 
from random import uniform
from time import sleep
import lxml
from bs4.formatter import HTMLFormatter
import pandas as pd
import pickle
import os
import smtplib
from email.message import EmailMessage
import threading
from multiprocessing import Pool
import more_itertools

In [18]:
# create a function to send you emails- when the job completes, errors, etc.
def send_email(subject, content):
    """Send a plain-text notification email through smtp.gmail.com.

    Args:
        subject: Subject line of the message.
        content: Plain-text body of the message.

    Raises:
        Any exception raised by ``smtplib`` while connecting, negotiating TLS
        or sending propagates to the caller.
    """
    msg = EmailMessage()
    msg.set_content(content)
    msg['Subject'] = subject
    msg['From'] = 'halpitsstockbot@gmail.com'
    msg['To'] = 'riley.bitterli@gmail.com'

    # The context manager closes the connection on every path; previously
    # server.quit() was skipped whenever starttls() or send_message() raised.
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.starttls()
        #server.login([insert email address], [insert password])
        server.send_message(msg)


In [57]:
# Ask webdriver_manager for a chromedriver; the value install() returns is used
# as the driver path for the Selenium Service created below.
chrome_path = ChromeDriverManager().install()

In [59]:
# Chromedriver edge case: the path may name the THIRD_PARTY_NOTICES file instead
# of the driver binary. str.replace leaves the path untouched when that marker
# is absent, so no explicit membership check is needed.
chrome_path = chrome_path.replace("THIRD_PARTY_NOTICES.chromedriver", "chromedriver")

In [60]:
# Selenium Service built from the chromedriver path; scrape_chunk passes it to
# webdriver.Chrome for every browser it starts.
s = Service(chrome_path)


In [61]:
# Command-line switches applied to every Chrome instance started by scrape_chunk.
chrome_options = Options()
for _chrome_flag in (
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-renderer-backgrounding",
    "--incognito",
    "--disable-gpu",
    "--enable-accelerated-2d-canvas",
    "--ignore-gpu-blacklist",
    "--no-sandbox",  # This can help performance but reduces security
):
    chrome_options.add_argument(_chrome_flag)

## Step 2: Cut Insider Trading Links into Chunks for MultiProcessing

In [62]:
# Pair every filing URL with its filing date.
url_date_pairs = list(zip(since_last_run['FileURL'], since_last_run['Date']))

# Maximum number of URL/date pairs handed to one worker call.
chunk_size = 100

# Slice the pairs at every chunk_size-th position and number the slices from 0.
chunk_starts = range(0, len(url_date_pairs), chunk_size)
chunks = [
    (chunk_index, url_date_pairs[start:start + chunk_size])
    for chunk_index, start in enumerate(chunk_starts)
]

# `chunks` is a list of (index, pairs) tuples, where pairs holds up to 100 URL-Date pairings.


In [63]:
# Number of chunks, i.e. how many scrape_chunk calls will be submitted.
len(chunks)

89

In [64]:
# Cutoff date as YYYY-MM-DD text; it becomes the first half of the output folder name.
last_pull_string = last_stored_date.strftime("%Y-%m-%d")

In [65]:
# Timestamp of this run and its YYYY-MM-DD form (second half of the output folder name).
current_date = datetime.now()
date_of_scrape = f"{current_date:%Y-%m-%d}"

In [66]:
# Inspect the date string used for this run's output folder.
date_of_scrape

'2024-08-08'

In [67]:
# Output folder for this run's pickles, named "<last pull>--><scrape date>".
folder_path = f"/Users/rileybitterli/Documents/GitHub/SB_VIII_Streamlined/Daily_Workflow/Daily_Scrape_Pickles/{last_pull_string}-->{date_of_scrape}/"

# Report an existing folder; otherwise create it (including missing parents).
if os.path.exists(folder_path):
    print(f"Folder already exists: {folder_path}")
else:
    os.makedirs(folder_path)
    print(f"Folder created: {folder_path}")

Folder already exists: /Users/rileybitterli/Documents/GitHub/SB_VIII_Streamlined/Daily_Workflow/Daily_Scrape_Pickles/2024-07-29-->2024-08-08/


In [69]:
## This the biggest lift- we're multiprocessing the links to speed up pull time, pulling data we need, and saving it

import threading
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from random import uniform
import pandas as pd
import pickle
import concurrent.futures

# Assuming 's', 'send_email', 'filing_year', 'filing_qtr', and 'chunks' are defined elsewhere

# For thread safety
# Shared counter that gives each scrape_chunk call its own batch number; it is a
# one-element list so the workers can increment it in place.
batch_number_counter = [0]  # Using a list so it's mutable
# Held while a worker reads and increments batch_number_counter.
batch_number_lock = threading.Lock()

def scrape_chunk(chunk_number, chunk):
    """Scrape one chunk of filing URLs and pickle the qualifying Table I frames.

    For each ``(url, date)`` pair the page at ``url`` is opened, the first link
    whose href contains ``'xml'`` is followed, and every table on that page is
    parsed with ``pandas.read_html``. A Table I frame is kept when its ``code``
    column contains at least one ``'P'`` or ``'S'``. Kept frames are written to
    ``{folder_path}batch{batch_number}.pkl``.

    Relies on module-level names: ``s``, ``chrome_options``, ``folder_path``,
    ``send_email``, ``re``, ``batch_number_counter`` and ``batch_number_lock``.

    Args:
        chunk_number: Index of this chunk; only used in the progress output.
        chunk: Iterable of ``(url, filing_date)`` pairs.

    Raises:
        Exception: Any error other than ``IndexError``, ``ValueError`` or a
            "Document is empty" failure is reported by email (best effort) and
            re-raised. The browser is shut down in every case.
    """
    # Local import: pandas deprecates passing literal HTML to read_html, so the
    # page source is wrapped in a file-like object instead.
    from io import StringIO

    checkpoint_every = 1000  # URLs processed between intermediate saves
    necessary_dfs = []

    # Setup a new webdriver instance
    driver = webdriver.Chrome(service=s, options=chrome_options)
    try:
        driver.delete_all_cookies()
        index = 0
        url_counter = 0

        # Claim a unique batch number; it names this call's output pickle.
        with batch_number_lock:
            batch_number = batch_number_counter[0]
            batch_number_counter[0] += 1
        batch_file = f"{folder_path}batch{batch_number}.pkl"

        for url, date in chunk:
            index += 1
            # Reset per filing so a page without a "[TICKER]" cell can neither
            # reuse the previous filing's values nor hit an unbound local.
            ticker = None
            name = None
            try:
                sec = uniform(.4, 1)
                print("I slept for:", round(sec, 2), "seconds, and my index is:", index, "url counter is:", url_counter, 'batch number', batch_number, "chunk_number:", chunk_number)
                driver.get(url)
                sleep(sec)
                web_elements = driver.find_elements(By.TAG_NAME, "a")
                hrefs = (we.get_attribute("href") for we in web_elements)
                links = [href for href in hrefs if href]
                html_links = [link for link in links if 'xml' in link]

                if html_links:
                    driver.get(html_links[0])
                    full_page = driver.page_source.replace('%', '')
                    dfs = pd.read_html(StringIO(full_page))

                    for df in dfs:
                        try:
                            # Ensure we're working with a string for the regex search
                            first_item = str(df.iloc[0, 1]) if not df.empty and df.columns.size > 1 else ""
                            bracket_match = re.search(r'\[(.*?)\]', first_item)
                            if bracket_match:
                                ticker = bracket_match.group(1)  # value between the brackets
                                name = df.iloc[1, 1]
                        except Exception:
                            # Best effort: a table without these cells is simply
                            # not the one carrying the ticker and name.
                            pass

                        if isinstance(df.columns, pd.MultiIndex):
                            for col in df.columns.levels[0]:
                                if col.startswith('Table I - Non-Derivative Securities Acquired, Disposed of, or Beneficially Owned'):
                                    # Drop the two outer header levels, keeping the innermost labels.
                                    df.columns = df.columns.droplevel(0).droplevel(0)
                                    df.rename(columns={
                                            '1. Title of Security (Instr. 3)': 'security_type',
                                            '2. Transaction Date  (Month/Day/Year)': 'date',
                                            '2A. Deemed Execution Date, if any  (Month/Day/Year)': 'execution_date',
                                            'Code': 'code',
                                            'V': 'v',
                                            '(A) or (D)': 'a_or_d',
                                            'Amount': 'amount',
                                            '5. Amount of Securities Beneficially Owned Following Reported Transaction(s) (Instr. 3 and 4)': 'total_owned_after_trans',
                                            '6. Ownership Form: Direct (D) or Indirect (I) (Instr. 4)': 'direct_or_indirect'
                                        }, inplace=True)

                                    df['URL'] = url
                                    df['Filing Date'] = date
                                    df['Ticker'] = ticker
                                    df['Name'] = name

                                    if 'code' in df.columns and df['code'].isin(['P', 'S']).any():
                                        necessary_dfs.append(df)

                                    # The header has been flattened; a second
                                    # matching label must not drop levels again.
                                    break

                    url_counter += 1
                    if url_counter == checkpoint_every:
                        # Intermediate save of everything collected so far. The
                        # list is kept, so the final dump to the same file is a
                        # superset and no longer discards these frames.
                        with open(batch_file, "wb") as file:
                            pickle.dump(necessary_dfs, file)
                        url_counter = 0

            except IndexError as ie:
                print(f"Skipped URL due to error at index:{index}. Error: {str(ie)}")
                continue

            except ValueError:
                # Skipped silently, as before.
                continue

            except Exception as e:
                if "Document is empty" in str(e):
                    print(f"Skipped URL due to error at index:{index}. Error: {str(e)}")
                    continue
                try:
                    send_email('Error in Jupyter Notebook', f"Broke at index:{index} An error occurred: {str(e)}")
                except Exception as mail_error:
                    # A failed notification must not hide the scraping error.
                    print(f"Could not send failure email: {mail_error}")
                raise

        if necessary_dfs:
            with open(batch_file, "wb") as file:
                pickle.dump(necessary_dfs, file)
        print('all done')
    finally:
        # quit() ends the whole browser session, not just the current window,
        # and runs on the error path too so no browser is left behind.
        driver.quit()

if __name__ == '__main__':
    # Run at most three scrape_chunk calls at a time, one submission per chunk.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(scrape_chunk, chunk[0], chunk[1]) for chunk in chunks]
        concurrent.futures.wait(futures)

    # wait() only blocks; it never re-raises what a worker raised. Inspect each
    # future so a failed chunk stops the notebook before the last-run date is
    # advanced in a later cell.
    failed_chunks = []
    for (chunk_number, _pairs), future in zip(chunks, futures):
        error = future.exception()
        if error is not None:
            failed_chunks.append(chunk_number)
            print(f"chunk {chunk_number} failed: {error!r}")
    if failed_chunks:
        raise RuntimeError(f"{len(failed_chunks)} chunk(s) failed: {failed_chunks}")


I slept for: 0.69 seconds, and my index is: 1 url counter is: 0 batch number 0 chunk_number: 2
I slept for: 0.82 seconds, and my index is: 1 url counter is: 0 batch number 1 chunk_number: 1
I slept for: 0.57 seconds, and my index is: 1 url counter is: 0 batch number 2 chunk_number: 0



KeyboardInterrupt



In [26]:
# once the scraping is done, re-set the most recent date to be the day of the scraping (today)
from datetime import datetime
# current_date is kept as YYYY-MM-DD text: it is stored in the date file here
# and interpolated into the completion email at the end of the notebook.
current_date = f"{datetime.now():%Y-%m-%d}"
write_last_date(date_file_path, current_date)

In [27]:
# Inspect the output folder; this is the path written to the hand-off file below.
folder_path

'/Users/rileybitterli/Documents/GitHub/SB_VIII_Streamlined/Daily_Workflow/Daily_Scrape_Pickles/2024-06-25-->2024-06-25/'

In [28]:
# create temp file to reference in next step
# Hand-off file read by the next step. It gets its own name so `file_path`
# (the index file path used to build index_df) is not overwritten; re-running
# the earlier read_csv cell would otherwise load this text file.
step_handoff_path = "/Users/rileybitterli/Documents/GitHub/SB_VIII_Streamlined/Daily_Workflow/Temp_Files/step_1_to_2.txt"


# create or overwrite the file and write the output folder path to it
with open(step_handoff_path, "w") as handoff_file:
    handoff_file.write(folder_path)


In [29]:
# Completion notice. The space after "for" was missing, which rendered the body
# as e.g. "step 1 of daily for2024-08-08".
send_email('completed', f'step 1 of daily for {current_date}')