# Scraping Capitol Trades


## Import Modules

In [2]:
import pandas as pd  # For handling data in DataFrame
from selenium import webdriver  # For controlling the web browser and interacting with HTML content
from selenium.webdriver.chrome.service import Service  # For setting up ChromeDriver as a service
from selenium.webdriver.common.by import By  # To locate HTML elements by XPath
from selenium.webdriver.chrome.options import Options  # For handling Chrome options
from webdriver_manager.chrome import ChromeDriverManager  # For automatic ChromeDriver installation
import time  # For handling wait times
import random  # For randomizing scroll and wait times
import os
from datetime import datetime

### Set Up Paths

In [3]:
home_path = os.getcwd()
home_path = os.path.dirname(home_path)
home_path += "\\"

data_path = home_path + "02 Data Files\\"

print(data_path)

c:\Users\rjrul\OneDrive - University of Iowa\00 Current Semester\01 BAIS 3250 - Data Wrangling\05 Final Project\02 Data Files\


## Set Up Scraping Browser

In [4]:
def get_browser():# Set up ChromeDriver using webdriver_manager
    chrome_options = Options()  # Initialize Chrome options (optional)

    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-images")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-javascript")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")

    service = Service(ChromeDriverManager().install())  # Install and set up ChromeDriver as a service

    # Create a ChromeDriver instance
    browser = webdriver.Chrome(service=service, options=chrome_options)

    return browser

In [5]:
# Function to scroll down the page randomly
def random_scroll(browser, total_wait_time=3):
    # Get the total height of the page
    total_height = browser.execute_script("return document.body.scrollHeight")
    
    # Number of steps to scroll (adjust this number for finer control)
    scroll_steps = random.randint(3, 7)
    
    # Height to scroll per step
    scroll_increment = total_height // scroll_steps
    
    # Time to wait per step
    time_per_step = total_wait_time / scroll_steps
    
    # Scroll in steps
    for step in range(scroll_steps):
        # Scroll down
        browser.execute_script(f"window.scrollBy(0, {scroll_increment});")
        # Wait for a random time between each scroll
        time.sleep(random.uniform(0.5 * time_per_step, 1.5 * time_per_step))
    
    # Ensure we scroll to the bottom of the page
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

## Scrape Data

In [6]:
def scrape(start, end, subset=1):
    url = 'https://www.capitoltrades.com/trades?page='

    #Initialize Next Flag and Page Counter
    next_flag = True
    page_counter = start #Start Page
    first_page = page_counter
    max_pages = end-start 

    max_pages += page_counter

    browser = get_browser()

    browser.get(url+str(page_counter))
    browser.minimize_window()
    # Wait a random time between 1-5 seconds before starting
    time.sleep(random.uniform(1, 5))

    month_dict = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sept": 9, "Oct": 10, "Nov": 11, "Dec": 12
    }



    #Master Dictionary to Track all Records
    master_dict = {}

    #Initialize Timing
    end_time = datetime.now()
    start_time = datetime.now()

    while next_flag == True and page_counter <= max_pages:
        #Look for Page Limit
        scrapetime = (end_time - start_time).total_seconds()
        start_time = datetime.now()
        

        if page_counter == first_page:
            p = browser.find_element(By.CSS_SELECTOR, ".hidden.leading-7.sm\\:block")
            bs = p.find_elements(By.TAG_NAME, "b")
            page_limit = bs[1].text
            page_limit = int(page_limit)
            page_limit = min(max_pages, page_limit)
            scrapetime = 0

        

        print(f"\nScraping Subset #{subset}, Page {page_counter-first_page+1} out of {page_limit-first_page+1} total pages ({round(scrapetime,1)}s)...") #Communicate to Terminal

        #Scrape

        try:
            table = browser.find_element(By.CSS_SELECTOR, ".w-full.caption-bottom.text-size-3.text-txt")

            body = table.find_element(By.TAG_NAME, "tbody")

            records = body.find_elements(By.TAG_NAME, "tr")

            for i, record in enumerate(records):
                record_dict = {}
                id = str(page_counter)+"-"+str(i)

                tds = record.find_elements(By.TAG_NAME, "td")
                
                
                #Scrape Name
                div1 = tds[0].find_element(By.TAG_NAME, "div")
                div2 = div1.find_element(By.TAG_NAME, "div")
                div3 = div2.find_element(By.TAG_NAME, "div")
                h2 = div3.find_element(By.TAG_NAME, "h2")
                a = h2.find_element(By.TAG_NAME, "a")
                name = a.text

                #Scrape Party, Chamber, State
                div4 = div3.find_element(By.TAG_NAME, "div")
                spans = div4.find_elements(By.TAG_NAME, "span")
                party = spans[0].text
                chamber = spans[1].text
                state = spans[2].text

                #Scrape Traded Ticker
                div1 = tds[1].find_element(By.TAG_NAME, "div")
                div2 = div1.find_element(By.TAG_NAME, "div")
                div3 = div2.find_element(By.TAG_NAME, "div")
                div4 = div3.find_element(By.TAG_NAME, "div")
                span = div4.find_element(By.TAG_NAME, "span")
                ticker = span.text.split(":")[0]

                #Scrape Trade Date
                div1 = tds[3].find_element(By.TAG_NAME, "div")
                div2 = div1.find_element(By.TAG_NAME, "div")
                divs = div2.find_elements(By.TAG_NAME, "div")
                day = divs[0].text.split(" ")[0]
                month = divs[0].text.split(" ")[1]
                month = month_dict[month]

                year = divs[1].text
                date = str(year)+"-"+str(month)+"-"+str(day)

                #Scrape Trade Type
                div1 = tds[6].find_element(By.TAG_NAME, "div")
                span = div1.find_element(By.TAG_NAME, "span")
                buy = span.text.lower() == 'buy'
                sell = span.text.lower() == 'sell'
                
                
                
                #Store Record Data in Record Dictionary
                record_dict["Name"] = name
                record_dict["Party"] = party
                record_dict["Chamber"] = chamber
                record_dict["State"] = state
                record_dict["Ticker"] = ticker
                record_dict["Date"] = date
                record_dict["Buy"] = buy
                record_dict["Sell"] = sell


                #Add Record to Master Dictionary
                master_dict[id] = record_dict
                
        except Exception as e:
            print("Error... Moving On")
            print(e)
            
        next_flag = page_counter<page_limit

        if next_flag == True:
            #Go to Next Page
            page_counter += 1 #Track Page
            browser.get(url+str(page_counter))
            random_scroll(browser)
        else:
            #Communicate to Terminal and End
            print("\nNo Next Page")

        time.sleep(random.uniform(.25, 1.5))
        end_time = datetime.now()

    #Close automated browser
    browser.close()

    df = pd.DataFrame.from_dict(master_dict, orient='Index')
    df = df.reset_index(drop=True)

    #df["Date_As_DateTime"] = pd.to_datetime(df["Date"])

    df.to_csv(data_path+"02 Trade Scraping\\"+f"transaction_log_scraped_subset{subset}.csv", index=False)

    df = 0

In [7]:
start = 1
end = 3000
step = 100

count = 15

curr_end = 1500


while curr_end < end:
    count += 1
    curr_start = curr_end+1
    scrape(curr_start, min(curr_start+step,end),count)
   
    curr_end += step






Scraping Subset #16, Page 1 out of 101 total pages (0s)...

Scraping Subset #16, Page 2 out of 101 total pages (7.5s)...

Scraping Subset #16, Page 3 out of 101 total pages (7.7s)...

Scraping Subset #16, Page 4 out of 101 total pages (21.8s)...

Scraping Subset #16, Page 5 out of 101 total pages (7.4s)...

Scraping Subset #16, Page 6 out of 101 total pages (7.8s)...

Scraping Subset #16, Page 7 out of 101 total pages (8.7s)...

Scraping Subset #16, Page 8 out of 101 total pages (10.6s)...

Scraping Subset #16, Page 9 out of 101 total pages (8.5s)...

Scraping Subset #16, Page 10 out of 101 total pages (8.5s)...

Scraping Subset #16, Page 11 out of 101 total pages (9.3s)...

Scraping Subset #16, Page 12 out of 101 total pages (25.0s)...

Scraping Subset #16, Page 13 out of 101 total pages (9.4s)...

Scraping Subset #16, Page 14 out of 101 total pages (26.0s)...

Scraping Subset #16, Page 15 out of 101 total pages (8.4s)...

Scraping Subset #16, Page 16 out of 101 total pages (8.0s)...