In [82]:
import csv
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

class LobbyingDataPage:
    """Parse one lobbying disclosure page and append its data to CSV files.

    Construction parses the HTML with BeautifulSoup and immediately builds two
    DataFrames: ``lobbying_data`` (one row per lobbyist/client activity line)
    and ``compensation_data`` (one row per client payment). ``save()`` appends
    both to the CSV files named by the class attributes below, skipping any
    file that already holds rows for this entity and date range.
    """

    # Output CSV locations, relative to the current working directory.
    lobbying_file = '../data/lobbying.csv'
    compensation_file = '../data/compensation.csv'

    # Flattened text of the activity table's header row; the activity lines
    # start on the line right after it.
    _ACTIVITY_HEADER = 'House / SenateBill Number or Agency NameBill title or activityAgent positionAmountDirect business association'
    # Number of text lines one activity row occupies after splitting on '\n'.
    _LINES_PER_ACTIVITY = 8

    def __init__(self, html):
        """Parse ``html`` (str or bytes) and extract both data sets eagerly."""
        self.html = html
        self.soup = bs(self.html,'html.parser')
        self.company_name = self.get_company_name()
        self.date_range = self.get_date_range()

        self.lobbying_data = self.extract_lobbying_data()
        self.compensation_data = self.extract_compensation_data()

    def get_date_range(self):
        """Return the text of the ``ContentPlaceHolder1_lblYear`` span.

        Raises AttributeError if the span is missing from the page.
        """
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_lblYear'}).text

    def get_company_name(self):
        """Return the text of the entity-company span.

        Raises AttributeError if the span is missing from the page.
        """
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_ERegistrationInfoReview1_lblEntityCompany'}).text

    def prep_tables(self):
        """Split the page into one list of text lines per lobbyist section.

        Returns a list of lists of strings. A page with no row containing
        'Lobbyist: ' yields an empty list instead of raising IndexError.
        Raises ValueError if a section lacks the '\\xa0\\xa0\\xa0' end marker.
        """
        candidate_rows = self.soup.find_all('tr', {'style': 'vertical-align: top'})
        lobbyist_rows = [row for row in candidate_rows if 'Lobbyist: ' in row.text]
        if not lobbyist_rows:
            return []

        # Only the first matching row is used; its text is split into one
        # chunk per lobbyist.
        sections = lobbyist_rows[0].text.split('Lobbyist: ')
        # Keep only chunks that actually contain an activity table.
        sections = [section for section in sections if section.strip() and 'House / Senate' in section]

        clean_tables = []
        for section in sections:
            lines = [line for line in section.split('\n') if line]  # drop empty lines
            lines = lines[:lines.index('\xa0\xa0\xa0')]  # cut everything from the end marker on
            clean_tables.append(lines)

        return clean_tables

    def extract_lobbying_data(self):
        """Build the lobbying-activity DataFrame from the prepared sections.

        Each section's first line is taken as the lobbyist name and its third
        line as the client name. After the header line, every complete group
        of eight lines becomes one row; the 5th and 7th lines of a group are
        not used. A trailing incomplete group is ignored rather than raising
        IndexError.
        """
        row_dicts = []
        step = self._LINES_PER_ACTIVITY

        for table in self.prep_tables():
            lobbyist_name = table[0]
            client_name = table[2]
            table_data = table[table.index(self._ACTIVITY_HEADER) + 1:]

            # Stop at the last index that still leaves a full group available,
            # because the final field is read from offset start + 7.
            for start in range(0, len(table_data) - step + 1, step):
                row_dicts.append({'LobbyingEntity': self.company_name,
                                  'DateRange': self.date_range,
                                  'Lobbyist': lobbyist_name,
                                  'Client': client_name,
                                  'House/Senate': table_data[start].strip(),
                                  'BillNumber': table_data[start + 1].strip(),
                                  'BillActivity': table_data[start + 2].strip(),
                                  'AgentPosition': table_data[start + 3].strip(),
                                  'Amount': table_data[start + 5].strip(),
                                  # Key spelling is kept as-is: it is the column name in existing CSVs.
                                  'DirectBusinessAssosciation': table_data[start + 7].strip()})

        return pd.DataFrame(row_dicts)

    def extract_compensation_data(self):
        """Build the client-compensation DataFrame.

        The compensation table's text is split into non-empty lines; the first
        line and the last two lines are dropped (presumably the header and a
        totals row -- confirm against the page), and the rest is read as
        alternating client / amount values.

        Raises ValueError if the remaining lines cannot be paired up, and
        AttributeError if the table is missing from the page.
        """
        compensation_table = self.soup.find('table', {'id': 'ContentPlaceHolder1_DisclosureReviewDetail1_grdvClientPaidToEntity'})
        cells = [line.strip() for line in compensation_table.text.split('\n') if line.strip()][1:-2]

        if len(cells) % 2:
            raise ValueError('Compensation table has an unpaired client/amount entry')

        records = [{'LobbyingEntity': self.company_name, 'DateRange': self.date_range, 'Client': client, 'Amount': amount}
                   for client, amount in zip(cells[0::2], cells[1::2])]

        return pd.DataFrame(records)

    def save(self):
        """Append both DataFrames to their CSV files (see ``write_data``)."""
        self.write_data(LobbyingDataPage.lobbying_file, self.lobbying_data)
        self.write_data(LobbyingDataPage.compensation_file, self.compensation_data)

    def _already_saved(self, file_path):
        """Return True if ``file_path`` has a row for this entity and date range.

        Compares the 'LobbyingEntity' and 'DateRange' columns exactly, so the
        entity's name appearing in another column (e.g. as a client) or as a
        substring of another name does not count as a match.
        """
        with open(file_path, mode='r', newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                if row.get('LobbyingEntity') == self.company_name and row.get('DateRange') == self.date_range:
                    return True
        return False

    def write_data(self, file_path, dataframe):
        """Append ``dataframe`` to the CSV at ``file_path`` unless already present.

        Nothing is written for an empty DataFrame, so a new file never starts
        without a header row. The header is written only when the file is
        missing or empty.
        """
        if dataframe.empty:
            return

        has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
        if has_content and self._already_saved(file_path):
            print('Data already present in ' + file_path)
            return

        dataframe.to_csv(file_path, mode='a', header=not has_content, index=False)

In [83]:
def extract_and_save(html_list):
    """Parse each HTML document in ``html_list`` and save its extracted data.

    Documents are handled one at a time, in order: each is wrapped in a
    ``LobbyingDataPage`` and saved before the next one is parsed.
    """
    parsed_pages = (LobbyingDataPage(raw_html) for raw_html in html_list)
    for page in parsed_pages:
        page.save()

def pull_data(url, timeout=30):
    """Download ``url`` and return the raw response body as bytes.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed straight to ``requests.get``).
        Without a timeout a stalled server would block the notebook forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    # A browser-like User-Agent is sent with every request.
    headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}
    result = requests.get(url, headers=headers, timeout=timeout)
    result.raise_for_status()
    return result.content

def download_html_list(url_list):
    """Fetch every URL in ``url_list`` in order and return the bodies as a list."""
    return [pull_data(page_url) for page_url in url_list]

def save_data_from_url_list(url_list):
    """Download all pages in ``url_list``, then parse and save each one.

    Every page is downloaded before any parsing or saving starts.
    """
    downloaded_pages = download_html_list(url_list)
    extract_and_save(downloaded_pages)

In [116]:
# CompleteDisclosure pages to download and parse; one entry per disclosure.
url_list = [
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKSmBfkLmYFdnf8xTcXfPeaK3i269EErp97So5FUc/X23',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKV8k15XmkYG2WIUeIEwL/REZBWdbh+QzQDTofQB05tTR',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKVAoK2CuqlDEJoqp2Wyr2sQxnlrGVUJ/4Bp4td8mYpGn',
]

In [117]:
# Download every page in url_list and append its data to the CSV files;
# entities already present in a file are reported and skipped.
save_data_from_url_list(url_list)

Data already present in ../data/lobbying.csv
Data already present in ../data/compensation.csv
Data already present in ../data/lobbying.csv
Data already present in ../data/compensation.csv


In [89]:
# Read both scraped CSV files back into DataFrames for inspection.
lobbying_df, compensation_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/lobbying.csv', '../data/compensation.csv')
)

In [95]:
# Show the compensation DataFrame as the cell output.
compensation_df

Unnamed: 0,LobbyingEntity,DateRange,Client,Amount
0,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,American Fidelity Corp.,"$39,000.00"
1,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Just Healthy LLC,"$54,000.00"
2,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,"Hometown America, L.L.C.","$39,000.00"
3,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,A Day for Democracy,"$37,500.00"
4,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Dismas House of Massachusetts,"$6,000.00"
...,...,...,...,...
61,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,"Valley Eye Radio, Inc.","$6,000.00"
62,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Zeiterion Theatre Inc,"$16,250.00"
63,"21c, LLC",01/01/2022 - 06/30/2022,Benjamin Franklin Institute of Technology,"$30,000.00"
64,"21c, LLC",01/01/2022 - 06/30/2022,"EDUCATION TRUST, INC. (THE)","$6,000.00"


In [97]:
# Compensation rows for a single client.
compensation_df.loc[compensation_df['Client'] == 'Just Healthy LLC']

Unnamed: 0,LobbyingEntity,DateRange,Client,Amount
1,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Just Healthy LLC,"$54,000.00"


In [103]:
# Lobbying activity rows where the agent's position is 'Oppose'.
lobbying_df.loc[lobbying_df['AgentPosition'] == 'Oppose']

Unnamed: 0,LobbyingEntity,DateRange,Lobbyist,Client,House/Senate,BillNumber,BillActivity,AgentPosition,Amount,DirectBusinessAssosciation
170,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Benjamin Josephson,Apple Inc.,House Bill,140,An Act promoting competition among app stores,Oppose,$0.00,
171,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Benjamin Josephson,Apple Inc.,House Bill,341,An Act relative to the digital right to repair,Oppose,$0.00,
186,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Benjamin Josephson,Apple Inc.,Senate Bill,166,An Act relative to Digital Right to Repair Act,Oppose,$0.00,
539,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Benjamin Josephson,"HDR Engineering, Inc.",House Bill,3951,An Act financing improvements to municipal roa...,Oppose,$0.00,
1069,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,James Dunbar,"HDR Engineering, Inc.",House Bill,3951,An Act financing improvements to municipal roa...,Oppose,$0.00,
...,...,...,...,...,...,...,...,...,...,...
2185,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Matthew Irish,Point32Health,House Bill,3881,An Act to increase access to nurse-midwifery s...,Oppose,$0.00,
2186,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Matthew Irish,Point32Health,House Bill,4034,An Act Relative to Insulin Access,Oppose,$0.00,
2187,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Matthew Irish,Point32Health,House Bill,4079,An Act relative to trans-inclusive health care...,Oppose,$0.00,
2188,O'Neill and Partners LLC.,01/01/2022 - 06/30/2022,Matthew Irish,Point32Health,House Bill,4145,An Act relative to colon cancer screening,Oppose,$0.00,


In [112]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def extract_links(year='2020', timeout=10):
    """Run the public lobbyist search in Chrome and return the result links.

    Opens the search page, clicks the search-by-type option, picks ``year``
    in the select element with class 'p3' (presumably the year dropdown --
    confirm against the page), submits the search and collects the ``href``
    of every anchor in the results table.

    Parameters
    ----------
    year : str or int, optional
        Option value to select; defaults to '2020'.
    timeout : float, optional
        Seconds to wait for the dropdown and for the results table to appear.

    Returns
    -------
    list of str
        Hrefs from the results table, excluding anchors that have no href
        and those whose href starts with 'javascript'.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the dropdown or the results table does not appear within ``timeout``.
    """
    search_url = 'https://www.sec.state.ma.us/LobbyistPublicSearch/Default.aspx'
    results_table_id = 'ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory'

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        wait = WebDriverWait(driver, timeout)

        driver.get(search_url)
        driver.find_element(By.ID, 'ContentPlaceHolder1_rdbSearchByType').click()
        year_dropdown = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'p3')))
        Select(year_dropdown).select_by_value(str(year))
        driver.find_element(By.ID, 'ContentPlaceHolder1_btnSearch').click()

        # The results only exist once the search has come back, so wait for
        # the table instead of looking it up immediately.
        results_table = wait.until(EC.presence_of_element_located((By.ID, results_table_id)))
        hrefs = (anchor.get_attribute('href') for anchor in results_table.find_elements(By.TAG_NAME, 'a'))
        return [href for href in hrefs if href and not href.startswith('javascript')]
    finally:
        # Always close the browser, even when a lookup above fails.
        driver.quit()


In [113]:
# Launch the browser-driven search and keep the collected result links.
links_list = extract_links()

[WDM] - Downloading: 100%|██████████| 6.46M/6.46M [00:00<00:00, 78.8MB/s]


In [115]:
# Peek at the first collected link.
links_list[0]

'https://www.sec.state.ma.us/LobbyistPublicSearch/Summary.aspx?sysvalue=7RqzbyUoNpsojnye4iOGrREsuxJs/L03D9yRarjaeWQ='