In [8]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

class LobbyingDataPage:
    """Parse one disclosure-report HTML page into three pandas DataFrames.

    On construction the page is parsed with BeautifulSoup. When it contains a
    ``tr`` element with class ``GridHeader`` the three tables
    ``lobbying_data``, ``compensation_data`` and ``contributions_data`` are
    extracted; otherwise all three are empty DataFrames. ``save()`` appends
    the non-empty ones to the CSV files named by the class attributes below.
    """

    # Destination CSV files (relative to the current working directory).
    lobbying_file = '../data/lobbying.csv'
    compensation_file = '../data/compensation.csv'
    contributions_file = '../data/contributions.csv'

    # Element ids / marker lines the parser looks for in the page.
    _ENTITY_COMPANY_SPAN_ID = 'ContentPlaceHolder1_ERegistrationInfoReview1_lblEntityCompany'
    _LOBBYIST_COMPANY_SPAN_ID = 'ContentPlaceHolder1_LRegistrationInfoReview1_lblLobbyistCompany'
    _ACTIVITY_HEADER = 'House / SenateBill Number or Agency NameBill title or activityAgent positionAmountDirect business association'
    _TABLE_END_MARKER = '\xa0\xa0\xa0'

    def __init__(self, html):
        """Parse ``html`` and eagerly extract every table.

        Args:
            html: Page markup (str or bytes) handed to BeautifulSoup's
                ``html.parser``.

        Raises:
            AttributeError: if the company-name or date-range span is missing
                (see ``get_company_name`` / ``get_date_range``).
        """
        self.html = html
        self.soup = bs(self.html,'html.parser')

        # The page is treated as an "entity" page when the entity-company span exists.
        self.is_entity = bool(self.soup.find('span', {'id': self._ENTITY_COMPANY_SPAN_ID}))

        self.company_name = self.get_company_name()
        self.date_range = self.get_date_range()

        if (self.soup.find('tr', {'class': 'GridHeader'})):
            self.lobbying_data = self.extract_lobbying_data()
            self.compensation_data = self.extract_compensation_data()
            self.contributions_data = self.extract_contributions_data()

        else:
            # No grid header on the page: nothing to extract.
            self.lobbying_data = pd.DataFrame()
            self.compensation_data = pd.DataFrame()
            self.contributions_data = pd.DataFrame()

    def get_date_range(self):
        """Return the text of the ``ContentPlaceHolder1_lblYear`` span.

        Raises:
            AttributeError: if the span is not present in the page.
        """
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_lblYear'}).text

    def get_company_name(self):
        """Return the company name from the entity or the lobbyist span.

        Which span is read depends on ``self.is_entity``.

        Raises:
            AttributeError: if the selected span is not present in the page.
        """
        span_id = self._ENTITY_COMPANY_SPAN_ID if self.is_entity else self._LOBBYIST_COMPANY_SPAN_ID
        return self.soup.find('span', {'id': span_id}).text

    def prep_tables(self):
        """Split the page's activity section into per-table lists of text lines.

        Looks at every ``tr`` styled ``vertical-align: top``. If the first such
        row mentions ``'Lobbyist name'`` the section is split on ``'Client: '``,
        otherwise on ``'Lobbyist: '``. Only chunks containing
        ``'House / Senate'`` are kept; each is split into non-empty lines and
        truncated at the first line equal to three non-breaking spaces.

        Returns:
            list[list[str]]: one list of lines per table; ``[]`` when the page
            has no matching rows.

        Raises:
            ValueError: if a kept chunk lacks the three-non-breaking-space
                end marker line.
        """
        some_tables = self.soup.find_all('tr', {'style': 'vertical-align: top'})
        if not some_tables:
            # Nothing to split; the original indexing would raise IndexError here.
            return []

        # Choose the label that separates consecutive tables on this page.
        separator = 'Client: ' if 'Lobbyist name' in some_tables[0].text else 'Lobbyist: '
        matching_rows = [table for table in some_tables if separator in table.text]
        if not matching_rows:
            return []
        split_tables = matching_rows[0].text.split(separator)

        # Strip out junk: keep only chunks that carry an activity table.
        the_tables = [entry for entry in split_tables if entry.strip() and 'House / Senate' in entry]

        clean_tables = []
        for table in the_tables:
            clean_table = [line for line in table.split('\n') if line] # divide by lines and remove empties
            clean_table = clean_table[:clean_table.index(self._TABLE_END_MARKER)] # Remove ending cruft
            clean_tables.append(clean_table)

        return clean_tables

    def extract_lobbying_data(self):
        """Build the lobbying-activity DataFrame from ``prep_tables()``.

        Returns an empty DataFrame straight away when the page contains the
        ``...lblIncidental`` span. Otherwise, for each table, line 0 is stored
        as ``Lobbyist`` and line 2 as ``Client``; the lines after the column
        header line are consumed in groups of eight, reading offsets
        0, 1, 2, 3, 5 and 7 (offsets 4 and 6 are ignored). A trailing group
        of fewer than eight lines is dropped.

        Note: the column key ``'DirectBusinessAssosciation'`` is misspelled but
        is kept as-is because it is the header written to the CSV file.

        Raises:
            ValueError: if a table lacks the exact column header line.
            IndexError: if a table has fewer than three lines.
        """
        if self.soup.find('span', {'id': 'ContentPlaceHolder1_LRegistrationInfoReview1_lblIncidental'}):
            return pd.DataFrame()
        clean_tables = self.prep_tables()
        row_dicts = []

        for table in clean_tables:
            lobbyist_name = table[0].strip()
            client_name = table[2].strip()
            table_start_index = table.index(self._ACTIVITY_HEADER)+1
            table_data = table[table_start_index:]

            # Step through complete 8-line groups only.
            for i in range(0, len(table_data) - 7, 8):
                row_dicts.append({'LobbyingEntity': self.company_name,
                                'DateRange': self.date_range,
                                'Lobbyist': lobbyist_name,
                                'Client': client_name,
                                'House/Senate': table_data[i].strip(),
                                'BillNumber':table_data[i+1].strip(),
                                'BillActivity':table_data[i+2].strip(),
                                'AgentPosition': table_data[i+3].strip(),
                                'Amount': table_data[i+5].strip(),
                                'DirectBusinessAssosciation': table_data[i+7].strip()})
        return pd.DataFrame(row_dicts)

    def extract_contributions_data(self):
        """Build the campaign-contributions DataFrame from the page text.

        The page text is split on ``'Campaign Contributions'``; sections that
        contain the contributions column header are trimmed at the first
        ``'Lobbyist: '`` and at ``'Total contributions'``, and the text after
        the header is read as records of four non-empty lines:
        ``"<date> <lobbyist name>"``, recipient, office sought, amount.

        Prints ``NO DATA`` and returns an empty DataFrame when no section
        qualifies. Sections whose header is not followed by a blank line, and
        a trailing record with fewer than four lines, are skipped instead of
        raising ``IndexError``.
        """
        header = "DateLobbyist nameRecipient nameOffice soughtAmount"
        sections = [element.split("Lobbyist: ")[0]
                    for element in self.soup.text.split('Campaign Contributions')
                    if header in element]
        if not sections:
            print("NO DATA")
            return pd.DataFrame()

        bodies = []
        for section in sections:
            pieces = section.split('Total contributions')[0].split('soughtAmount\n\n')
            if len(pieces) > 1:
                # Same piece the original selected: text between the first
                # and (if any) second occurrence of the delimiter.
                bodies.append(pieces[1])
        data = [element.strip() for element in "".join(bodies).split('\n') if element.strip()]

        row_dicts = []
        # Only iterate over complete 4-line records.
        for i in range(0, len(data) - 3, 4):
            date_and_lobbyist = data[i].split()
            row_dicts.append({  'LobbyingEntity': self.company_name,
                                'DateRange': self.date_range,
                                'Date': date_and_lobbyist[0],
                                'LobbyistName': " ".join(date_and_lobbyist[1:]),
                                'RecipientName': data[i+1],
                                'OfficeSought': data[i+2],
                                'Amount': data[i+3]})

        return pd.DataFrame(row_dicts)

    def extract_compensation_data(self):
        """Build the compensation DataFrame from the ``grdvClientPaidToEntity`` table.

        The table's non-empty text lines are taken without the first line and
        the last two. A line not starting with ``'$'`` sets the current client;
        a line starting with ``'$'`` emits a row for the current client.

        Returns an empty DataFrame when the table is absent. An amount that
        appears before any client line is recorded with ``Client`` set to
        ``None`` (previously this raised ``UnboundLocalError``).
        """
        compensation_table = self.soup.find('table', {'id': 'ContentPlaceHolder1_DisclosureReviewDetail1_grdvClientPaidToEntity'})
        if not bool(compensation_table):
            return pd.DataFrame()
        temp_list = [line.strip() for line in compensation_table.text.split('\n') if line.strip()][1:-2]

        temp_dict_list = []
        client_name = None  # no client line seen yet
        for entry in temp_list:
            if entry[0] != '$':
                client_name = entry
            else:
                temp_dict_list.append({'LobbyingEntity': self.company_name, 'DateRange':self.date_range, 'Client': client_name, 'Amount':entry})
        return pd.DataFrame(temp_dict_list)

    def save(self):
        """Append each non-empty DataFrame to its CSV file via ``write_data``."""
        if not self.lobbying_data.empty:
            self.write_data(LobbyingDataPage.lobbying_file, self.lobbying_data)
        if not self.compensation_data.empty:
            self.write_data(LobbyingDataPage.compensation_file, self.compensation_data)
        if not self.contributions_data.empty:
            self.write_data(LobbyingDataPage.contributions_file, self.contributions_data)

    def write_data(self, file_path, dataframe):
        """Append ``dataframe`` to ``file_path`` unless this page was already saved.

        The page counts as already saved when any existing line of the file
        contains both ``self.company_name`` and ``self.date_range`` (plain
        substring match). The CSV header is written when the file does not
        exist yet or exists but is empty.

        Args:
            file_path: Destination CSV path.
            dataframe: Data to append; anything that is not a
                ``pandas.DataFrame`` is ignored.
        """
        if os.path.exists(file_path):
            with open(file_path, mode = 'r', encoding = 'utf-8') as f:
                already_saved = any(self.company_name in line and self.date_range in line for line in f)
            if already_saved:
                print('Data already present in ' + file_path)
                return

        if isinstance(dataframe, pd.DataFrame):
            print('Saving data to ' + file_path)
            # An existing zero-byte file still needs the header row.
            needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
            dataframe.to_csv(file_path, mode='a', header=needs_header, index=False, encoding='utf-8')

In [2]:
def extract_and_save(html_list):
    """Parse every page in ``html_list`` and save its extracted tables.

    Prints ``Saving <index>`` before each page is processed, then builds a
    ``LobbyingDataPage`` from the markup and calls its ``save()`` method.
    """
    for position, page_html in enumerate(html_list):
        print("Saving " + str(position))
        parsed_page = LobbyingDataPage(page_html)
        parsed_page.save()

def pull_data(url, timeout=30):
    """Download ``url`` and return the raw response body as bytes.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the run indefinitely.

    Returns:
        bytes: ``response.content`` of the successful response.

    Raises:
        requests.HTTPError: for 4xx/5xx responses (via ``raise_for_status``).
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    # A fixed User-Agent header is sent with every request.
    headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}
    result = requests.get(url, headers=headers, timeout=timeout)
    result.raise_for_status()
    return result.content

def download_html_list(url_list):
    """Fetch every URL in ``url_list`` with ``pull_data``.

    A ``Pulling data from <url>`` line is printed before each download.

    Returns:
        list: the downloaded bodies, in the same order as ``url_list``.
    """
    def _announce_and_fetch(target_url):
        # Log first, then download, exactly once per URL.
        print("Pulling data from " + target_url)
        return pull_data(target_url)

    return [_announce_and_fetch(target_url) for target_url in url_list]

def save_data_from_url_list(url_list):
    """Download every URL in ``url_list``, then extract and save each page."""
    downloaded_pages = download_html_list(url_list)
    extract_and_save(downloaded_pages)

In [35]:
# Load two of the CSV files back into DataFrames for inspection.
# These are the same paths as LobbyingDataPage.lobbying_file and
# LobbyingDataPage.compensation_file; pd.read_csv raises if a file is missing.
lobbying_df = pd.read_csv('../data/lobbying.csv')
compensation_df = pd.read_csv('../data/compensation.csv')

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
def extract_client_links(year='2020'):
    """Run the public search form in Chrome and return the result links.

    Opens the search page, clicks the ``rdbSearchByType`` control, selects
    ``year`` in the dropdown with class ``p3`` and the value ``'L'`` in the
    ``drpType`` dropdown, submits the search, and collects the ``href`` of
    every anchor inside the results grid.

    Args:
        year: Value to select in the ``p3`` dropdown; converted with ``str``.
            Defaults to ``'2020'``, the value that was previously hard-coded.

    Returns:
        list[str]: hrefs from the results grid. Anchors without an ``href``
        and ``javascript`` pseudo-links are left out.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    url = 'https://www.sec.state.ma.us/LobbyistPublicSearch/Default.aspx'

    try:
        driver.get(url)

        driver.find_element(By.ID, 'ContentPlaceHolder1_rdbSearchByType').click()
        Select(driver.find_element(By.CLASS_NAME, 'p3')).select_by_value(str(year))
        Select(driver.find_element(By.ID, 'ContentPlaceHolder1_ucSearchCriteriaByType_drpType')).select_by_value('L')
        driver.find_element(By.ID, 'ContentPlaceHolder1_btnSearch').click()

        find_table = driver.find_element(By.ID, 'ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory')
        hrefs = [anchor.get_attribute('href') for anchor in find_table.find_elements(By.TAG_NAME, 'a')]
        # Skip anchors without an href (previously returned as None) and javascript links.
        return [href for href in hrefs if href and not href.startswith('javascript')]
    finally:
        # Always close the browser, even when a lookup above raises.
        driver.quit()
def extract_disclosures(list_of_links):
    """Visit each link in Chrome and collect its ``CompleteDisclosure`` hrefs.

    For every page, the elements with class ``BlueLinks`` are inspected and
    each ``href`` containing ``'CompleteDisclosure'`` is appended to the
    result, in page order.

    Args:
        list_of_links: URLs to open one after another.

    Returns:
        list[str]: the collected disclosure URLs; ``[]`` for empty input,
        in which case no browser is started.
    """
    if not list_of_links:
        return []

    disclosure_reports = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        for link in list_of_links:
            driver.get(link)
            for anchor in driver.find_elements(By.CLASS_NAME, 'BlueLinks'):
                href = anchor.get_attribute('href')
                # get_attribute returns None for a missing href; `in None` would raise TypeError.
                if href and 'CompleteDisclosure' in href:
                    disclosure_reports.append(href)
    finally:
        # Always close the browser, even when a page load or lookup raises.
        driver.quit()

    return disclosure_reports



In [4]:
# Collect the search-result links, then gather the CompleteDisclosure URLs
# found on each of those pages (each call drives its own Chrome session).
disclosure_links = extract_disclosures(extract_client_links())

[WDM] - Downloading: 100%|██████████| 6.46M/6.46M [00:00<00:00, 82.6MB/s]


In [5]:
# Download the body of every disclosure URL; one "Pulling data from ..." line is printed per URL.
html_list = download_html_list(disclosure_links)

Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oUX2F0/qMX8aZhXGSqISnPo81sWNBWPRVYkBCJOoiSOC
Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oaRAJOySB29mALIUXhzqvSkbGEUA7hlbESfh6Yf37ZPh
Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8ofMFRTo+Nheb8NE91QgT9ChOIrFe4b7tgHM7C/BwTEcT
Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oQ9Uiy5y/+5qZTup4r4QIpMV1yz7jEwveszSLKbPMHbY
Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oSXfp14ycsC4C75XzUXuOD0RNTxP5RQlQYtqqNlG19gK
Pulling data from https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8ocpx6Xk2XZJefL/lfN0grB1EQuOQdkkYQOrWTrkijHgZ
Pulling da

In [9]:
# Parse each downloaded page and append its tables to the CSV files.
extract_and_save(html_list)

Saving 0
NO DATA
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Saving 1
NO DATA
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Saving 2
NO DATA
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Saving 3
NO DATA
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Saving 4
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Data already present in ../data/contributions.csv
Saving 5
Saving data to ../data/lobbying.csv
Data already present in ../data/compensation.csv
Data already present in ../data/contributions.csv
Saving 6
NO DATA
Saving data to ../data/lobbying.csv
Saving 7
NO DATA
Saving data to ../data/lobbying.csv
Saving 8
Saving 9
Saving 10
NO DATA
Saving data to ../data/lobbying.csv
Saving 11
NO DATA
Saving data to ../data/lobbying.csv
Saving 12
NO DATA
Saving data to ../data/lobbying.csv
Saving 13
NO DATA


In [246]:
# Show the disclosure URL at index 3 (the page logged as "Saving 3" by extract_and_save).
disclosure_links[3]

'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oQ9Uiy5y/+5qZTup4r4QIpMV1yz7jEwveszSLKbPMHbY'