In [62]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

class LobbyingDataPage:
    lobbying_file = 'lobbying/data/lobbying.csv'
    compensation_file = 'lobbying/data/compensation.csv'
    contributions_file = 'lobbying/data/contributions.csv'

    def __init__(self, html):
        self.html = html
        self.soup = bs(self.html,'html.parser')
        if self.isValid():

            self.is_entity = bool(self.soup.find('span', {'id': 'ContentPlaceHolder1_ERegistrationInfoReview1_lblEntityCompany'}))

            self.company_name = self.get_company_name()
            self.date_range = self.get_date_range()

            if (self.soup.find('tr', {'class': 'GridHeader'})):
                self.lobbying_data = self.extract_lobbying_data()
                self.compensation_data = self.extract_compensation_data()
                self.contributions_data = self.extract_contributions_data()

        else:
            self.default_values()

    def isValid(self):
        if "An Error Occurred" in self.soup.text:
            return False

    def default_values(self):
            self.lobbying_data = pd.DataFrame()
            self.compensation_data = pd.DataFrame()
            self.contributions_data = pd.DataFrame()
            self.company_name = ''

    def get_date_range(self):
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_lblYear'}).text

    def get_company_name(self):
        if self.is_entity:
            return self.soup.find('span', {'id': 'ContentPlaceHolder1_ERegistrationInfoReview1_lblEntityCompany'}).text
        else:
            return self.soup.find('span', {'id': 'ContentPlaceHolder1_LRegistrationInfoReview1_lblLobbyistCompany'}).text

    def prep_tables(self):
        some_tables = self.soup.find_all('tr', {'style': 'vertical-align: top'})

        #Extract tables that contain the word 'lobbyist' and split at that word
        if 'Lobbyist name' in some_tables[0].text:
            split_tables = [table for table in some_tables if 'Client: ' in table.text][0].text.split('Client: ')
        else:
            split_tables = [table for table in some_tables if 'Lobbyist: ' in table.text][0].text.split('Lobbyist: ')
        #Strip out junk
        the_tables = [entry for entry in split_tables if entry.strip() and 'House / Senate' in entry]

        clean_tables = []
        for table in the_tables:
            clean_table = [line for line in table.split('\n') if line] # divide by lines and remove empties
            clean_table = clean_table[:clean_table.index('\xa0\xa0\xa0')] # Remove ending cruft
            clean_tables.append(clean_table)

        return clean_tables

    def extract_lobbying_data(self):
        if self.soup.find('span', {'id': 'ContentPlaceHolder1_LRegistrationInfoReview1_lblIncidental'}):
            return pd.DataFrame()
        clean_tables = self.prep_tables()
        row_dicts = []

        for table in clean_tables:
            lobbyist_name = table[0].strip()
            client_name = table[2].strip()
            table_start_index = table.index('House / SenateBill Number or Agency NameBill title or activityAgent positionAmountDirect business association')+1
            table_data = table[table_start_index:]

            i=0
            while i <= len(table_data)-8:
                row_dicts.append({'LobbyingEntity': self.company_name,
                                'DateRange': self.date_range,
                                'Lobbyist': lobbyist_name,
                                'Client': client_name,
                                'House/Senate': table_data[i].strip(),
                                'BillNumber':table_data[i+1].strip(),
                                'BillActivity':table_data[i+2].strip(),
                                'AgentPosition': table_data[i+3].strip(),
                                'Amount': table_data[i+5].strip(),
                                'DirectBusinessAssosciation': table_data[i+7].strip()})
                i=i+8
        return pd.DataFrame(row_dicts)

    def extract_contributions_data(self):

        bad_data = [element.split("Lobbyist: ")[0] for element in self.soup.text.split('Campaign Contributions') if "DateLobbyist nameRecipient nameOffice soughtAmount" in element]
        if not bad_data:
            print("NO DATA")
        pass1 = [element.split('Total contributions')[0] for element in bad_data]
        pass2 = [element.split('soughtAmount\n\n')[1:][0] for element in pass1]
        pass3 = "".join(pass2)
        data = [element.strip() for element in pass3.split('\n') if element.strip()]

        i = 0
        row_dicts = []
        while i < len(data):
            date = data[i].split()[0]
            lobbyist = " ".join(data[i].split()[1:])
            recipient = data[i+1]
            office = data[i+2]
            amount = data[i+3]
            row_dicts.append({  'LobbyingEntity': self.company_name,
                                'DateRange': self.date_range,
                                'Date': date,
                                'LobbyistName': lobbyist,
                                'RecipientName': recipient,
                                'OfficeSought': office,
                                'Amount': amount})
            i=i+4

        return pd.DataFrame(row_dicts)

    def extract_compensation_data(self):
        compensation_table = self.soup.find('table', {'id': 'ContentPlaceHolder1_DisclosureReviewDetail1_grdvClientPaidToEntity'})
        if not bool(compensation_table):
            return pd.DataFrame()
        temp_list = [line.strip() for line in compensation_table.text.split('\n') if line.strip()][1:-2]

        temp_dict_list = []
        for entry in temp_list:
            if entry[0] != '$':
                client_name = entry
            else:
                temp_dict_list.append({'LobbyingEntity': self.company_name, 'DateRange':self.date_range, 'Client': client_name, 'Amount':entry})
        return pd.DataFrame(temp_dict_list)

    def save(self):
        if not self.lobbying_data.empty:
            self.write_data(LobbyingDataPage.lobbying_file, self.lobbying_data)
        if not self.compensation_data.empty:
            self.write_data(LobbyingDataPage.compensation_file, self.compensation_data)
        if not self.contributions_data.empty:
            self.write_data(LobbyingDataPage.contributions_file, self.contributions_data)

    def write_data(self, file_path, dataframe):
        write = True
        #if os.path.exists(file_path):
        with open(file_path, mode = 'a', encoding = 'utf-8') as f:
            for line in f:
                if self.company_name in line and self.date_range in line:
                    print('Data already present in ' + file_path)
                    write = False
                    break

        if write and type(dataframe) == pd.DataFrame:
            print('Saving data to ' + file_path)
            dataframe.to_csv(file_path, mode ='a+',header=(not os.path.exists(file_path)), index=False)

In [63]:
def extract_and_save(html_list):
    #for html in html_list:
        #LobbyingDataPage(html).save()
    for i in range(len(html_list)):
        print("Saving "+str(i))
        LobbyingDataPage(html_list[i]).save()

def pull_data(url):
    headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}
    result = requests.get(url, headers=headers)
    result.raise_for_status()
    return result.content

def download_html_list(url_list):
    html_list = []
    for url in url_list:
        print("Pulling data from " + url)
        html_list.append(pull_data(url))
    return html_list

def save_data_from_url_list(url_list):
    disclosure_links = extract_and_save(download_html_list(url_list))
    html_list = download_html_list(disclosure_links)

In [64]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def extract_client_links(year):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    url = 'https://www.sec.state.ma.us/LobbyistPublicSearch/Default.aspx'

    driver.get(url)

    driver.find_element('id','ContentPlaceHolder1_rdbSearchByType').click()
    select = Select(driver.find_element(By.CLASS_NAME,'p3'))

    select.select_by_value(year)
    Select(driver.find_element('id','ContentPlaceHolder1_ucSearchCriteriaByType_drpType')).select_by_value('L')
    driver.find_element('id','ContentPlaceHolder1_btnSearch').click()

    find_table = driver.find_element(By.ID,'ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory')
    links = find_table.find_elements(By.TAG_NAME,'a')
    links_list = [l.get_attribute('href') for l in links if str(l.get_attribute('href')).startswith('javascript') == False]
    driver.quit()
    return links_list

def extract_disclosures(list_of_links):
    disclosure_reports = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    for link in list_of_links:
    # print(link)
        driver.get(link)
        all_links = driver.find_elements(By.CLASS_NAME,'BlueLinks')
        disclosure_links = [l.get_attribute('href') for l in all_links if 'CompleteDisclosure' in l.get_attribute('href')]
        for dl in disclosure_links:
            disclosure_reports.append(dl)
    driver.quit()

    return disclosure_reports


In [65]:
disclosure_links = extract_disclosures(extract_client_links('2020'))

In [None]:
html_list = download_html_list(disclosure_links)

In [69]:
import pickle
with open("2020html.pkl", "wb") as f:
    pickle.dump(html_list, f)

In [None]:
for i in range(len(html_list)):
    ldp = LobbyingDataPage(html_list[i])
    print(ldp.company_name)
    print(ldp.compensation_data)
    print(ldp.contributions_data)
    print(ldp.lobbying_data)

In [None]:
html_list[6]

In [85]:
extract_and_save(html_list)

Saving 0
Saving 1
Saving 2
Saving 3
Saving 4
Saving 5
Saving 6
Saving 7
Saving 8
Saving 9
Saving 10
Saving 11
Saving 12
Saving 13
Saving 14
Saving 15
Saving 16
Saving 17
Saving 18
Saving 19
Saving 20
Saving 21
Saving 22
Saving 23
Saving 24
Saving 25
Saving 26
Saving 27
Saving 28
Saving 29
Saving 30
Saving 31
Saving 32
Saving 33
Saving 34
Saving 35
Saving 36
Saving 37
Saving 38
Saving 39
Saving 40
Saving 41
Saving 42
Saving 43
Saving 44
Saving 45
Saving 46
Saving 47
Saving 48
Saving 49
Saving 50
Saving 51
Saving 52
Saving 53
Saving 54
Saving 55
Saving 56
Saving 57
Saving 58
Saving 59
Saving 60
Saving 61
Saving 62
Saving 63
Saving 64
Saving 65
Saving 66
Saving 67
Saving 68
Saving 69
Saving 70
Saving 71
Saving 72
Saving 73
Saving 74
Saving 75
Saving 76
Saving 77
Saving 78
Saving 79
Saving 80
Saving 81
Saving 82
Saving 83
Saving 84
Saving 85
Saving 86
Saving 87
Saving 88
Saving 89
Saving 90
Saving 91
Saving 92
Saving 93
Saving 94
Saving 95
Saving 96
Saving 97
Saving 98
Saving 99
Saving 100

In [86]:
disclosure_links[45]

'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8ofjpjJ5oC9yekMB3UEgH8iaJQ382jQKJduWUUjX27ANT'

In [119]:
import pickle
with open("2005html.pkl", "rb") as f:
    html05 = pickle.load(f)
with open("2020html.pkl", "rb") as f:
    html20 = pickle.load(f)

In [9]:
errorpage = html05[48]

In [16]:
errorpage.decode('utf-8')

'\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml" >\r\n<head><meta http-equiv="X-UA-Compatible" content="IE=edge" /><link href="/css/styleapps.css?version=060622" rel="stylesheet" type="text/css" meta="screen" /><link href="/css/topnav.css?version=060622" rel="stylesheet" type="text/css" meta="screen" /><link rel="shortcut icon" href="/images/favicon.gif" type="image/x-icon" />\r\n<script type="text/javascript" src="/js/p7TMMscripts.js"></script>\r\n<script type="text/javascript" src="/js/google_analytics.js"></script>\r\n<link href="/css/p7TMM04.css" rel="stylesheet" type="text/css" media="all" />\r\n\r\n<!--[if lt IE 8]><link rel="stylesheet" type="text/css" href="/css/lt8.css" /><![endif]-->\r\n<title>\r\n\tLobbyist Public Search\r\n</title><link href="css/StyleSheet.css" type="text/css" rel="stylesheet" />\r\n    <script language="javascript" src="../Javasc

In [39]:
import pickle
with open("indivlobbyist.html", "w") as f:
    f.write(html05[0].decode('utf-8'))

In [71]:
soup = bs(html_list[0], 'html.parser')

In [76]:
dfs[4][0][3].split('  Business name')[0].split('Lobbyist name  ')[1].replace("  ", " ")

IndexError: list index out of range

In [79]:
date_range = dfs[4][0][2].split(":  ")[1].replace("  ", " ")
lobbyist_name = dfs[4][0][3].split(' Business name')[0].split('Lobbyist name  ')[1].replace("  ", " ")

IndexError: list index out of range

In [78]:
date_range

'01/01/2020 - 06/30/2020'

In [80]:
for df in dfs:
    if len(df.columns) == 3 and (df.columns == ['Date', 'Activity or Bill No and Title', 'Client represented']).all():
        lobbying_data = df
    elif len(df.columns) == 4 and (df.columns == ['Date', 'Recipient name', 'Office sought', 'Amount']).all():
        contribution_data = df


In [81]:
lobbying_data
# TODO handle situation where activity or bill no and title is multiple things for one client
# basically do a new line for each bill

Unnamed: 0,Date,Activity or Bill No and Title,Client represented
0,1/05-6/28/05,"S.560, S.565, S.582, S.738, S.2042, S.2043, S....",Blue Cross Blue Shield of Massachusetts


In [80]:
dfs[9] = df
df.columns == ['Date', 'Activity or Bill No and Title', 'Client represented']

array([False, False, False])

In [None]:
class LobbyingDataPage:
    lobbying_file = 'lobbying/data/lobbying.csv'
    compensation_file = 'lobbying/data/compensation.csv'
    contributions_file = 'lobbying/data/contributions.csv'

    def __init__(self, html):
        self.html = html
        self.soup = bs(self.html,'html.parser')
        
        if self.isValid():
            self.extract_data()
        
        else:
            self.set_defaults()

    def isValid(self):
        if "An Error Occurred" in self.soup.text:
            return False
        return True

    def extract_data(self):
        self.company_name = self.get_company_name()
        self.date_range = self.get_date_range()
        self.lobbying_data = self.extract_lobbying_data()
        self.compensation_data = self.extract_compensation_data()
        self.contributions_data = self.extract_contributions_data()


In [89]:
test_entity_urls = ['https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oUX2F0/qMX8aZhXGSqISnPo81sWNBWPRVYkBCJOoiSOC',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oSXfp14ycsC4C75XzUXuOD0RNTxP5RQlQYtqqNlG19gK',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Kce7BzXCV/xrL2hRhIeiyrKq4598/MmeOqNxcRw3anF8llP1KzXu6cA+wFHr/nIU',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKYqHMk7FN1E+JyuG2w4SuGbSUM5P5U7i1R+Kl69eLgqM']

test_lobbyist_urls = ['https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oazP9bD0a9KMVAPrqT2Yinwr4JTgsyzaInIK/BXJHlV1',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=eaiHDZ6kDM3fHlDyBbc8oWE66BrPrRKWkGd1M0SOekxiCPdVzrEEIQIimWwrunVO',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=v/mjLQ41YVqm2bof1TANC8QjRgi8rX5lY/Ozmu5hJvE2+nv22rfxUQCNlsde/z4F',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=qOH5OAu6URrG3qvY0KcrjT8Cd6HIk4OEVgmMDn8i9vU6n8cVsZ6PiBz3uD4tmhUG']

In [None]:
test_entity_htmls = download_html_list(test_entity_urls)
test_lobbyist_htmls = download_html_list(test_lobbyist_urls)
html_list

In [99]:
edf = pd.read_html(test_entity_htmls[0])
ldf = pd.read_html(test_lobbyist_htmls[0])

In [100]:
ldf[4]

Unnamed: 0,0
0,Disclosure reporting details
1,Back to search detail summary
2,Lobbyist disclosure reporting period: 01/01/2...
3,Lobbyist name Leda Anderson Business name ...
4,Operating Expenses No operating expenses were...


In [None]:
for i in range(len(html05)):
    df = pd.read_html(html05[i])
    print(i)
    is_entity = 'Entity' in df[4][0][2]
    is_valid = 'disclosure' in df[4][0][2]
    if not is_valid:
        print('HAHA WHAT')

In [156]:
dfs = pd.read_html(html05[0])
'An Error Occurred' in str(dfs[0][0])

False

In [None]:
html05[0]

In [168]:
for i in range(len(html05)):
    dfs = pd.read_html(html05[i])
    is_error = 'An Error Occurred' in str(dfs[0][0])
    if is_error:
        print(f'Error processing index {i}')
        #Return
    else: #Remove
        is_entity = 'Entity' in dfs[4][0][2]

Error processing index 48
Error processing index 54


In [None]:
# Header:
# Authorizing Officer name / Lobbyist name
# Title / NA
# Business name / Business name
# Address / Address
# City, state, zip code / Citym, state, zip code
# country / country
# NA / Agent Type

#TABLES:
# Entities
# Lobbyists
# Client Compensation
# Compensation/Salaries Paid
# Activities, Bill Numbers and Titles
# Operating Expenses
# Meals, Travel, and Entertainment Expenses
# Additional Expenses
# Campaign Contributions