# In [76]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

class LobbyingDataPage:
    """Parse one lobbying disclosure page and append its tables to CSV files.

    The constructor parses the supplied HTML with BeautifulSoup's
    ``html.parser`` and eagerly extracts the entity name, the reporting
    period, the lobbying-activity rows and the client compensation rows.
    ``save`` then appends both tables to the class-level CSV paths.
    """

    # CSV files that ``save`` appends to (relative to the working directory).
    lobbying_file = '../data/lobbying.csv'
    compensation_file = '../data/compensation.csv'

    # Exact text of the line that precedes the activity rows of each table.
    _ACTIVITY_HEADER = 'House / SenateBill Number or Agency NameBill title or activityAgent positionAmountDirect business association'
    # Number of consecutive text lines consumed per activity row.
    _FIELDS_PER_ROW = 8

    def __init__(self, html):
        """Parse *html* and extract every table immediately.

        Args:
            html: Markup of one disclosure page (anything BeautifulSoup
                accepts, e.g. the bytes returned by ``pull_data``).
        """
        self.html = html
        self.soup = bs(self.html,'html.parser')
        self.company_name = self.get_company_name()
        self.date_range = self.get_date_range()

        self.lobbying_data = self.extract_to_dataframe()
        self.compensation_data = self.extract_compensation_data()

    def save(self):
        """Append the lobbying and compensation tables to their CSV files."""
        self.write_data(LobbyingDataPage.lobbying_file, self.lobbying_data)
        self.write_data(LobbyingDataPage.compensation_file, self.compensation_data)

    def write_data(self, file_path, dataframe):
        """Append *dataframe* to the CSV at *file_path* unless already stored.

        The data counts as already stored when any existing line of the file
        contains both ``self.company_name`` and ``self.date_range``. The
        header row is written only when the file does not exist yet.

        Args:
            file_path: Path of the CSV file to append to.
            dataframe: ``pandas.DataFrame`` holding the rows to append.
        """
        if dataframe.empty:
            # Nothing to append. Writing an empty frame could still create
            # the file, after which later appends would skip the header row
            # because the file already exists.
            print('No data to write to ' + file_path)
            return

        file_exists = os.path.exists(file_path)
        if file_exists:
            # Read back with the same encoding used for writing so that
            # non-ASCII entity names still match on every platform.
            with open(file_path, mode='r', encoding='utf-8') as f:
                already_saved = any(
                    self.company_name in line and self.date_range in line
                    for line in f
                )
            if already_saved:
                print('Data already present in ' + file_path)
                return

        dataframe.to_csv(file_path, mode='a', header=not file_exists,
                         index=False, encoding='utf-8')

    def get_date_range(self):
        """Return the text of the ``ContentPlaceHolder1_lblYear`` span.

        Raises:
            AttributeError: If the page has no such span.
        """
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_lblYear'}).text

    def get_company_name(self):
        """Return the text of the entity-company span of the registration info.

        Raises:
            AttributeError: If the page has no such span.
        """
        return self.soup.find('span', {'id': 'ContentPlaceHolder1_ERegistrationInfoReview1_lblEntityCompany'}).text

    def pull_data(self, url=None, timeout=30):
        """Download a page and return the raw response body.

        Args:
            url: Address to fetch. When omitted, ``self.url`` is used; the
                class itself never assigns that attribute, so the caller
                must have set it beforehand.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block forever.

        Returns:
            The response content as bytes.

        Raises:
            AttributeError: If *url* is omitted and ``self.url`` is not set.
            requests.HTTPError: If the server answers with an error status.
        """
        target = self.url if url is None else url
        headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}
        result = requests.get(target, headers=headers, timeout=timeout)
        result.raise_for_status()
        return result.content

    def prep_tables(self):
        """Split the page text into one list of lines per lobbyist table.

        Takes the first ``tr`` styled ``vertical-align: top`` whose text
        contains ``'Lobbyist: '``, splits that text on ``'Lobbyist: '`` and
        keeps the non-blank chunks that contain ``'House / Senate'``. Each
        chunk is split into its non-empty lines and cut off at the first line
        consisting of three non-breaking spaces.

        Returns:
            A list of lists of strings, one inner list per lobbyist.

        Raises:
            IndexError: If no matching ``tr`` exists on the page.
            ValueError: If a chunk lacks the three-non-breaking-space line.
        """
        some_tables = self.soup.find_all('tr', {'style': 'vertical-align: top'})
        split_tables = [table for table in some_tables if 'Lobbyist: ' in table.text][0].text.split('Lobbyist: ')
        the_tables = [entry for entry in split_tables if entry.strip() and 'House / Senate' in entry]

        clean_tables = []
        for table in the_tables:
            clean_table = [line for line in table.split('\n') if line] # divide by lines and remove empties
            clean_table = clean_table[:clean_table.index('\xa0\xa0\xa0')] # Remove ending cruft
            clean_tables.append(clean_table)

        return clean_tables

    def extract_to_dataframe(self):
        """Build the lobbying-activity table from the prepared line lists.

        For every list returned by ``prep_tables`` the first line is used as
        the lobbyist name and the third as the client name. The lines after
        the activity header are consumed eight at a time; offsets 4 and 6 of
        each group are not stored. Trailing lines that do not form a complete
        group of eight are ignored.

        Returns:
            A ``pandas.DataFrame`` with one row per activity entry.

        Raises:
            ValueError: If a table does not contain the activity header line.
        """
        row_dicts = []
        for table in self.prep_tables():
            lobbyist_name = table[0]
            client_name = table[2]
            table_start_index = table.index(self._ACTIVITY_HEADER) + 1
            table_data = table[table_start_index:]
            # Only start a row where all eight lines exist: the last index
            # read below is i + 7, so i may be at most len(table_data) - 8.
            last_start = len(table_data) - self._FIELDS_PER_ROW
            for i in range(0, last_start + 1, self._FIELDS_PER_ROW):
                row_dicts.append({'LobbyingEntity': self.company_name,
                                'DateRange': self.date_range,
                                'Lobbyist': lobbyist_name,
                                'Client': client_name,
                                'House/Senate': table_data[i].strip(),
                                'BillNumber':table_data[i+1].strip(),
                                'BillActivity':table_data[i+2].strip(),
                                'AgentPosition': table_data[i+3].strip(),
                                'Amount': table_data[i+5].strip(),
                                'DirectBusinessAssosciation': table_data[i+7].strip()})

        return pd.DataFrame(row_dicts)

    def extract_compensation_data(self):
        """Build the client compensation table.

        Reads the text of the ``grdvClientPaidToEntity`` table, keeps the
        stripped non-blank lines, drops the first and the last two of them
        (presumably the header and trailing summary lines; confirm against a
        live page) and pairs the remainder as client followed by amount.

        Returns:
            A ``pandas.DataFrame`` with one row per client/amount pair.

        Raises:
            AttributeError: If the page has no such table.
            StopIteration: If the remaining lines do not pair up evenly.
        """
        compensation_table = self.soup.find('table', {'id': 'ContentPlaceHolder1_DisclosureReviewDetail1_grdvClientPaidToEntity'})
        temp_list = [line.strip() for line in compensation_table.text.split('\n') if line.strip()][1:-2]
        temp_dict_list = []
        it = iter(temp_list)
        for i in it:
            # Each client line is followed by its amount line, so take the
            # next item from the same iterator.
            temp_dict_list.append({'LobbyingEntity': self.company_name, 'DateRange':self.date_range, 'Client': i, 'Amount':next(it)})

        return pd.DataFrame(temp_dict_list)

# In [80]:
def extract_and_save(html_list):
    """Parse each HTML document in *html_list* and save its tables.

    Every document is wrapped in a ``LobbyingDataPage`` and saved before the
    next one is parsed.
    """
    for page in map(LobbyingDataPage, html_list):
        page.save()

def pull_data(url = testing_url, timeout = 30):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch. Defaults to the module-level ``testing_url``,
            which must already be defined when this function is defined.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block forever.

    Returns:
        The response content as bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}
    result = requests.get(url, headers=headers, timeout=timeout)
    result.raise_for_status()
    return result.content

def download_html_list(url_list):
    """Fetch every URL in *url_list* in order and return the page bodies.

    Returns:
        A list with one ``pull_data`` result per URL, in the same order.
    """
    return [pull_data(address) for address in url_list]

def save_data_from_url_list(url_list):
    """Download all pages in *url_list*, then parse and save each of them."""
    downloaded_pages = download_html_list(url_list)
    extract_and_save(downloaded_pages)

# In [79]:
# CompleteDisclosure pages on sec.state.ma.us to download and parse.
url_list = [
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKSmBfkLmYFdnf8xTcXfPeaK3i269EErp97So5FUc/X23',
    'https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=Tcg7Il3rjW5sIbUrwbcVKV8k15XmkYG2WIUeIEwL/REZBWdbh+QzQDTofQB05tTR',
]