In [1]:
# Import Python libraries
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Base address of the bill pages; getSponsorInfo appends a bill id to it to build the page URL
url = "https://www.parl.ca/legisinfo/en/bill/"

# Record holding the sponsor details collected for one bill
class Bill:
    """Sponsor record for a single bill.

    Attributes:
        id_: identifier of the bill, exactly as given by the caller.
        name: sponsor name, or None while unknown.
        title: sponsor title, or None while unknown.
        constituency: sponsor constituency, or None while unknown.
    """

    def __init__(self, id_, name = None, title = None, constituency = None):
        self.id_, self.name = id_, name
        self.title, self.constituency = title, constituency

    def __str__(self):
        """Return the four fields as "Label: value" lines, one field per line."""
        labelled_fields = (
            ("ID", self.id_),
            ("Name", self.name),
            ("Title", self.title),
            ("Constituency", self.constituency),
        )
        return "\n".join(
            "{}: {}".format(label, value) for label, value in labelled_fields
        )

# Helper function that derives a display name from a link URL
def linkParser(link):
    """Turn the hyphenated slug that follows 'en/' in `link` into a title-cased name.

    The slug is the text between the first 'en/' and the next 'en/' (or the
    end of the string), cut at its first '('. Hyphens become spaces.

    Raises:
        IndexError: if `link` does not contain 'en/'.
    """
    after_locale = link.split('en/')[1]
    slug = after_locale.partition('(')[0]
    return slug.replace('-', ' ').title()

# Helper function that strips the honorific abbreviations 'Sen.' and 'Hon.' from a name
def titleParser(name):
    """Return `name` with every 'Sen.' and 'Hon.' removed and outer whitespace trimmed."""
    for honorific in ('Sen.', 'Hon.'):
        name = name.replace(honorific, '')
    return name.strip()
            
# Link texts that name the sponsor by office rather than by personal name
_OFFICE_TITLES = ('Prime Minister', 'Minister', 'Leader of Government', 'Leader of the Government', 'Solicitor General',
                  'President of the Treasury Board', 'Secretary of State', 'President of the Queen\'s Privy Council')


def _fillSponsorInfo(output, timeout):
    """Scrape the sponsor details for `output.id_` and store them on `output` in place.

    Best effort: a field that cannot be determined keeps its current value.
    """
    id_ = output.id_

    try:
        page = requests.get(url + id_, timeout = timeout)
        soup = BeautifulSoup(page.content, "html.parser")
        link = soup.find('section', class_ = "bill-identity").find('a')
        # Stray whitespace around the link text would defeat the exact comparison below
        text = link.text.strip()
        href = link['href']
    except Exception:
        # The request failed or the page lacks the expected markup. Catching
        # Exception instead of using a bare except keeps a kernel interrupt
        # (KeyboardInterrupt) able to stop a long scrape.
        return

    # Assigns default title
    output.title = 'Member of Parliament' if 'C' in id_ else 'Senator'

    # File only Case: the first link is the bill text, not a sponsor
    if 'Text of the bill' in text:
        return

    # External Case: the sponsor name lives on the page the link points to
    if text == 'Leader of the Government in the Senate':
        output.title = text
        try:
            # urljoin resolves protocol-relative ("//host/..."), absolute and relative hrefs alike
            senator_page = requests.get(urljoin(url, href), timeout = timeout)
            soup = BeautifulSoup(senator_page.content, "html.parser")
            header = soup.find('div', class_ = "sc-senator-bio-senatorheader").find('h1')
            name = header.text.strip()
        except Exception:
            return

        # Keep what sits between the "Senator" prefix and the first comma,
        # wherever the prefix occurs in the header text
        _, prefix, remainder = name.partition('Senator')
        if prefix:
            name = remainder.split(',')[0].strip()
        output.name = name
        return

    # Link Case: the text is an office, the name is taken from the link URL
    if any(title in text for title in _OFFICE_TITLES):
        output.title = text
        output.name = linkParser(href)
        return

    # Name Case: the text is "<name> (<constituency>)" or just "<name>"
    parts = text.split('(')
    output.name = titleParser(parts[0])

    if len(parts) > 1 and len(parts[1]) > 1:
        # Drop the last character (the closing parenthesis)
        output.constituency = parts[1][:-1].strip()


def getSponsorInfo(id_, timeout = 30):
    """Look up the sponsor of a bill on its web page and return it as a Bill.

    The page at `url + id_` is fetched and the first link inside the
    "bill-identity" section is interpreted as follows:

    * text containing 'Text of the bill': only the default title is set;
    * text equal to 'Leader of the Government in the Senate': the title is that
      text and the name is read from the linked page's header;
    * text containing one of the office titles: the title is that text and the
      name is derived from the link URL via linkParser;
    * anything else: the text is treated as "<name> (<constituency>)".

    The default title is 'Member of Parliament' when `id_` contains 'C',
    otherwise 'Senator'. The resulting Bill is printed before being returned.

    Args:
        id_: bill identifier appended to the base URL.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A Bill for `id_`. Fields that could not be determined stay None; if
        the bill page cannot be fetched or parsed, every field but the id is None.
    """
    output = Bill(id_)
    _fillSponsorInfo(output, timeout)
    print(output)
    return output

In [3]:
# Read in the Id column from bills_processed.csv to get the bills for webscraping
data = pd.read_csv('../data/processed/bills_processed.csv')[['Id']]

# Get sponsor information on every listed bill. getSponsorInfo makes at least one
# web request per bill, so this cell can take a while on a long list.
data['SponsorInfo'] = data['Id'].apply(getSponsorInfo)

In [2]:
# Save the bill ids and their sponsor details; index = False keeps the row index out of the file.
# NOTE(review): SponsorInfo holds Bill objects, so the column is presumably written as their
# multi-line text form - confirm that is what readers of sponsors.csv expect.
data.to_csv('../data/raw/sponsors.csv', index = False) 