# Scraping classwork

Steps we do only once:
- Create a folder to save HTML
- Load the bills data from `bills.json`
- Make dataframe for bills

Steps to repeat in a `for` loop:
- Request the URL
- Save the HTML of the URL
- Parse the page with bs4
- Find and get what's inside `id='billTextContainer'`
- Clean up the bill text
  - Replace punctuation with space
  - Replace newlines with space
  - Replace multiple spaces with one space
- Get the word count
- Save the word count into the dataframe

Finally, let's export the dataframe to a CSV file.

In [1]:
import json
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

## Create `pages` folder to save HTML

In [2]:
# Pure-Python equivalent of `mkdir -p pages`: creates the folder if it is
# missing and does nothing if it already exists, on any operating system.
Path('pages').mkdir(parents=True, exist_ok=True)

## Import bills data

In [3]:
# Load the list of bill records; each record supplies the `bill_url` and
# `bill_number` used by the scraping loop.
# Explicit UTF-8 so the file is decoded the same way on every platform
# instead of relying on the system default encoding.
with open('bills.json', encoding='utf-8') as file:
    bills = json.load(file)

## Create dataframe

In [4]:
# One row per bill, plus a `word_count` column that starts out empty (NaN)
# and is filled in by the scraping loop.
bills_df = pd.DataFrame(bills).assign(word_count=np.nan)
bills_df

Unnamed: 0,congress,chamber,bill_url,bill_number,word_count
0,116,house,https://www.congress.gov/bill/116th-congress/h...,133,
1,116,house,https://www.congress.gov/bill/116th-congress/h...,150,
2,116,house,https://www.congress.gov/bill/116th-congress/h...,251,
3,116,house,https://www.congress.gov/bill/116th-congress/h...,259,
4,116,house,https://www.congress.gov/bill/116th-congress/h...,263,
5,116,house,https://www.congress.gov/bill/116th-congress/h...,266,
6,116,house,https://www.congress.gov/bill/116th-congress/h...,276,
7,116,house,https://www.congress.gov/bill/116th-congress/h...,299,
8,116,house,https://www.congress.gov/bill/116th-congress/h...,430,
9,116,house,https://www.congress.gov/bill/116th-congress/h...,434,


## Scrape and parse `bills`

In [5]:
# Translation table that maps every ASCII punctuation character to a single
# space; used with `str.translate` when cleaning the bill text.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation, ' '))

In [6]:
# For every bill: download its text page, keep a copy of the HTML on disk,
# extract the bill text, clean it up, and store its word count in `bills_df`.
for bill in tqdm(bills):
    bill_url = bill['bill_url']
    bill_number = bill['bill_number']

    # Request the URL
    # The timeout keeps one stalled connection from hanging the whole loop, and
    # raise_for_status() stops on an HTTP error instead of saving and parsing
    # an error page as if it were a bill.
    page = requests.get(bill_url, timeout=30)
    page.raise_for_status()

    # Save the HTML of the URL
    # Explicit UTF-8 so the write does not depend on the platform's default
    # encoding (which may be unable to represent every character on the page).
    with open(f'pages/page_{ bill_number }.html', 'w', encoding='utf-8') as file:
        # pages/page_133.html
        file.write(page.text)

    # Parse the page with bs4
    soup = BeautifulSoup(page.text, features='html.parser')

    # Find and get what's inside `id='billTextContainer'`
    bill_text_container = soup.find(id='billTextContainer')
    if bill_text_container is None:
        # The page has no bill text container: leave word_count as NaN for
        # this bill and carry on with the rest.
        print(f'No billTextContainer found for bill { bill_number }: { bill_url }')
        continue
    bill_text = bill_text_container.get_text()

    # Clean up the bill text

    # Replace punctuation with space
    bill_text_cleaned = bill_text.translate(punctuation_table)

    # Replace newlines with space
    bill_text_cleaned = re.sub(r'\n', ' ', bill_text_cleaned)

    # Replace multiple spaces with one space
    # (raw string: '\s' in a normal string literal is an invalid escape sequence)
    bill_text_cleaned = re.sub(r'\s{2,}', ' ', bill_text_cleaned)

    # Get the word count
    bill_word_count = len(bill_text_cleaned.split())

    # Save the word count into the dataframe
    bills_df.loc[bills_df['bill_number'] == bill_number, 'word_count'] = bill_word_count

  0%|          | 0/40 [00:00<?, ?it/s]

## Export the data

In [7]:
# Write the table to bills.csv; index=False leaves out the row index column.
bills_df.to_csv(path_or_buf='bills.csv', index=False)

In [8]:
# Display the dataframe again now that the scraping loop has filled in word_count.
bills_df

Unnamed: 0,congress,chamber,bill_url,bill_number,word_count
0,116,house,https://www.congress.gov/bill/116th-congress/h...,133,967689.0
1,116,house,https://www.congress.gov/bill/116th-congress/h...,150,2307.0
2,116,house,https://www.congress.gov/bill/116th-congress/h...,251,194.0
3,116,house,https://www.congress.gov/bill/116th-congress/h...,259,1065.0
4,116,house,https://www.congress.gov/bill/116th-congress/h...,263,696.0
5,116,house,https://www.congress.gov/bill/116th-congress/h...,266,4824.0
6,116,house,https://www.congress.gov/bill/116th-congress/h...,276,752.0
7,116,house,https://www.congress.gov/bill/116th-congress/h...,299,4824.0
8,116,house,https://www.congress.gov/bill/116th-congress/h...,430,198.0
9,116,house,https://www.congress.gov/bill/116th-congress/h...,434,259.0


In [9]:
# Longest bills first (returns a sorted copy; bills_df itself is unchanged).
bills_df.sort_values('word_count', ascending=False)

Unnamed: 0,congress,chamber,bill_url,bill_number,word_count
0,116,house,https://www.congress.gov/bill/116th-congress/h...,133,967689.0
19,116,house,https://www.congress.gov/bill/116th-congress/h...,748,155544.0
35,116,house,https://www.congress.gov/bill/116th-congress/h...,1158,105688.0
5,116,house,https://www.congress.gov/bill/116th-congress/h...,266,4824.0
7,116,house,https://www.congress.gov/bill/116th-congress/h...,299,4824.0
1,116,house,https://www.congress.gov/bill/116th-congress/h...,150,2307.0
10,116,house,https://www.congress.gov/bill/116th-congress/h...,439,1936.0
31,116,house,https://www.congress.gov/bill/116th-congress/h...,1058,1884.0
29,116,house,https://www.congress.gov/bill/116th-congress/h...,943,1859.0
14,116,house,https://www.congress.gov/bill/116th-congress/h...,559,1338.0


In [11]:
# Keep only the bills with at least 100,000 words.
bills_df.loc[bills_df['word_count'].ge(100_000)]

Unnamed: 0,congress,chamber,bill_url,bill_number,word_count
0,116,house,https://www.congress.gov/bill/116th-congress/h...,133,967689.0
19,116,house,https://www.congress.gov/bill/116th-congress/h...,748,155544.0
35,116,house,https://www.congress.gov/bill/116th-congress/h...,1158,105688.0


In [13]:
# Full URL of the bill at row position 35 (the table display truncates it).
bills_df['bill_url'].iloc[35]

'https://www.congress.gov/bill/116th-congress/house-bill/1158/text?r=1&s=3'

In [15]:
# Full URL of the bill at row position 19 (the table display truncates it).
bills_df['bill_url'].iloc[19]

'https://www.congress.gov/bill/116th-congress/house-bill/748/text?r=1&s=3'

In [16]:
# Average word count across all bills.
bills_df['word_count'].mean()

31569.075

In [17]:
# Median word count; comparing it with the mean shows how much a few very
# long bills pull the average up.
bills_df['word_count'].median()

621.5

In [19]:
# Word count of the longest bill.
bills_df['word_count'].max()

967689.0