In [19]:
import json
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup

from tqdm import tqdm  # Import tqdm for the progress bar


# api key for congress.gov
from api_key import congress_api_key

## Scraping with BeautifulSoup

In [4]:
# Load the bill URL table from CSV; later cells add the scraped text to this frame
congress_115_text_df = pd.read_csv('congress_115_hr_urls.csv')

In [6]:
# Test scrape: fetch the page for one row (label 501) and parse it with BeautifulSoup
url = congress_115_text_df['TextUrl'][501]

# A timeout keeps the cell from hanging indefinitely if the server never responds
result = requests.get(url, timeout=5)
content = result.text

# Parse with the lxml parser and keep the whole parsed document as a string
soup = BeautifulSoup(content, 'lxml')
soup_string = str(soup)



In [7]:
# Row count of the frame, i.e. the number of URLs to scrape
len(congress_115_text_df)

2255

In [16]:
# Index labels of the rows whose 'TextUrl' is missing (empty when every row has a URL)
null_indices = congress_115_text_df.loc[congress_115_text_df['TextUrl'].isna()].index
null_indices

Int64Index([], dtype='int64')

In [60]:
# Test loop: scrape the pages for row labels 0-4 with BeautifulSoup
# before running the scrape over the whole frame
test_text = []

for i in range(5):
    # Read URLs from congress_115_text_df, the frame loaded from the CSV in this notebook.
    # The previously used name, congress_115_hr_df, is never defined here and raises
    # NameError on a fresh kernel.
    url = congress_115_text_df['TextUrl'][i]

    # A timeout keeps the loop from hanging indefinitely on an unresponsive server
    result = requests.get(url, timeout=5)
    content = result.text

    # Parse with the lxml parser and keep the whole parsed document as a string
    soup = BeautifulSoup(content, 'lxml')
    soup_string = str(soup)
    test_text.append(soup_string)

# One row per scraped page, in a single unnamed column
test_text_df = pd.DataFrame(test_text)
test_text_df

Unnamed: 0,0
0,<html><body><pre>\n[Congressional Bills 115th ...
1,<html><body><pre>\n[Congressional Bills 115th ...
2,<html><body><pre>\n[Congressional Bills 115th ...
3,<html><body><pre>\n[Congressional Bills 115th ...
4,<html><body><pre>\n[Congressional Bills 115th ...


In [None]:
# ChatGPT suggested edit
# Scrape every row's 'TextUrl' into a new 'Text' column, printing progress per row
# and collecting the index labels of rows whose request failed.

# Create a new 'Text' column to store the scraped text
congress_115_text_df['Text'] = ""

# Create list to store index of rows that do not run in loop
error_index = []

# Iterate over rows using iterrows
for index, row in congress_115_text_df.iterrows():
    url = row['TextUrl']

    try:
        # A timeout keeps one stalled connection from blocking the whole loop
        result = requests.get(url, timeout=5)
        result.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        content = result.text
        soup = BeautifulSoup(content, 'lxml')
        soup_string = str(soup)

        # Store the scraped text in the 'Text' column
        congress_115_text_df.at[index, 'Text'] = soup_string
        print(f"Scraped data for row {index}")

    except requests.exceptions.RequestException as e:
        # Only request failures (connection errors, timeouts, bad HTTP status) are
        # treated as per-row errors; any other exception is a bug and should surface
        error_index.append(index)
        print(f"Failed to fetch data for row {index}: {e}")


In [18]:
# Preview the first rows of the frame
congress_115_text_df.head()

Unnamed: 0,BillNumber,Summary,TextUrl,Text
0,5,(This measure has not been amended since it wa...,https://www.congress.gov/115/bills/hr5/BILLS-1...,
1,7,(This measure has not been amended since it wa...,https://www.congress.gov/115/bills/hr7/BILLS-1...,
2,10,Financial CHOICE Act of 2017\n\n (Sec. 2) This...,https://www.congress.gov/115/bills/hr10/BILLS-...,
3,15,Raise the Wage Act\n\nThis bill amends the Fai...,https://www.congress.gov/115/bills/hr15/BILLS-...,
4,19,Smithsonian Women's History Museum Act\n\nThis...,https://www.congress.gov/115/bills/hr19/BILLS-...,


In [20]:

# Scrape every row's 'TextUrl' into the 'Text' column, showing a tqdm progress bar
# and collecting the index labels of rows whose request failed.

# Create a new 'Text' column to store the scraped text
congress_115_text_df['Text'] = ""

# Create a list to store the index of rows that do not run in the loop
error_index = []

# Use tqdm to display a progress bar
for index, row in tqdm(congress_115_text_df.iterrows(), total=len(congress_115_text_df), desc='Processing Rows'):

    url = row['TextUrl']

    try:
        # Without a timeout a stalled connection blocks the loop indefinitely
        result = requests.get(url, timeout=5)
        result.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        content = result.text
        soup = BeautifulSoup(content, 'lxml')
        soup_string = str(soup)

        # Store the scraped text in the 'Text' column
        congress_115_text_df.at[index, 'Text'] = soup_string

    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts and bad HTTP status;
        # record the row so it can be retried
        error_index.append(index)

    time.sleep(2)  # Pause between requests

# Display the indices of rows with errors
print("Rows with errors:", error_index)


Processing Rows:   1%|▏                | 23/2255 [02:43<4:23:49,  7.09s/it]


KeyboardInterrupt: 

In [23]:

# Scrape every row's 'TextUrl' into the 'Text' column with a 5-second request timeout,
# showing a tqdm progress bar and collecting the index labels of rows that failed.

# Create a new 'Text' column to store the scraped text
congress_115_text_df['Text'] = ""

# Create a list to store the index of rows that do not run in the loop
error_index = []

# Use tqdm to display a progress bar
for index, row in tqdm(congress_115_text_df.iterrows(), total=len(congress_115_text_df), desc='Processing Rows'):

    url = row['TextUrl']

    try:
        # Set a timeout of 5 seconds
        result = requests.get(url, timeout=5)
        result.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        content = result.text
        soup = BeautifulSoup(content, 'lxml')
        soup_string = str(soup)

        # Store the scraped text in the 'Text' column
        congress_115_text_df.at[index, 'Text'] = soup_string

    except requests.exceptions.Timeout as e:
        # Timeout is a subclass of RequestException, so it must be caught first;
        # listed after the general handler it would never run
        print(f"Timeout for row {index}: {e}")
        error_index.append(index)

    except requests.exceptions.RequestException:
        # Any other request failure (connection error, bad HTTP status, ...)
        error_index.append(index)

    time.sleep(2)  # Adjust the sleep time as needed

# Display the indices of rows with errors
print("Rows with errors:", error_index)


Processing Rows: 100%|███████████████| 2255/2255 [1:44:44<00:00,  2.79s/it]

Rows with errors: [1, 49, 79, 83, 118, 140, 158, 181, 198, 281, 320, 397, 441, 452, 475, 500, 536, 551, 698, 713, 734, 753, 768, 769, 874, 888, 937, 949, 961, 973, 1043, 1077, 1099, 1124, 1179, 1239, 1284, 1310, 1333, 1337, 1342, 1401, 1458, 1493, 1560, 1632, 1722, 1791, 1813, 1839, 1857, 1885, 1923, 1953, 2060, 2069, 2125, 2151, 2181, 2205, 2206]





In [24]:
# Count the rows recorded in error_index
len(error_index)

61

In [25]:
# Retry pass: request again the rows listed in error_index and collect
# the ones that fail again in error_index_1
error_index_1 = []

for i in error_index:

    # error_index holds index labels, so read the URL by label with .at,
    # matching the label-based write to 'Text' below (positional .iloc would
    # only agree with it while the index is the default 0..n-1 range)
    url = congress_115_text_df.at[i, 'TextUrl']

    try:
        # Set a timeout of 5 seconds
        result = requests.get(url, timeout=5)
        result.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        content = result.text
        soup = BeautifulSoup(content, 'lxml')
        soup_string = str(soup)

        # Store the scraped text in the 'Text' column
        congress_115_text_df.at[i, 'Text'] = soup_string

    except requests.exceptions.Timeout as e:
        # Timeout is a subclass of RequestException, so it must be caught first;
        # listed after the general handler it would never run
        print(f"Timeout for row {i}: {e}")
        error_index_1.append(i)

    except requests.exceptions.RequestException:
        # Any other request failure (connection error, bad HTTP status, ...)
        error_index_1.append(i)

    time.sleep(2)  # Adjust the sleep time as needed

# Display the indices of rows with errors
print("Rows with errors:", error_index_1)

Rows with errors: [83, 949, 1953]


In [26]:
# Further retry pass: request again the rows listed in error_index_1 and collect
# the ones that still fail in error_index_2
error_index_2 = []

for i in error_index_1:

    # error_index_1 holds index labels, so read the URL by label with .at,
    # matching the label-based write to 'Text' below (positional .iloc would
    # only agree with it while the index is the default 0..n-1 range)
    url = congress_115_text_df.at[i, 'TextUrl']

    try:
        # Set a timeout of 5 seconds
        result = requests.get(url, timeout=5)
        result.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        content = result.text
        soup = BeautifulSoup(content, 'lxml')
        soup_string = str(soup)

        # Store the scraped text in the 'Text' column
        congress_115_text_df.at[i, 'Text'] = soup_string

    except requests.exceptions.Timeout as e:
        # Timeout is a subclass of RequestException, so it must be caught first;
        # listed after the general handler it would never run
        print(f"Timeout for row {i}: {e}")
        error_index_2.append(i)

    except requests.exceptions.RequestException:
        # Any other request failure (connection error, bad HTTP status, ...)
        error_index_2.append(i)

    time.sleep(2)  # Adjust the sleep time as needed

# Display the indices of rows with errors
print("Rows with errors:", error_index_2)

Rows with errors: []


In [42]:
# Spot check: bill number stored at row label 1600
congress_115_text_df.loc[1600, 'BillNumber']

1854

In [43]:
# Spot check: summary stored at row label 1600
congress_115_text_df.loc[1600, 'Summary']

'Prescription Drug Monitoring Act of 2017\n\nThis bill requires a state that receives grant funds under the prescription drug monitoring program (PDMP) or the controlled substance monitoring program to comply with specified requirements, including a requirement to share its PDMP data with other states. The Department of Justice (DOJ) or Department of Health and Human Services may withhold grant funds from a state that fails to comply.\n\n To facilitate data sharing among states, the bill directs DOJ to award a grant under the Comprehensive Opioid Abuse Grant Program to establish and maintain a data-sharing hub.'

In [44]:
# Spot check: scraped text stored at row label 1600
congress_115_text_df.loc[1600, 'Text']

"<html><body><pre>\n[Congressional Bills 115th Congress]\n[From the U.S. Government Publishing Office]\n[H.R. 1854 Introduced in House (IH)]\n\n&lt;DOC&gt;\n\n\n\n\n\n\n115th CONGRESS\n  1st Session\n                                H. R. 1854\n\n  To require the use of prescription drug monitoring programs and to \n              facilitate information sharing among States.\n\n\n_______________________________________________________________________\n\n\n                    IN THE HOUSE OF REPRESENTATIVES\n\n                             April 3, 2017\n\n    Mr. Jenkins of West Virginia (for himself and Mr. Ryan of Ohio) \n introduced the following bill; which was referred to the Committee on \nEnergy and Commerce, and in addition to the Committee on the Judiciary, \nfor a period to be subsequently determined by the Speaker, in each case \nfor consideration of such provisions as fall within the jurisdiction of \n                        the committee concerned\n\n_________________________

In [45]:
# Write the frame to CSV, leaving out the index column
congress_115_text_df.to_csv('congress_115_text.csv', index=False)