In [1]:
import helper
from datetime import datetime, timedelta
import pandas as pd

In [74]:
# Main function to calculate working days
def workingdays(start="09/01/2024", end=None):    
    datestart = datetime.strptime(start, '%m/%d/%Y').date()

    if end is None:
        dateend = datetime.now().date()       
    else:
        dateend = datetime.strptime(end, '%m/%d/%Y').date()

    rows = []  # Use a list to accumulate rows
    chambers = ["Senate", "House"]

    while datestart <= dateend:
        datestart, url, tree = helper.web_test(datestart, dateend)

        # next valid webpage may be past dateend, break if so
        if datestart > dateend:
            break
        
        # for each chamber (House and Senate)
        for chamber in chambers:
            date = datestart.strftime("%m/%d/%Y")

            # Initialize values for the day
            day_data = {
                "House or Senate": chamber,
                "Date": date,
                "Time Convened": None,
                "Time Adjourned": None,
                "Time in Session": None,
                "Working Day?": "INPUT BY HAND",
                "Scraped adjournment string":None,
                "url":url
            }

            # # checks only before next h2 (breaks up page by chamber)
            # senate_h2_addition = "[following-sibling::h2[contains(text(), 'House')]]"

            # if chamber == "House":
            #     senate_h2_addition = ""

            xpaths = [
                f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[strong='Adjournment:'][contains(text(),{chamber})]//text()",
                f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p/text()",
                f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[contains(translate(text(), 'ADJOURN', 'adjourn'), 'adjourn')]//text()",
                f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[contains(text(),'in session')]//text()"
            ]
            
            for xp in xpaths:
                try:
                    strg = helper.clean_string("".join(tree.xpath(xp)[0:2]))
                except Exception as e:
                    print("Error with xpath: ",e)
                        
                day_data["Scraped adjournment string"] = strg

                # Not in session
                if "not in session" in strg:
                    day_data["Time Convened"] = None
                    day_data["Time Adjourned"] = None
                    day_data["Working Day?"] = "NS"

                # Pro forma
                elif "pro forma" in strg:
                        day_data["Time Convened"] = None
                        day_data["Time Adjourned"] = None
                        day_data["Working Day?"] = "pf"
                        
                # try and extrac times
                else:
                    try:
                        day_data = helper.return_times_from_string(strg,day_data)
                         
                        if day_data["Time Convened"] and day_data["Time Adjourned"]:

                            #minutes
                            timein = (day_data["Time Adjourned"] - day_data["Time Convened"]).total_seconds() / 60
                            day_data["Time in Session"] = timein

                            if timein >= 60:
                                day_data["Working Day?"] = "x"
                            elif timein > 0:
                                day_data["Working Day?"] = "pf"

                            day_data["Time Convened"] = day_data["Time Convened"].strftime("%H:%M%p")
                            day_data["Time Adjourned"] = day_data["Time Adjourned"].strftime("%H:%M%p")

                    except Exception as e:
                        print("Issue exracting times, string: ",strg,"\nError:",e)
                                
                if day_data["Working Day?"] !=  "INPUT BY HAND":
                    break

            # Append daily results as a dictionary to the rows list
            rows.append(day_data)

        # Add one day to the loop
        datestart += timedelta(days=1)

    # Convert the accumulated rows (list of dictionaries) into a DataFrame
    df = pd.DataFrame(rows, columns=day_data.keys())
    
    return df

In [77]:
df = workingdays(start="01/22/2022",end="01/31/2022")

https://www.congress.gov/congressional-record/2022/1/22/daily-digest
https://www.congress.gov/congressional-record/2022/1/25/daily-digest
https://www.congress.gov/congressional-record/2022/1/26/daily-digest
https://www.congress.gov/congressional-record/2022/1/28/daily-digest
https://www.congress.gov/congressional-record/2022/1/29/daily-digest


In [78]:
df

Unnamed: 0,House or Senate,Date,Time Convened,Time Adjourned,Time in Session,Working Day?,Scraped adjournment string,url
0,Senate,01/24/2022,,,,pf,The Senate met at 1:00:29 p.m. in pro forma se...,https://www.congress.gov/congressional-record/...
1,House,01/24/2022,,,,NS,The House was not in session today. The House ...,https://www.congress.gov/congressional-record/...
2,Senate,01/25/2022,,,,NS,The Senate was not in session and stands adjou...,https://www.congress.gov/congressional-record/...
3,House,01/25/2022,10:00AM,10:03AM,3.0,pf,Adjournment: The House met at 10 a.m. and adjo...,https://www.congress.gov/congressional-record/...
4,Senate,01/27/2022,,,,pf,The Senate met at 10:00:05 a.m. in pro forma s...,https://www.congress.gov/congressional-record/...
5,House,01/27/2022,,,,NS,The House was not in session today. The House ...,https://www.congress.gov/congressional-record/...
6,Senate,01/28/2022,,,,NS,The Senate was not in session and stands adjou...,https://www.congress.gov/congressional-record/...
7,House,01/28/2022,09:00AM,09:03AM,3.0,pf,Adjournment: The House met at 9 a.m. and adjou...,https://www.congress.gov/congressional-record/...
8,Senate,01/31/2022,15:00PM,18:49PM,229.0,x,Adjournment: Senate convened at 3 p.m. and adj...,https://www.congress.gov/congressional-record/...
9,House,01/31/2022,,,,NS,The House was not in session today. The House ...,https://www.congress.gov/congressional-record/...


In [15]:
start="01/21/2022"
end="01/22/2022"

datestart = datetime.strptime(start, '%m/%d/%Y').date()

if end is None:
    dateend = datetime.now().date()       
else:
    dateend = datetime.strptime(end, '%m/%d/%Y').date()


datestart, url, tree = helper.web_test(datestart, dateend)

https://www.congress.gov/congressional-record/2022/1/21/daily-digest


In [47]:
chamber = "Senate"

# checks only before next h2 (breaks up page by chamber)
senate_h2_addition = "[following-sibling::h2[contains(text(), 'House')]]"

if chamber == "House":
    senate_h2_addition = ""

xpaths = [
    f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[strong='Adjournment:']//text()",
    f"//center[h2[contains(text(), '{chamber}')]]/following-sibling::p//text()",
    f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[contains(translate(text(), 'ADJOURN', 'adjourn'), 'adjourn')]//text()",
    f"//center[h2[contains(text(), '{chamber}')]]//following-sibling::p[contains(text(),'in session')]//text()"
]

In [50]:
for xp in xpaths:
    try:
        
        print(strg)
    except:
        try:
            strg = helper.clean_string(str("".join(tree.xpath(xp))))
        except Exception as e:
            print("Error with xpath: ",e)
            

Adjournment:
The Senate was not in session and stands adjourned until 1 p.m. on Monday January 24 2022.––––––
The Senate was not in session and stands adjourned until 1 p.m. on Monday January 24 2022.––––––
The Senate was not in session and stands adjourned until 1 p.m. on Monday January 24 2022.––––––


In [6]:
pd.read_csv("220115_to_220215.csv")

Unnamed: 0.1,Unnamed: 0,House or Senate,Date,Time Convened,Time Adjourned,Time in Session,Working Day?,Scraped adjournment string,url
0,0,Senate,01/18/2022,,1900-01-01 22:01:00,,,Adjournment: Senate convened at 12 noon and r...,https://www.congress.gov/congressional-record/...
1,1,House,01/18/2022,1900-01-01 12:00:00,1900-01-01 20:21:00,501.0,x,Adjournment: The House met at 12 p.m. and adj...,https://www.congress.gov/congressional-record/...
2,2,Senate,01/19/2022,1900-01-01 10:00:00,1900-01-01 22:51:00,771.0,x,Adjournment: Senate convened at 10 a.m. and a...,https://www.congress.gov/congressional-record/...
3,3,House,01/19/2022,1900-01-01 10:00:00,1900-01-01 16:02:00,362.0,x,Adjournment: The House met at 10 a.m. and adj...,https://www.congress.gov/congressional-record/...
4,4,Senate,01/20/2022,1900-01-01 11:00:00,1900-01-01 16:39:00,339.0,x,Adjournment: Senate convened at 11 a.m. and a...,https://www.congress.gov/congressional-record/...
5,5,House,01/20/2022,1900-01-01 09:00:00,1900-01-01 14:14:00,314.0,x,Adjournment: The House met at 9 a.m. and adjo...,https://www.congress.gov/congressional-record/...
6,6,Senate,01/21/2022,,,,,Adjournment: The House met at 9 a.m. and adjo...,https://www.congress.gov/congressional-record/...
7,7,House,01/21/2022,1900-01-01 09:00:00,1900-01-01 09:03:00,3.0,pf,Adjournment: The House met at 9 a.m. and adjo...,https://www.congress.gov/congressional-record/...
8,8,Senate,01/24/2022,,,,,,https://www.congress.gov/congressional-record/...
9,9,House,01/24/2022,,,,,,https://www.congress.gov/congressional-record/...


In [5]:
df

Unnamed: 0,House or Senate,Date,Time Convened,Time Adjourned,Time in Session,Working Day?,Scraped adjournment string,url
0,Senate,01/21/2022,NaT,NaT,,,Adjournment: The House met at 9 a.m. and adjo...,https://www.congress.gov/congressional-record/...
1,House,01/21/2022,1900-01-01 09:00:00,1900-01-01 09:03:00,3.0,pf,Adjournment: The House met at 9 a.m. and adjo...,https://www.congress.gov/congressional-record/...


In [10]:
    results = df
    dfhouse = df
    df = df[df["House or Senate"]=="Senate"]
    Q1 = df[str("01/01/"+datestart.strftime("%Y"))<df['Date']]
    Q1 = Q1[str("03/31/"+datestart.strftime("%Y"))>=Q1['Date']]
    Q2 = df[str("03/31/"+datestart.strftime("%Y"))<df['Date']]
    Q2 = Q2[str("06/30/"+datestart.strftime("%Y"))>=Q2['Date']]
    Q3 = df[str("06/30/"+datestart.strftime("%Y"))<df['Date']]
    Q3 = Q3[str("09/30/"+datestart.strftime("%Y"))>=Q3['Date']]
    Q4 = df[str("09/30/"+datestart.strftime("%Y"))<df['Date']]
    Q4 = Q4[str("12/31/"+datestart.strftime("%Y"))>=Q4['Date']]
    ls = [len(Q1[Q1["Working Day?"] == "x"]),len(Q2[Q2["Working Day?"] == "x"]),len(Q3[Q3["Working Day?"] == "x"]),len(Q4[Q4["Working Day?"] == "x"])]

    dfhouse = dfhouse[dfhouse["House or Senate"]=="House"]
    Q1 = dfhouse[str("01/01/"+datestart.strftime("%Y"))<dfhouse['Date']]
    Q1 = Q1[str("03/31/"+datestart.strftime("%Y"))>=Q1['Date']]
    Q2 = dfhouse[str("03/31/"+datestart.strftime("%Y"))<dfhouse['Date']]
    Q2 = Q2[str("06/30/"+datestart.strftime("%Y"))>=Q2['Date']]
    Q3 = dfhouse[str("06/30/"+datestart.strftime("%Y"))<dfhouse['Date']]
    Q3 = Q3[str("09/30/"+datestart.strftime("%Y"))>=Q3['Date']]
    Q4 = dfhouse[str("09/30/"+datestart.strftime("%Y"))<dfhouse['Date']]
    Q4 = Q4[str("12/31/"+datestart.strftime("%Y"))>=Q4['Date']]
    lsH = [len(Q1[Q1["Working Day?"] == "x"]),len(Q2[Q2["Working Day?"] == "x"]),len(Q3[Q3["Working Day?"] == "x"]),len(Q4[Q4["Working Day?"] == "x"])]
    if len(ls) < len(lsH):
        for i in range(len(lsH)-len(ls)):
            ls.append(None)
    elif len(lsH) < len(ls):
        for i in range(len(ls)-len(lsH)):
            lsH.append(None)
    summary = pd.DataFrame(zip(ls,lsH),index=["Q1","Q2","Q3","Q4"],columns=["Working Days - Senate","Working Days - House"]) 
    l = pd.DataFrame([[start,dateend.strftime("%m/%d/%Y")]],columns=["Working Days - Senate","Working Days - House"],index=["Date Range"])
    summary = summary.append(l)


NameError: name 'df' is not defined

In [3]:
print()


NameError: name 'CONGRESS_API_KEY' is not defined

In [1]:
url = data['dailyCongressionalRecord'][0]['url'].replace('?format=json','/articles?api_key=UEcKdB7WGClamPX6uHFbVQq6GFaFjjbn3fTdKcDy')
print(url)
response = requests.get(url)
response.json()


NameError: name 'data' is not defined

In [42]:
import pandas as pd
# pd.DataFrame(response.json()['articles']['Daily Digest'])

response.json()['articles']
for article in response.json()['articles']:
    if article['name'] == 'Daily Digest':
        display(article)
        #filter now where title includes "Daily Digest/Senate"
        if article['title'] includes 'Daily Digest/Senate':
            display(article)
        break
    else:
        continue





{'name': 'Daily Digest',
 'sectionArticles': [{'endPage': 'D804',
   'startPage': 'D803',
   'text': [{'type': 'Formatted Text',
     'url': 'https://www.congress.gov/118/crec/2024/07/31/170/125/modified/CREC-2024-07-31-pt1-PgD803-5.htm'},
    {'type': 'PDF',
     'url': 'https://www.congress.gov/118/crec/2024/07/31/170/125/CREC-2024-07-31-pt1-PgD803-5.pdf'}],
   'title': 'Daily Digest/The CONGRESSIONAL RECORD (USPS 087-390).; Congressional Record Vol. 170, No. 125'},
  {'endPage': 'D803',
   'startPage': 'D803',
   'text': [{'type': 'Formatted Text',
     'url': 'https://www.congress.gov/118/crec/2024/07/31/170/125/modified/CREC-2024-07-31-pt1-PgD803-4.htm'},
    {'type': 'PDF',
     'url': 'https://www.congress.gov/118/crec/2024/07/31/170/125/CREC-2024-07-31-pt1-PgD803-4.pdf'}],
   'title': 'Daily Digest/COMMITTEE MEETINGS FOR 2024-08-01; Congressional Record Vol. 170, No. 125'},
  {'endPage': 'D803',
   'startPage': 'D803',
   'text': [{'type': 'Formatted Text',
     'url': 'https:/

In [11]:
import requests
from bs4 import BeautifulSoup
import os

# URL of the page containing the PDFs
url = "https://www.govinfo.gov/app/collection/crec/"

# Function to download a file
def download_file(url, local_filename):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename
# Get the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
# Find all rows in the table
rows = soup.find_all(class_='table')
print(rows)

# Create a directory to save the PDFs
if not os.path.exists('pdfs'):
    os.makedirs('pdfs')

# Iterate through rows to find the "House Section" PDFs
for row in rows:
    if 'House Section' in row.text:
        pdf_link = row.find('a', href=True)['href']
        full_url = f"https://www.govinfo.gov{pdf_link}"
        local_filename = os.path.join('pdfs', pdf_link.split('/')[-1])
        print(f"Downloading {full_url} to {local_filename}")
        download_file(full_url, local_filename)

print("Download complete.")

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<title>GovInfo</title>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0" name="viewport"/>
<meta content="" name="description"/>
<meta content="" name="author"/>
<meta content="summary" name="twitter:card">
<meta content="www.govinfo.gov" name="twitter:site">
<meta content="" name="twitter:title">
<meta content="Official Publications from the U.S. Government Publishing Office." name="twitter:description">
<meta content="https://www.govinfo.gov/sites/default/files/media/govinfo_eagle_homepage.png" name="twitter:image">
<meta content="" name="twitter:url"/>
<meta content="" property="og:title"/>
<meta content="Official Publications from the U.S. Government Publishing Office." property="og:description"/>
<meta content="https://www.govinfo.gov/sites/default/files/media/govinfo_eagle_homepage.png" property="og:image"/>
<meta content="" property="og:u