In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### 1. Extract all text into a df 

In [3]:
sessions = [112,113,114,115,116,117,118,119]
text_dict = {}

for session in sessions:

    df_rows = []
    
    bills_path = f"./data/congress_api_data_10_21/{session}/text"

    for file in os.listdir(bills_path):

        match = re.search(r'(\d+)[^\da-zA-Z]*([A-Za-z]+)[^\da-zA-Z]*(\d+)', file, re.IGNORECASE)
        if match:
            result = f"{match.group(1)}-{match.group(2).lower()}{match.group(3)}"
            bill_id = result 
        else:
            bill_id = ""

        file_path = os.path.join(bills_path, file)

        with open(file_path) as file:
            soup = BeautifulSoup(file, 'html.parser')

            df_rows.append({
                "bill_id": bill_id,
                "text": soup.get_text(separator=" ", strip=True).replace("\n"," ")
            })

    df = pd.DataFrame(df_rows)
    text_dict[session] = df
    

In [4]:
all_text = pd.concat(text_dict).reset_index().drop(columns=['level_1']).rename(columns={'level_0':'congress'})

all_text

Unnamed: 0,congress,bill_id,text
0,112,112-hr6397,[Congressional Bills 112th Congress] [From the...
1,112,112-hr6361,[Congressional Bills 112th Congress] [From the...
2,112,112-hr5884,[Congressional Bills 112th Congress] [From the...
3,112,112-hr6395,[Congressional Bills 112th Congress] [From the...
4,112,112-hr5940,[Congressional Bills 112th Congress] [From the...
...,...,...,...
1258,119,119-hr889,[Congressional Bills 119th Congress] [From the...
1259,119,119-s965,[Congressional Bills 119th Congress] [From the...
1260,119,119-hr5387,[Congressional Bills 119th Congress] [From the...
1261,119,119-hr4457,[Congressional Bills 119th Congress] [From the...


In [5]:
all_text['word_count'] = all_text['text'].apply(lambda x: len(word_tokenize(x)))

all_text.sort_values(by=['word_count'],axis=0,ascending=False)


Unnamed: 0,congress,bill_id,text,word_count
45,113,113-hr2767,[Congressional Bills 113th Congress] [From the...,61336
390,115,115-hr6746,[Congressional Bills 115th Congress] [From the...,60543
1257,119,119-s2651,[Congressional Bills 119th Congress] [From the...,60468
963,118,118-hr6970,[Congressional Bills 118th Congress] [From the...,50830
824,117,117-s2820,[Congressional Bills 117th Congress] [From the...,49019
...,...,...,...,...
250,114,114-hr88,[Congressional Bills 114th Congress] [From the...,201
1014,118,118-s3755,[Congressional Bills 118th Congress] [From the...,194
897,118,118-s5111,[Congressional Bills 118th Congress] [From the...,192
906,118,118-hr9694,[Congressional Bills 118th Congress] [From the...,185


In [97]:
all_text["word_count"].describe()

count      863.000000
mean      2565.741599
std       4772.205085
min        178.000000
25%        617.000000
50%       1276.000000
75%       2445.500000
max      60468.000000
Name: word_count, dtype: float64

In [6]:
stop_words = list(text.ENGLISH_STOP_WORDS.union([
    'section', 'shall', 'act', 'subsection', 'ii', 'sec'  # add your own here
]))

vectorizer = CountVectorizer(stop_words=stop_words, max_features=1000)  # limit to top 1000 features
X = vectorizer.fit_transform(all_text['text'])

In [7]:
vocab = vectorizer.get_feature_names_out()
word_freq = X.toarray().sum(axis=0)
top_words = pd.DataFrame({'word': vocab, 'count': word_freq}).sort_values('count', ascending=False)


In [8]:
print("Number of documents:", len(all_text))
print("Average document length (words):", all_text['word_count'].mean())
print("\nTop 10 most frequent words:")
print(top_words.head(25))

Number of documents: 1263
Average document length (words): 2481.8558986539983

Top 10 most frequent words:
            word  count
471      housing  34553
845    secretary  17732
143   assistance  10513
715      program   9527
736       public   9417
344     eligible   8303
935         term   8086
655    paragraph   7983
311  development   7510
890       states   7116
400      federal   6387
56            42   6232
966       united   6173
100       agency   6059
492    including   5930
942        title   5851
888        state   5850
599     mortgage   5696
413    following   5590
438        grant   5567
493       income   5458
428      general   5370
234     congress   5243
282         date   5209
218    community   5147
