In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### 1. Extract all text into a df 

In [4]:
sessions = [110,111,112,113,114,115,116,117,118,119]
text_dict = {}

for session in sessions:

    df_rows = []
    
    bills_path = f"../data/congress_api_data_10_21/{session}/text"

    for file in os.listdir(bills_path):

        match = re.search(r'(\d+)[^\da-zA-Z]*([A-Za-z]+)[^\da-zA-Z]*(\d+)', file, re.IGNORECASE)
        if match:
            result = f"{match.group(1)}-{match.group(2).lower()}{match.group(3)}"
            bill_id = result 
        else:
            bill_id = ""

        file_path = os.path.join(bills_path, file)

        with open(file_path) as file:
            soup = BeautifulSoup(file, 'html.parser')

            df_rows.append({
                "bill_id": bill_id,
                "text": soup.get_text(separator=" ", strip=True).replace("\n"," ")
            })

    df = pd.DataFrame(df_rows)
    text_dict[session] = df
    

In [5]:
all_text = pd.concat(text_dict).reset_index().drop(columns=['level_1']).rename(columns={'level_0':'congress'})

all_text

Unnamed: 0,congress,bill_id,text
0,110,110-hr1752,[Congressional Bills 110th Congress] [From the...
1,110,110-hr1999,[Congressional Bills 110th Congress] [From the...
2,110,110-s1386,[Congressional Bills 110th Congress] [From the...
3,110,110-hr1227,[Congressional Bills 110th Congress] [From the...
4,110,110-hr1515,[Congressional Bills 110th Congress] [From the...
...,...,...,...
1652,119,119-hr889,[Congressional Bills 119th Congress] [From the...
1653,119,119-s965,[Congressional Bills 119th Congress] [From the...
1654,119,119-hr5387,[Congressional Bills 119th Congress] [From the...
1655,119,119-hr4457,[Congressional Bills 119th Congress] [From the...


In [6]:
all_text['word_count'] = all_text['text'].apply(lambda x: len(word_tokenize(x)))

all_text.sort_values(by=['word_count'],axis=0,ascending=False)


Unnamed: 0,congress,bill_id,text,word_count
20,110,110-hr1427,[Congressional Bills 110th Congress] [From the...,74252
439,113,113-hr2767,[Congressional Bills 113th Congress] [From the...,61336
784,115,115-hr6746,[Congressional Bills 115th Congress] [From the...,60543
1651,119,119-s2651,[Congressional Bills 119th Congress] [From the...,60468
11,110,110-s1100,[Congressional Bills 110th Congress] [From the...,55972
...,...,...,...,...
1300,118,118-hr9694,[Congressional Bills 118th Congress] [From the...,185
24,110,110-hr935,[Congressional Bills 110th Congress] [From the...,183
32,110,110-hr384,[Congressional Bills 110th Congress] [From the...,180
1285,118,118-hr5755,[Congressional Bills 118th Congress] [From the...,178


In [7]:
all_text["word_count"].describe()

count     1657.000000
mean      2637.670489
std       5501.527797
min        174.000000
25%        556.000000
50%       1216.000000
75%       2449.000000
max      74252.000000
Name: word_count, dtype: float64

In [12]:
stop_words = list(text.ENGLISH_STOP_WORDS.union([
    "shall","section","subsection","paragraph",
                     "ii","i","iii","sec","senate","bill","amend",
                     "enacted","act","secretary","subparagraph",
                     "title","may"  # add your own here
]))

vectorizer = CountVectorizer(stop_words=stop_words, max_features=1000)  # limit to top 1000 features
X = vectorizer.fit_transform(all_text['text'])

In [13]:
vocab = vectorizer.get_feature_names_out()
word_freq = X.toarray().sum(axis=0)
top_words = pd.DataFrame({'word': vocab, 'count': word_freq}).sort_values('count', ascending=False)


In [14]:
print("Number of documents:", len(all_text))
print("Average document length (words):", all_text['word_count'].mean())
print("\nTop 10 most frequent words:")
print(top_words.head(25))

Number of documents: 1657
Average document length (words): 2637.670488835244

Top 10 most frequent words:
            word  count
483      housing  44776
148   assistance  13727
721      program  12792
741       public  12328
606     mortgage  11118
938         term  11013
353     eligible  10061
319  development   9899
409      federal   9654
897       states   9414
105       agency   8877
561         loan   8348
969       united   8225
422    following   8043
504    including   7921
895        state   7825
60            42   7524
439      general   7512
448        grant   7282
289         date   7240
505       income   6962
242     congress   6921
996         year   6808
725      project   6493
972        urban   6380
