In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

dataset = pd.read_csv(
    'US-Legislative-congressional_bills_19.3_3_3.csv',
    encoding='ISO-8859-1',
    low_memory=False
)

print(dataset.head())

       id     bill_id  cong bill_type  bill_no          name_full  \
0  390521   80-S-1873    80         S     1873     Capehart, Home   
1   41199  80-HR-3000    80        HR     3000  Hoffman, Clare E.   
2   42931  80-HR-4743    80        HR     4743  Landis, Gerald W.   
3  390617    80-S-525    80         S      525     Thomas, Elbert   
4   39998  80-HR-1815    80        HR     1815  Case, Clifford P.   

                                         description intr_date  intr_month  \
0  A bill to maintain prosperity, promote full em...       NaN         NaN   
1  To amend the Employment Act of1946 so as to pr...       NaN         NaN   
2  To maintain prosperity, to promote full employ...       NaN         NaN   
3  A bill to promote the progress of science and ...       NaN         NaN   
4  To promote the progress of science; to advance...       NaN         NaN   

   year  ...  subtopic  filter_plaw  plaw_date  plaw_no  pass_h pass_s  \
0  1947  ...     100.0          0.0       

In [61]:
import re

#identify relevant phrases
bias_categories = {'Racial/Cultural': ['certain race','certain ethnicity', 'slave', 'off the reservation', 'certain Indians', 'prohibit people of color'],
                                    
                    'Economic': ['exclude poor', 'socioeconomic', 'wealth', 'certain benefits', 'decrease pensions', 'decrease in pensions',
                                 'exclude pensions', 'reform', 'cuts', 'taxpayers', 'only taxpayers'],
                                
                    'Gender': ['depending on gender', 'feminine', 'masculine', 'only males', 'sex', 'pregnant', 'pregnancy', 'woman', 'women',
                              'female', 'transgender', 'gay', 'love', 'gender'],
                
                    'Disability': ['retard', 'retarded', 'diability', 'handicap', 'medical condition', 'mental health',
                                  'healthcare'],
                   
                    'Religion': ['certain religions', 'only christians', 'no muslims', 'certain faiths', 'god', 'must be christian', 'no muslim',
                                'no muslims', 'worship'],
                   
                    'Age': ['certain age', 'certain elderly', 'only youths', 'only minors', 'certain teens'],
                   
                    'Criminal Justice' : ['policing', 'incarcerate','criminals','incarceration', 'probation', 'parole', 'felon'],
                   
                    'Education': ['school segregation', 'speak English', 'English speaker',
                                  'have to be American', 'school'],
                   
                    'Citizenship': ['illegal aliens', 'citizen' , 'citizenship', 'illegal', 'visa',
                                   'undocumented', 'immigrant'],
                   
                    'Multiple': ['favoring', 'bias', 'inequality', 'certain people', 'zones', 'mandatory']
                   
}

# illegal chars
ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x1F\x7F-\x9F]')

def clean_excel_string(s):
    if isinstance(s, str):
        return ILLEGAL_CHARACTERS_RE.sub('', s)
    return s

def contains_bias(description):
    if isinstance(description, str):
        for category, keywords in bias_categories.items():
           # Create a pattern that matches the exact word (case-insensitive)
            pattern = r'\b(?:' + '|'.join([re.escape(keyword) for keyword in keywords]) + r')\b'
            if re.search(pattern, description, re.IGNORECASE):
                return category
    return None
    
#only half of the data
subset = dataset.iloc[:len(dataset) // 2]

# Apply with progress update every 100,000 rows
bias_results = []
for idx, desc in enumerate(subset['description']):
    bias_results.append(contains_bias(desc))
    if (idx + 1) % 100000 == 0:
        print(f"Processed {idx + 1} descriptions...")

#subset['Bias Type'] = bias_results
# Assign 'Bias Type' directly to dataset (not subset)
dataset.loc[subset.index, 'Bias Type'] = bias_results

bias_laws = dataset[dataset['Bias Type'].notna()]

# Remove duplicates
bias_laws = bias_laws.drop_duplicates(subset=['description'])

bias_laws = bias_laws.drop_duplicates(subset=['bill_id'])



bias_laws['description'] = bias_laws['description'].apply(clean_excel_string)
bias_laws_sorted = bias_laws.sort_values(by='Bias Type')
bias_laws_sorted = bias_laws_sorted.rename(columns={'description': 'Policy', 'year': 'Year'})

bias_laws_sorted['Id'] = range(1, len(bias_laws_sorted) + 1)
bias_laws_sorted['Normative Framing'] = 'Explicit'
bias_laws_sorted['Source'] = 'https://www.comparativeagendas.net/project/us/datasets'
output_file = 'biased_congressional_bills(1).xlsx'

bias_laws_sorted[['Id', 'Policy','Year', 'Bias Type', 'Normative Framing', 'Source']].to_excel(output_file, index=False)

print(bias_laws_sorted[['Id', 'Policy','Year', 'Bias Type', 'Normative Framing', 'Source']])
print(f"Filtered laws have been saved to {output_file}")

Processed 100000 descriptions...
Processed 200000 descriptions...
          Id                                             Policy  Year  \
225308     1  To amend chapter 44 of title 18 of the United ...  1972   
52990      2  To amend title I of the Social Security Act to...  1955   
62881      3  To amend certain provisions of law relating to...  1956   
225414     4  To amend chapter 44 of title 18 of the United ...  1972   
82108      5  To amend the Internal Revenue Code of 1954 so ...  1958   
...      ...                                                ...   ...   
205762  3637  A bill to provide for the issuance of special ...  1970   
90181   3638  To amend the Internal Revenue Code of 1954 to ...  1959   
168872  3639  To require the words In God We Trust on the Su...  1967   
2914    3640  To provide a commission to supervise the const...  1947   
39847   3641  A bill to require that the motto In God We Tru...  1953   

          Bias Type Normative Framing  \
225308          