In [None]:
from datetime import datetime
from dotenv import load_dotenv
from openai import AsyncOpenAI
from pathlib import Path
from typing import Dict, List, Tuple

import asyncio
import csv
import json
import os
import openai
import re
import statistics

# Load variables from a local .env file into the process environment, then read
# the OpenAI API key from the OPENAI_API_KEY environment variable.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Do not print the key here: notebook outputs are saved with the file and would leak it.

# Shared async client used by the LLM-based analysis functions in this notebook.
# NOTE(review): AsyncOpenAI() is constructed without arguments, so it presumably
# picks up the key from the environment on its own — confirm against the SDK docs.
client = AsyncOpenAI()

In [None]:
def get_latest_version_markdown(json_path):
    """
    Load a JSON file and return the 'markdown' field of the first object
    whose 'latest_version' equals 1.

    Lookup order:
        1. Top-level list: each dict element, in order.
        2. Top-level dict: the dict itself.
        3. Top-level dict: dict elements of every list-valued field, in order.

    Elements that are not dicts (strings, numbers, nulls, nested lists) are
    skipped rather than raising, in both the top-level-list and nested-list
    cases.

    Parameters:
        json_path (str): Path to the JSON file (read as UTF-8).

    Returns:
        str: Markdown text of the latest version, or None if no object is
        flagged as the latest version (or the flagged object has no
        'markdown' field).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        candidate_lists = [data]
    elif isinstance(data, dict):
        # The document itself may be the latest version.
        if data.get("latest_version") == 1:
            return data.get("markdown")
        candidate_lists = [value for value in data.values() if isinstance(value, list)]
    else:
        # Scalars (string, number, null) cannot hold a version object.
        return None

    for items in candidate_lists:
        for item in items:
            # Guard against non-dict entries so one stray value does not abort the lookup.
            if isinstance(item, dict) and item.get("latest_version") == 1:
                return item.get("markdown")

    return None

In [None]:
# deterministic bill analysis functions

def getCountOfAIWords(bill_text):
    """
    Tally whole-word occurrences of AI-related terms in the bill text.

    Matching is case-insensitive. Each term is counted independently, so a
    phrase such as "generative AI" contributes to both the "generative AI"
    and the "AI" tallies.

    Returns:
        dict: One entry per tracked term mapping to its match count, plus a
        'total' entry holding the sum over all terms.
    """
    # (label, regex) pairs; \b anchors keep short terms from matching inside longer words.
    term_patterns = (
        ("artificial intelligence", r"\bartificial\s+intelligence\b"),
        ("AI", r"\bAI\b"),
        ("machine learning", r"\bmachine\s+learning\b"),
        ("deep learning", r"\bdeep\s+learning\b"),
        ("neural networks", r"\bneural\s+networks?\b"),  # singular or plural
        ("natural language processing", r"\bnatural\s+language\s+processing\b"),
        ("computer vision", r"\bcomputer\s+vision\b"),
        ("automated decision-making", r"\bautomated\s+decision-?making\b"),
        ("LLM", r"\bLLM\b"),
        ("large language model", r"\blarge\s+language\s+models?\b"),
        ("generative AI", r"\bgenerative\s+AI\b"),
    )

    lowered = bill_text.lower()
    tallies = {
        label: sum(1 for _ in re.finditer(regex, lowered, re.IGNORECASE))
        for label, regex in term_patterns
    }
    tallies['total'] = sum(tallies.values())
    return tallies

def extract_ai_context_char_robust(bill_text, char_window=300):
    """
    Collect a character-window excerpt around every "artificial intelligence"
    mention (case-insensitive) in the bill text.

    Each window extends `char_window` characters on both sides of the match
    and is then widened outward to the nearest whitespace or sentence
    punctuation so that no word is cut in half.

    Returns:
        list[dict]: One dict per match with keys 'position', 'char_range',
        'context', 'matched_text', 'at_beginning' and 'at_end'. Empty list
        when `bill_text` is empty or None.
    """
    if not bill_text:
        return []

    boundary_chars = ' \n\t.!?'
    text_len = len(bill_text)
    phrase_re = re.compile(r'\bartificial\s+intelligence\b', re.IGNORECASE)

    def _snapshot(hit):
        # Raw window, clamped to the text bounds.
        lo = max(0, hit.start() - char_window)
        hi = min(text_len, hit.end() + char_window)

        # Walk left until the character before the window is a boundary.
        while lo > 0 and bill_text[lo - 1] not in boundary_chars:
            lo -= 1
        # Walk right until the character at the window edge is a boundary.
        while hi < text_len and bill_text[hi] not in boundary_chars:
            hi += 1

        return {
            'position': hit.start(),
            'char_range': f"{lo}-{hi}",
            'context': bill_text[lo:hi].strip(),
            'matched_text': hit.group(),
            'at_beginning': lo == 0,
            'at_end': hi >= text_len,
        }

    return [_snapshot(hit) for hit in phrase_re.finditer(bill_text)]



In [None]:
# LLM-based bill analysis functions

async def rateBillOneToTenAsync(bill_text):
    """
    Ask the chat model how strongly the bill concerns AI / artificial
    intelligence, on a scale of 1 (least) to 10 (most).

    Returns:
        int: The model's reply parsed as an integer. A reply that is not a
        bare integer makes `int()` raise ValueError.
    """
    instructions = """On a scale of 1 (least) - 10 (most), rate how much the following bill is about artificial intelligence.\n\nYou should respond with just a single integer, without any additional text."""

    # Request parameters for the OpenAI chat-completions call; max_tokens is
    # kept small because only a single integer is expected back.
    request = dict(
        model="chatgpt-4o-latest",
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": bill_text},
        ],
        max_tokens=10,
        temperature=0.0,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        n=1,
        stop=None,
    )
    completion = await client.chat.completions.create(**request)

    reply = completion.choices[0].message.content
    return int(reply.strip())

async def getThreeSentenceSummaryAsync(bill_text):
    """
    Ask the chat model for a three-sentence summary of the bill.

    Not called by the default pipeline in this notebook (bills ended up not
    being summarized), but kept for anyone who wants summaries.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    instructions = """Summarize the following bill in three sentences: What are the primary topics of the bill, who if anyone will take different action based on the bill, and what actions (if any) are required to be taken by private sector. Be careful to distinguish between requirements, recommendations, positive incentives, etc.\n\nYou should respond with just the summary, without any additional text."""

    # Request parameters for the OpenAI chat-completions call.
    request = dict(
        model="chatgpt-4o-latest",
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": bill_text},
        ],
        max_tokens=300,
        temperature=0.0,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        n=1,
        stop=None,
    )
    completion = await client.chat.completions.create(**request)

    reply = completion.choices[0].message.content
    return reply.strip()


async def rateBillOnSpecificPrivateSectorRequirementsAsync(bill_text):
    """
    Ask the chat model how much the bill imposes specific AI-related
    requirements on private-sector actors (such as companies developing
    advanced AI models), on a scale of 1 (least) to 10 (most).

    Returns:
        int: The model's reply parsed as an integer. A reply that is not a
        bare integer makes `int()` raise ValueError.
    """
    instructions = """On a scale of 1 (least) - 10 (most), rate how much the following bill imposes specific requirements about artificial intelligence on private-sector developers of artificial intelligence, like OpenAI and Anthropic.\n\nYou should respond with just a single integer, without any additional text."""

    # Request parameters for the OpenAI chat-completions call; max_tokens is
    # kept small because only a single integer is expected back.
    request = dict(
        model="chatgpt-4o-latest",
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": bill_text},
        ],
        max_tokens=10,
        temperature=0.0,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        n=1,
        stop=None,
    )
    completion = await client.chat.completions.create(**request)

    reply = completion.choices[0].message.content
    return int(reply.strip())

In [None]:
# aggregator analysis functions (run multiple analyses on a bill, analyze multiple bills)

# Running analyze_all_bills_in_folder will create a directory structure like:
# bill_analysis_results/
#   ├── ai_bill_analysis_incremental_20250630_143022.csv  (live progress tracking)
#   ├── ai_bill_analysis_master_20250630_143022.json      (cumulative results)
#   ├── AK2025000HCR3_analysis.json                       (individual bill result)
#   ├── CA2025000SB47_analysis.json                       (individual bill result)
#   └── ... (one JSON per analyzed bill)

async def analyze_single_bill(json_path: str) -> Dict:
    """
    Run the full analysis pipeline on one bill JSON file.

    Steps, in order: size check, markdown extraction, AI-relatedness rating,
    AI term counts, context excerpts, and (only when the average rating is
    6 or higher) the private-sector requirement rating.

    Returns:
        Dict: Always returned, even on failure. Keys: 'file_path',
        'file_name', 'ratings', 'average_rating', 'summary',
        'ai_word_counts', 'ai_contexts', 'error', plus
        'requirement_assessment' when that step ran. On failure 'error'
        holds a message and fields not yet computed keep their defaults
        (note 'average_rating' defaults to 0, which is not a real rating).
    """
    # Bills above this size are skipped because of OpenAI API rate limits.
    size_limit_bytes = 10 * 1024 * 1024  # 10 MB
    # Number of independent AI-relatedness ratings to request (raise to e.g. 3 to average several).
    rating_runs = 1

    result = dict(
        file_path=json_path,
        file_name=os.path.basename(json_path),
        ratings=[],
        average_rating=0,
        summary=None,
        ai_word_counts=None,
        ai_contexts=None,
        error=None,
    )

    try:
        if os.path.getsize(json_path) > size_limit_bytes:
            result['error'] = "File size"
            return result

        bill_markdown = get_latest_version_markdown(json_path)
        if not bill_markdown:
            result['error'] = "No latest version found in JSON"
            return result

        # AI-relatedness rating(s), requested concurrently.
        scores = await asyncio.gather(
            *(rateBillOneToTenAsync(bill_markdown) for _ in range(rating_runs))
        )
        result['ratings'] = scores
        result['average_rating'] = statistics.mean(scores) if scores else None

        # Deterministic term counts.
        result['ai_word_counts'] = getCountOfAIWords(bill_markdown)

        # Character-window excerpts around "artificial intelligence" mentions.
        # The default window is fairly short; lengthen it or switch to
        # sentence-based extraction if the excerpts are hard to interpret.
        result['ai_contexts'] = extract_ai_context_char_robust(bill_markdown)

        # Requirement assessment only for bills that are in fact AI-related
        # (average rating of 6 or higher).
        if result['average_rating'] >= 6:
            result['requirement_assessment'] = (
                await rateBillOnSpecificPrivateSectorRequirementsAsync(bill_markdown)
            )
            # To also summarize the bill:
            # result['summary'] = await getThreeSentenceSummaryAsync(bill_markdown)

    except Exception as e:
        # Keep whatever was computed before the failure and record the reason.
        result['error'] = str(e)
        print(f"Error processing {json_path}: {e}")

    return result

async def analyze_all_bills_in_folder(folder_path: str, max_concurrent: int = 2, output_dir: str = "bill_analysis_results") -> Tuple[List[Dict], List[Tuple[str, str]]]:
    """
    Analyze every ``*.json`` file in a folder with bounded concurrency,
    saving results incrementally as each bill finishes.

    Files written to ``output_dir`` (created, including parents, if missing):
        - ``ai_bill_analysis_incremental_<timestamp>.csv``: header plus one
          row appended per processed bill.
        - ``ai_bill_analysis_master_<timestamp>.json``: rewritten after each
          bill with all results completed so far (completion order).
        - ``<bill stem>_analysis.json``: one file per processed bill.

    Args:
        folder_path: Path to folder containing JSON files.
        max_concurrent: Maximum number of bills analyzed at the same time.
        output_dir: Directory to save incremental results.

    Returns:
        A ``(results, error_files)`` tuple. ``results`` has one analysis dict
        per input file, in sorted file-name order. ``error_files`` has one
        ``(file_name, error_message)`` pair per bill whose analysis reported
        an error. Both lists are empty when the folder has no JSON files, so
        callers can always unpack two values.
    """
    # parents=True so a nested output_dir does not fail on a missing parent.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # One timestamp per run keeps the CSV and master JSON of a run paired.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_file = output_path / f"ai_bill_analysis_incremental_{timestamp}.csv"
    master_file = output_path / f"ai_bill_analysis_master_{timestamp}.json"

    # Explicit UTF-8: summaries may contain non-ASCII text that the platform
    # default encoding cannot represent.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['Timestamp', 'File Name', 'Average Rating', 'Rating 1', 'Rating 2', 'Rating 3', 'AI Term Count', 'Summary (truncated)'])

    # Sorted so the processing and result order is reproducible across runs.
    json_files = sorted(Path(folder_path).glob("*.json"))

    if not json_files:
        print(f"No JSON files found in {folder_path}")
        # Same shape as the normal return value.
        return [], []

    print(f"Found {len(json_files)} JSON files to analyze")
    print(f"Results will be saved to: {output_path}")

    # Semaphore caps how many bills are in flight at once.
    semaphore = asyncio.Semaphore(max_concurrent)
    completed_results = []  # completion order; backs the master JSON
    error_files = []

    async def analyze_with_semaphore(json_path, index):
        async with semaphore:
            progress = f"[{index+1}/{len(json_files)}]"
            print(f"{progress} Analyzing {json_path.name}...")
            result = await analyze_single_bill(str(json_path))

            if result['error']:
                # Remember the file so it can be retried later.
                error_files.append((json_path.name, result['error']))
                print(f"Error processing {json_path.name}: {result['error']}")

            # Individual result file.
            individual_result_file = output_path / f"{json_path.stem}_analysis.json"
            with open(individual_result_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, default=str)

            # average_rating may be None when no rating was produced; do not
            # let number formatting abort the whole run in that case.
            average_rating = result['average_rating']
            rating_text = f"{average_rating:.1f}" if average_rating is not None else 'N/A'

            ratings = result['ratings']
            rating_columns = [ratings[i] if i < len(ratings) else 'N/A' for i in range(3)]
            ai_count = result['ai_word_counts']['total'] if result['ai_word_counts'] else 0
            summary = result['summary']
            summary_truncated = summary[:200] + '...' if summary and len(summary) > 200 else summary

            # Append this bill's row to the incremental CSV.
            with open(csv_file, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow([
                    datetime.now().isoformat(),
                    result['file_name'],
                    rating_text,
                    *rating_columns,
                    ai_count,
                    summary_truncated,
                ])

            # Rewrite the master file with everything completed so far.
            completed_results.append(result)
            with open(master_file, 'w', encoding='utf-8') as f:
                json.dump(completed_results, f, indent=2, default=str)

            print(f"{progress} Completed {json_path.name} - Average rating: {rating_text} - Saved to {individual_result_file.name}")
            if 'requirement_assessment' in result:
                print(f"  - Requirement Assessment: {result['requirement_assessment']}")
            return result

    # gather() returns results in the order of `tasks`, i.e. sorted file order.
    tasks = [analyze_with_semaphore(json_file, i) for i, json_file in enumerate(json_files)]
    results = await asyncio.gather(*tasks)

    print("\nAnalysis complete!")
    print(f"Total bills analyzed: {len(results)}")
    print(f"\nAll results saved to: {output_path}")
    print("  - Individual JSONs: *_analysis.json")
    print(f"  - Master JSON: ai_bill_analysis_master_{timestamp}.json")

    return results, error_files

In [None]:
# Example of how to run the analysis over every bill JSON in a folder.
# NOTE: the top-level `await` below works in a notebook cell, not in a plain Python script.

folder_path = "bill_texts"
output_dir = "bill_texts_results"  # Results will be saved here incrementally

# This will save results as each bill is processed
results, error_files = await analyze_all_bills_in_folder(
    folder_path, 
    max_concurrent=2,
    output_dir=output_dir
)

# add print-lines as wanted
# (the `>= 0` threshold is a knob: raise it to print only the more AI-related bills)
for result in results:
    if result['average_rating'] >= 0:
        print(f"\nFile: {result['file_name']}")
        print(f"Ratings: {result['ratings']} (Average: {result['average_rating']:.1f})")
        if result['summary']:
            print(f"\nSummary:\n{result['summary']}")
        print("-"*80)

# get a file with errors so you know which if any bills to re-run
# a few are too long for standard OpenAI API rate-limits, but more commonly they're fine so long as you haven't done too many tokens in close proximity
# standard gpt-4o ratelimit is 30K tokens per minute 
error_file_path = output_dir + "/error_files.txt"
with open(error_file_path, 'w') as f:
    for file_name, error in error_files:
        f.write(f"{file_name}: {error}\n")
        print(f"  - {file_name}: {error}")

# Bills whose analysis failed keep the default average_rating of 0 set in
# analyze_single_bill, which is why a 0 must be treated as "no rating".
print("Careful! If you are doing any analysis, make sure to exclude bills that are rated 0/10 on AI-relatedness or requirements, as 0/10 is a sign of an error, not an actual rating.")



In [None]:
# Copy a hand-picked subset of bills into a separate folder, so that
# analyze_all_bills_in_folder can be pointed at just those bills.
bills_to_analyze = [
    "FILL_IN_WITH_PATHS.json",
]

# Destination folder for the copies; created if it does not exist yet.
output_folder = "folder_to_use_as_input_to_analysis"
os.makedirs(output_folder, exist_ok=True)
for bill in bills_to_analyze:
    bill_path = Path("bill_texts") / bill
    if bill_path.exists():
        # Read as UTF-8, matching how get_latest_version_markdown opens bill
        # files, so non-ASCII bill text loads the same way on every platform.
        with open(bill_path, 'r', encoding='utf-8') as f:
            bill_data = json.load(f)
        # The input is closed before the copy is written.
        with open(Path(output_folder) / bill, 'w', encoding='utf-8') as out_f:
            json.dump(bill_data, out_f, indent=2)
    else:
        print(f"Bill file {bill} does not exist in the original folder.")


In [None]:
# analysis of a single bill, specified by path
bill_path = "bill_texts/NY2025000A6453.json"
# bill_text is None when the JSON has no object flagged latest_version == 1;
# check for that before sending it to the model.
bill_text = get_latest_version_markdown(bill_path)
# now run async analysis on whatever you'd like
# (top-level `await` works in a notebook cell, not in a plain Python script)
rating = await rateBillOneToTenAsync(bill_text)
print(f"Rating for {bill_path}: {rating}")