In [7]:
import requests
import json
import pandas as pd
import time
import os
from datetime import datetime

# Director columns, in the order they are added to every result row.
_DIRECTOR_FIELDS = ('first_name', 'last_name', 'email', 'org')


def _value_or_unknown(mapping, key):
    """Return ``mapping[key]``, or 'UNKNOWN' when the key is absent or null."""
    value = mapping.get(key)
    return 'UNKNOWN' if value is None else value


def _fetch_program_director(session, program_id, timeout):
    """Fetch the program director's details for one program.

    Returns a dict with the keys in ``_DIRECTOR_FIELDS``. Every value is
    'UNKNOWN' when the lookup fails, the response is not HTTP 200, or no
    director entry is present in the response's ``included`` list.
    """
    director = dict.fromkeys(_DIRECTOR_FIELDS, 'UNKNOWN')
    detail_url = (
        f"https://freida-admin.ama-assn.org/api/node/program?"
        f"filter[field_program_id]={program_id}"
        f"&include=field_survey.field_program_director,field_survey.field_program_contact"
    )

    try:
        detail_response = session.get(detail_url, timeout=timeout)
        if detail_response.status_code != 200:
            return director
        included = detail_response.json().get('included') or []
    except Exception as e:
        # Best effort: keep the program row, just without director details.
        print(f"    Director lookup failed for program {program_id}: {e}")
        return director

    for item in included:
        if item.get('type') != 'paragraph--program_individual':
            continue
        attrs = item.get('attributes') or {}
        if attrs.get('parent_field_name') != 'field_program_director':
            continue
        # field_address may be null in the payload; treat that as "no address".
        addr = attrs.get('field_address') or {}
        director['first_name'] = _value_or_unknown(attrs, 'field_first_name')
        director['last_name'] = _value_or_unknown(attrs, 'field_last_name')
        director['email'] = _value_or_unknown(attrs, 'field_email')
        director['org'] = _value_or_unknown(addr, 'organization')
        break  # only the first director entry is used

    return director


def _build_program_record(program, spec_code, loc_code, session, timeout):
    """Turn one entry of the listing response into a flat result row."""
    attributes = program.get('attributes') or {}
    address = attributes.get('field_address') or {}

    program_info = {
        'program_id': _value_or_unknown(attributes, 'field_program_id'),
        'title': _value_or_unknown(attributes, 'title'),
        'spec_code': spec_code,
        'location_code': loc_code,
        'state': _value_or_unknown(address, 'administrative_area'),
        'city': _value_or_unknown(address, 'locality'),
    }

    # The detail lookup filters on the program id, so it is skipped without one.
    if program_info['program_id'] != 'UNKNOWN':
        program_info.update(
            _fetch_program_director(session, program_info['program_id'], timeout)
        )
    else:
        program_info.update(dict.fromkeys(_DIRECTOR_FIELDS, 'UNKNOWN'))

    return program_info


def get_programs_for_specialty(spec_code, session, location_mapping, *,
                               limit=25, page_delay=0.5, timeout=30):
    """Scrape programs (with program director details) for a single specialty.

    For every region in ``location_mapping`` the program listing is paged
    through ``limit`` entries at a time, and one extra detail request per
    program looks up its director.

    Args:
        spec_code: Specialty identifier, sent as the
            ``field_specialty.drupal_internal__nid`` filter value.
        session: ``requests.Session``-like object; only
            ``get(url, timeout=...)`` is used.
        location_mapping: Dict mapping a region code to the list of values
            sent as the ``field_address.administrative_area`` filter.
        limit: Page size of the listing requests.
        page_delay: Seconds to sleep after each non-empty page (rate limiting).
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the scrape.

    Returns:
        A list of dicts with the keys program_id, title, spec_code,
        location_code, state, city, first_name, last_name, email and org.
        Values that are missing or null in the API response are 'UNKNOWN'.
        Request failures are printed and end the current region early; they
        are never raised to the caller.
    """
    result_list = []

    for loc_code, states in location_mapping.items():
        # The state filter is identical for every page of a region: build it once.
        filter_values = "&".join(
            f"filter[location][condition][value][]={state}" for state in states
        )
        offset = 0
        has_more = True

        while has_more:
            api_url = (
                f"https://freida-admin.ama-assn.org/api/node/program?"
                f"fields[node--program]=title,path,field_address,field_program_id,field_specialty"
                f"&include=field_specialty,field_survey"
                f"&page[offset]={offset}"
                f"&page[limit]={limit}"
                f"&filter[specialty][condition][operator]=IN"
                f"&filter[specialty][condition][path]=field_specialty.drupal_internal__nid"
                f"&filter[specialty][condition][value][]={spec_code}"
                f"&filter[location][condition][operator]=IN"
                f"&filter[location][condition][path]=field_address.administrative_area"
                f"&{filter_values}"
                f"&sort=-field_survey.field_first_year_positions,title"
            )

            try:
                response = session.get(api_url, timeout=timeout)
                if response.status_code != 200:
                    print(f"    HTTP {response.status_code} for specialty {spec_code}, "
                          f"region {loc_code}; skipping the rest of this region")
                    break
                data = response.json()
                programs = data.get('data') or []
                next_link = (data.get('links') or {}).get('next')
            except Exception as e:
                print(f"    Error: {e}")
                break

            if not programs:
                break

            for program in programs:
                try:
                    result_list.append(
                        _build_program_record(program, spec_code, loc_code, session, timeout)
                    )
                except Exception as e:
                    # One malformed entry must not abort the page, but it
                    # should not vanish silently either.
                    print(f"    Skipped a malformed program entry: {e}")

            offset += limit
            has_more = bool(next_link)

            time.sleep(page_delay)  # Rate limiting

    return result_list

def scrape_all_residencies():
    """Scrape every specialty in the built-in list and save the programs to CSV.

    Calls ``get_programs_for_specialty`` once per specialty code, rewrites
    ``all_residency_programs.csv`` after each specialty that returned
    programs (so a crash loses little work), and finally removes duplicate
    program ids, saves the file again and prints a short summary.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    # Specialty identifiers passed to get_programs_for_specialty as spec_code.
    all_residencies = [
        336551, 1500546, 1500541, 42641, 42646, 42896, 294931, 42686, 43496,
        42701, 43516, 43511, 42736, 43466, 43461, 42771, 43451, 43506, 43411,
        43501, 43446, 43491, 43406, 1224691, 43456, 43416, 294936, 42756,
        43471, 42866, 42876, 1224696, 42926, 42931, 336556, 42956, 42966,
        43011, 43016, 43031, 43086, 43431, 43426, 43486, 43441, 43436, 43176,
        43201, 43211, 43236, 43421, 43476, 43221, 43281, 43481, 43326, 43521,
        43376, 43356
    ]

    # Region code -> values sent as the location filter. Each list starts with
    # the region code itself, which is therefore sent as a filter value too.
    location_mapping = {
        "01": ["01", "CT", "MA", "ME", "NH", "RI", "VT"],
        "02": ["02", "NJ", "NY"],
        "03": ["03", "DE", "MD", "PA", "VA", "WV", "DC"],
        "04": ["04", "AL", "FL", "GA", "KY", "MS", "NC", "SC", "TN"],
        "05": ["05", "IL", "IN", "MI", "OH", "WI"],
        "06": ["06", "AR", "LA", "NM", "OK", "TX"],
        "07": ["07", "IA", "KS", "MN", "MO", "NE", "ND", "SD"],
        "08": ["08", "CO", "ID", "MT", "UT", "WY"],
        "09": ["09", "AK", "AZ", "CA", "HI", "NV", "OR", "WA"],
        "PR": ["PR"]
    }

    # Master list to hold all results
    all_results = []

    # CSV file path
    csv_path = "all_residency_programs.csv"

    # Track progress
    total_specialties = len(all_residencies)

    print(f"Starting scrape of {total_specialties} specialties")
    print("=" * 50)

    # The context manager closes the session's pooled connections when the
    # scrape finishes or fails.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://freida.ama-assn.org/'
        })

        for idx, spec_code in enumerate(all_residencies, 1):
            print(f"\nSpecialty {idx}/{total_specialties}: {spec_code}")
            print("-" * 30)

            try:
                # Get programs for this specialty
                programs = get_programs_for_specialty(spec_code, session, location_mapping)

                if programs:
                    print(f"  Found {len(programs)} programs")
                    all_results.extend(programs)

                    # Save after each specialty (in case of crashes)
                    pd.DataFrame(all_results).to_csv(csv_path, index=False)
                    print(f"  Total collected so far: {len(all_results)}")
                else:
                    print("  No programs found")

            except Exception as e:
                # Best effort: report the failure and move on to the next specialty.
                print(f"  ERROR with specialty {spec_code}: {e}")

            # Longer delay between specialties (also applied after a failure)
            time.sleep(2)

    # Final save and summary
    if not all_results:
        print("\nNo programs were collected")
        return

    df = pd.DataFrame(all_results)

    # Remove duplicates based on program_id, keeping the last occurrence.
    # Rows without a real id ('UNKNOWN' or null) are different programs that
    # merely lack an id, so they are never treated as duplicates of each other.
    original_count = len(df)
    has_real_id = df['program_id'].notna() & (df['program_id'] != 'UNKNOWN')
    redundant = has_real_id & df.duplicated(subset=['program_id'], keep='last')
    df = df[~redundant]
    duplicates_removed = original_count - len(df)

    # Save final CSV
    df.to_csv(csv_path, index=False)

    print("\n" + "=" * 50)
    print("SCRAPING COMPLETE")
    print(f"Total programs collected: {len(df)}")
    print(f"Duplicates removed: {duplicates_removed}")
    print(f"Saved to: {csv_path}")

    # Basic statistics
    print("\nPrograms by state (top 10):")
    state_counts = df['state'].value_counts().head(10)
    for state, count in state_counts.items():
        print(f"  {state}: {count}")

# Run the scraper
if __name__ == "__main__":
    # Time the whole run so the total duration can be reported at the end.
    started_at = datetime.now()
    scrape_all_residencies()
    elapsed = datetime.now() - started_at
    print(f"\nTotal runtime: {elapsed}")

Starting scrape of 59 specialties

Specialty 1/59: 336551
------------------------------
  Found 4 programs
  Total collected so far: 4

Specialty 2/59: 1500546
------------------------------
  Found 1 programs
  Total collected so far: 5

Specialty 3/59: 1500541
------------------------------
  Found 1 programs
  Total collected so far: 6

Specialty 4/59: 42641
------------------------------
  Found 92 programs
  Total collected so far: 98

Specialty 5/59: 42646
------------------------------
  Found 182 programs
  Total collected so far: 280

Specialty 6/59: 42896
------------------------------
  Found 83 programs
  Total collected so far: 363

Specialty 7/59: 294931
------------------------------
  Found 14 programs
  Total collected so far: 377

Specialty 8/59: 42686
------------------------------
  Found 145 programs
  Total collected so far: 522

Specialty 9/59: 43496
------------------------------
  Found 3 programs
  Total collected so far: 525

Specialty 10/59: 42701
---------

In [None]:
# This cell collects the program contact (instead of the program director).

# Contact columns, in the order they are added to every result row.
_CONTACT_FIELDS = (
    'contact_first_name', 'contact_last_name', 'contact_email',
    'contact_phone', 'contact_title', 'contact_org',
)


def _value_or_unknown(mapping, key):
    """Return ``mapping[key]``, or 'UNKNOWN' when the key is absent or null."""
    value = mapping.get(key)
    return 'UNKNOWN' if value is None else value


def _fetch_program_contact(session, program_id, timeout):
    """Fetch the program contact's details for one program.

    Returns a dict with the keys in ``_CONTACT_FIELDS``. Every value is
    'UNKNOWN' when the lookup fails, the response is not HTTP 200, or no
    contact entry is present in the response's ``included`` list.
    """
    contact = dict.fromkeys(_CONTACT_FIELDS, 'UNKNOWN')
    detail_url = (
        f"https://freida-admin.ama-assn.org/api/node/program?"
        f"filter[field_program_id]={program_id}"
        f"&include=field_survey.field_program_director,field_survey.field_program_contact"
    )

    try:
        detail_response = session.get(detail_url, timeout=timeout)
        if detail_response.status_code != 200:
            return contact
        included = detail_response.json().get('included') or []
    except Exception as e:
        # Best effort: keep the program row, just without contact details.
        print(f"    Contact lookup failed for program {program_id}: {e}")
        return contact

    for item in included:
        if item.get('type') != 'paragraph--program_individual':
            continue
        attrs = item.get('attributes') or {}
        # Director entries share the same paragraph type; only the contact is wanted.
        if attrs.get('parent_field_name') != 'field_program_contact':
            continue
        # field_address may be null in the payload; treat that as "no address".
        addr = attrs.get('field_address') or {}
        contact['contact_first_name'] = _value_or_unknown(attrs, 'field_first_name')
        contact['contact_last_name'] = _value_or_unknown(attrs, 'field_last_name')
        contact['contact_email'] = _value_or_unknown(attrs, 'field_email')
        contact['contact_phone'] = _value_or_unknown(attrs, 'field_phone')
        contact['contact_title'] = _value_or_unknown(attrs, 'field_title')
        contact['contact_org'] = _value_or_unknown(addr, 'organization')
        break  # only the first contact entry is used

    return contact


def _build_contact_record(program, spec_code, loc_code, session, timeout):
    """Turn one entry of the listing response into a flat result row."""
    attributes = program.get('attributes') or {}
    address = attributes.get('field_address') or {}

    program_info = {
        'program_id': _value_or_unknown(attributes, 'field_program_id'),
        'title': _value_or_unknown(attributes, 'title'),
        'spec_code': spec_code,
        'location_code': loc_code,
        'state': _value_or_unknown(address, 'administrative_area'),
        'city': _value_or_unknown(address, 'locality'),
    }

    # The detail lookup filters on the program id, so it is skipped without one.
    if program_info['program_id'] != 'UNKNOWN':
        program_info.update(
            _fetch_program_contact(session, program_info['program_id'], timeout)
        )
    else:
        program_info.update(dict.fromkeys(_CONTACT_FIELDS, 'UNKNOWN'))

    return program_info


def get_programs_for_specialty(spec_code, session, location_mapping, *,
                               limit=25, page_delay=0.5, timeout=30):
    """Scrape programs (with program contact details) for a single specialty.

    For every region in ``location_mapping`` the program listing is paged
    through ``limit`` entries at a time, and one extra detail request per
    program looks up its program contact.

    Args:
        spec_code: Specialty identifier, sent as the
            ``field_specialty.drupal_internal__nid`` filter value.
        session: ``requests.Session``-like object; only
            ``get(url, timeout=...)`` is used.
        location_mapping: Dict mapping a region code to the list of values
            sent as the ``field_address.administrative_area`` filter.
        limit: Page size of the listing requests.
        page_delay: Seconds to sleep after each non-empty page (rate limiting).
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the scrape.

    Returns:
        A list of dicts with the keys program_id, title, spec_code,
        location_code, state, city, contact_first_name, contact_last_name,
        contact_email, contact_phone, contact_title and contact_org. Values
        that are missing or null in the API response are 'UNKNOWN'. Request
        failures are printed and end the current region early; they are
        never raised to the caller.
    """
    result_list = []

    for loc_code, states in location_mapping.items():
        # The state filter is identical for every page of a region: build it once.
        filter_values = "&".join(
            f"filter[location][condition][value][]={state}" for state in states
        )
        offset = 0
        has_more = True

        while has_more:
            api_url = (
                f"https://freida-admin.ama-assn.org/api/node/program?"
                f"fields[node--program]=title,path,field_address,field_program_id,field_specialty"
                f"&include=field_specialty,field_survey"
                f"&page[offset]={offset}"
                f"&page[limit]={limit}"
                f"&filter[specialty][condition][operator]=IN"
                f"&filter[specialty][condition][path]=field_specialty.drupal_internal__nid"
                f"&filter[specialty][condition][value][]={spec_code}"
                f"&filter[location][condition][operator]=IN"
                f"&filter[location][condition][path]=field_address.administrative_area"
                f"&{filter_values}"
                f"&sort=-field_survey.field_first_year_positions,title"
            )

            try:
                response = session.get(api_url, timeout=timeout)
                if response.status_code != 200:
                    print(f"    HTTP {response.status_code} for specialty {spec_code}, "
                          f"region {loc_code}; skipping the rest of this region")
                    break
                data = response.json()
                programs = data.get('data') or []
                next_link = (data.get('links') or {}).get('next')
            except Exception as e:
                print(f"    Error: {e}")
                break

            if not programs:
                break

            for program in programs:
                try:
                    result_list.append(
                        _build_contact_record(program, spec_code, loc_code, session, timeout)
                    )
                except Exception as e:
                    # One malformed entry must not abort the page, but it
                    # should not vanish silently either.
                    print(f"    Skipped a malformed program entry: {e}")

            offset += limit
            has_more = bool(next_link)

            time.sleep(page_delay)  # Rate limiting

    return result_list

def scrape_all_residencies():
    """Scrape every specialty in the built-in list and save program contacts to CSV.

    Calls ``get_programs_for_specialty`` once per specialty code, rewrites
    ``all_residency_programs_contacts.csv`` after each specialty that
    returned programs (so a crash loses little work), and finally removes
    duplicate program ids, saves the file again and prints a short summary
    with a sample of the collected contacts.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    # Specialty identifiers passed to get_programs_for_specialty as spec_code.
    all_residencies = [
        336551, 1500546, 1500541, 42641, 42646, 42896, 294931, 42686, 43496,
        42701, 43516, 43511, 42736, 43466, 43461, 42771, 43451, 43506, 43411,
        43501, 43446, 43491, 43406, 1224691, 43456, 43416, 294936, 42756,
        43471, 42866, 42876, 1224696, 42926, 42931, 336556, 42956, 42966,
        43011, 43016, 43031, 43086, 43431, 43426, 43486, 43441, 43436, 43176,
        43201, 43211, 43236, 43421, 43476, 43221, 43281, 43481, 43326, 43521,
        43376, 43356
    ]

    # Region code -> values sent as the location filter. Each list starts with
    # the region code itself, which is therefore sent as a filter value too.
    location_mapping = {
        "01": ["01", "CT", "MA", "ME", "NH", "RI", "VT"],
        "02": ["02", "NJ", "NY"],
        "03": ["03", "DE", "MD", "PA", "VA", "WV", "DC"],
        "04": ["04", "AL", "FL", "GA", "KY", "MS", "NC", "SC", "TN"],
        "05": ["05", "IL", "IN", "MI", "OH", "WI"],
        "06": ["06", "AR", "LA", "NM", "OK", "TX"],
        "07": ["07", "IA", "KS", "MN", "MO", "NE", "ND", "SD"],
        "08": ["08", "CO", "ID", "MT", "UT", "WY"],
        "09": ["09", "AK", "AZ", "CA", "HI", "NV", "OR", "WA"],
        "PR": ["PR"]
    }

    # Master list to hold all results
    all_results = []

    # CSV file path (separate from the director scrape's output)
    csv_path = "all_residency_programs_contacts.csv"

    # Track progress
    total_specialties = len(all_residencies)

    print(f"Starting scrape of {total_specialties} specialties (Program Contacts)")
    print("=" * 50)

    # The context manager closes the session's pooled connections when the
    # scrape finishes or fails.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://freida.ama-assn.org/'
        })

        for idx, spec_code in enumerate(all_residencies, 1):
            print(f"\nSpecialty {idx}/{total_specialties}: {spec_code}")
            print("-" * 30)

            try:
                # Get programs for this specialty
                programs = get_programs_for_specialty(spec_code, session, location_mapping)

                if programs:
                    print(f"  Found {len(programs)} programs")
                    all_results.extend(programs)

                    # Save after each specialty (in case of crashes)
                    pd.DataFrame(all_results).to_csv(csv_path, index=False)
                    print(f"  Total collected so far: {len(all_results)}")
                else:
                    print("  No programs found")

            except Exception as e:
                # Best effort: report the failure and move on to the next specialty.
                print(f"  ERROR with specialty {spec_code}: {e}")

            # Longer delay between specialties (also applied after a failure)
            time.sleep(2)

    # Final save and summary
    if not all_results:
        print("\nNo programs were collected")
        return

    df = pd.DataFrame(all_results)

    # Remove duplicates based on program_id, keeping the last occurrence.
    # Rows without a real id ('UNKNOWN' or null) are different programs that
    # merely lack an id, so they are never treated as duplicates of each other.
    original_count = len(df)
    has_real_id = df['program_id'].notna() & (df['program_id'] != 'UNKNOWN')
    redundant = has_real_id & df.duplicated(subset=['program_id'], keep='last')
    df = df[~redundant]
    duplicates_removed = original_count - len(df)

    # Save final CSV
    df.to_csv(csv_path, index=False)

    print("\n" + "=" * 50)
    print("SCRAPING COMPLETE - PROGRAM CONTACTS")
    print(f"Total programs collected: {len(df)}")
    print(f"Duplicates removed: {duplicates_removed}")
    print(f"Saved to: {csv_path}")

    # Basic statistics
    print("\nPrograms by state (top 10):")
    state_counts = df['state'].value_counts().head(10)
    for state, count in state_counts.items():
        print(f"  {state}: {count}")

    # Show sample of contact data; a null e-mail counts as "no contact" too.
    print("\nSample contact data (first 3 programs with contacts):")
    has_contact = df['contact_email'].notna() & (df['contact_email'] != 'UNKNOWN')
    sample = df[has_contact].head(3)
    for _, row in sample.iterrows():
        # str() guards the slice against a null title.
        print(f"  {str(row['title'])[:50]}...")
        print(f"    Contact: {row['contact_first_name']} {row['contact_last_name']}")
        print(f"    Email: {row['contact_email']}")

# Run the scraper
if __name__ == "__main__":
    # Time the whole run so the total duration can be reported at the end.
    started_at = datetime.now()
    scrape_all_residencies()
    elapsed = datetime.now() - started_at
    print(f"\nTotal runtime: {elapsed}")