In [1]:
# DataFrame Batch Creation Example
# Load ~500 restaurants in Seoul, Korea from CSV and create a batch using the new DataFrame functionality
# This demonstrates: column mapping, validation, and batch creation with RepromptClient

import pandas as pd

df = pd.read_csv('seoul-restaurants.csv')
print(f"Loaded {len(df)} restaurants from CSV")
print(f"Original columns: {list(df.columns)}")
print("\nFirst 5 rows:")
df.head()

Loaded 539 restaurants from CSV
Original columns: ['name', 'rating', 'rating_count', 'types', 'address', 'region_level_1', 'phone', 'website', 'weighted_score']

First 5 rows:


Unnamed: 0,name,rating,rating_count,types,address,region_level_1,phone,website,weighted_score
0,Myeongdong Kyoja Main Restaurant,4.2,13470,"[dumpling_restaurant, kalguksu_restaurant, kor...","29 Myeongdong 10-gil, Jung District, Seoul, So...",Seoul,+82 2-776-5348,http://www.mdkj.co.kr/,56574.0
1,Tosokchon Samgyetang,4.2,11409,"[samgyetang_restaurant, restaurant]","5 Jahamun-ro 5-gil, Jongno District, Seoul, So...",Seoul,+82 2-737-7444,http://tosokchon.com/,47917.8
2,Ujin Haejangguk,4.1,9214,"[korean_restaurant, hangover_soup_restaurant]","11 Seosa-ro, ÌäπÎ≥ÑÏûêÏπòÎèÑ, Jeju-si, Jeju-do, South Korea",Jeju-do,+82 64-757-3393,,37777.4
3,Ungteori Myeongdong Korean BBQ Restaurant | Al...,4.8,7070,"[korean_barbecue_restaurant, barbecue_restaura...","South Korea, Seoul, Jung District, 4Í∏∏ 23 2Ï∏µ",Seoul,+82 10-9511-0020,http://www.ungteori.com/,33936.0
4,Gozip Dol Wooluck Jungmun,4.7,6683,[fish_restaurant],"879 Iljuseo-ro, ÌäπÎ≥ÑÏûêÏπòÎèÑ Seogwipo-si, Jeju-do, So...",Jeju-do,+82 64-738-1540,https://www.gozipfish.com/,31410.1


In [None]:
import os
# Replace with your actual API key and organization slug
API_KEY = os.getenv("REPROMPT_API_KEY")  # Get from https://app.repromptai.com
ORG_SLUG = os.getenv("REPROMPT_ORG_SLUG")  # Your organization identifier

In [3]:
# First, let's prepare the DataFrame for batch creation
# We need to rename 'address' to 'full_address' to match the expected format

# Take a subset for demo (first 10 restaurants)
demo_df = df.head(10).copy()

# Rename address column to match expected format
demo_df = demo_df.rename(columns={'address': 'full_address'})

# We only need 'name' and 'full_address' for batch creation
batch_df = demo_df[['name', 'full_address']].copy()

print("Prepared DataFrame for batch creation:")
print(batch_df.head())
print(f"\nDataFrame shape: {batch_df.shape}")
print(f"Required columns present: {set(['name', 'full_address']).issubset(batch_df.columns)}")


Prepared DataFrame for batch creation:
                                                name  \
0                   Myeongdong Kyoja Main Restaurant   
1                               Tosokchon Samgyetang   
2                                    Ujin Haejangguk   
3  Ungteori Myeongdong Korean BBQ Restaurant | Al...   
4                          Gozip Dol Wooluck Jungmun   

                                        full_address  
0  29 Myeongdong 10-gil, Jung District, Seoul, So...  
1  5 Jahamun-ro 5-gil, Jongno District, Seoul, So...  
2  11 Seosa-ro, ÌäπÎ≥ÑÏûêÏπòÎèÑ, Jeju-si, Jeju-do, South Korea  
3        South Korea, Seoul, Jung District, 4Í∏∏ 23 2Ï∏µ  
4  879 Iljuseo-ro, ÌäπÎ≥ÑÏûêÏπòÎèÑ Seogwipo-si, Jeju-do, So...  

DataFrame shape: (10, 2)
Required columns present: True


In [None]:
# Import RepromptClient and create batch from DataFrame
from reprompt import RepromptClient
from reprompt.generated.models import AttributeSet

# Create client with write permissions enabled
# Note: allow_writes=True is required for batch creation
client = RepromptClient(
    api_key=API_KEY,
    org_slug=ORG_SLUG,
    allow_writes=True,  # Enable batch creation
)

print("‚úì Client created with write permissions enabled")
print(f"Client configuration:")
print(f"  - API Key: {'*' * (len(API_KEY) - 8) + API_KEY[-8:] if len(API_KEY) > 8 else 'configured'}")
print(f"  - Organization: {ORG_SLUG}")
print(f"  - Base URL: {client.base_url}")
print(f"  - Allow writes: {client.allow_writes}")


‚úì Client created with write permissions enabled
Client configuration:
  - API Key: *******************vqvn4ji0
  - Organization: visa
  - Base URL: https://reprompt-mikhail--reprompt-fastapi-fastapi-app-dev.us-west.modal.run
  - Allow writes: True


In [5]:
# Create a batch from the DataFrame
try:
    print("Creating batch from DataFrame...")
    print(f"Submitting {len(batch_df)} restaurants for enrichment")

    # Create batch with the prepared DataFrame
    response = client.batches.create_from_dataframe(
        batch_df,
        batch_name="Seoul Restaurants Demo Batch",
        enrich_now=False, # Set to True to start enrichment immediately
        attribute_set=AttributeSet.core  # Use core attribute set
    )

    print("‚úÖ Batch created successfully!")
    print(f"Batch Details:")
    print(f"  - Batch ID: {response.id}")
    print(f"  - Batch Name: {response.batch_name}")
    print(f"  - Status: {response.status.value}")
    print(f"  - Jobs submitted: {len(response.jobs) if hasattr(response, 'jobs') and response.jobs else 'N/A'}")

except Exception as e:
    print(f"‚ùå Error creating batch: {e}")
    print(f"Error type: {type(e).__name__}")

    # Show what the error might be
    if "read-only" in str(e):
        print("\nüí° Tip: Make sure allow_writes=True when creating the client")
    elif "validation" in str(e).lower():
        print("\nüí° Tip: Check that your DataFrame has the required columns (name + location)")
        print("Current DataFrame columns:", list(batch_df.columns))
    elif "api" in str(e).lower():
        print("\nüí° Tip: Check your API key and organization slug are correct")


Creating batch from DataFrame...
Submitting 10 restaurants for enrichment
‚úÖ Batch created successfully!
Batch Details:
  - Batch ID: 47b1863d-b3a0-44d5-a115-70834e4f6c0a
  - Batch Name: Seoul Restaurants Demo Batch
  - Status: pending
  - Jobs submitted: 5


## Key Points for DataFrame Batch Creation

### Required DataFrame Format
Your DataFrame must have:
1. **`name`** column (required) - the place name
2. **One location specification**:
   - `full_address` (preferred), OR
   - Both `latitude` and `longitude`, OR  
   - All address components: `street`, `city`, `state`, `postal_code`, `country`

### Column Mapping
The implementation supports column name synonyms:
- `lat` or `latitude` ‚Üí `latitude`
- `lon`, `lng`, or `longitude` ‚Üí `longitude`  
- `zip`, `postalcode`, or `postal_code` ‚Üí `postal_code`

### Client Configuration
- **Read-only by default**: Set `allow_writes=True` to enable batch creation
- **Environment variables**: Can use `REPROMPT_API_KEY` and `REPROMPT_ORG_SLUG`

### Validation Features
- **Duplicate detection**: Automatically detects duplicate places
- **Data normalization**: Trims whitespace, handles empty values
- **Coordinate validation**: Ensures lat/lng are within valid ranges
- **Detailed error messages**: Shows specific row numbers and issues

### New: Batch Statistics
Use `client.batches.get_statistics()` to analyze enrichment results:
- **Aggregate analysis**: Combine statistics across multiple batches
- **Run rates**: See what percentage of jobs ran for each attribute  
- **Fill rates**: Monitor data quality by checking output completeness
- **Flexible output**: Get pandas DataFrame or dictionary format
- **Smart filtering**: Exclude NOT_RUN attributes for cleaner results

**Example Usage:**
```python
# Get statistics for one or more batches
stats = client.batches.get_statistics(
    batch_ids=["batch-id-1", "batch-id-2"],
    exclude_not_run=True,      # Hide unused attributes
    return_dataframe=True      # Get pandas DataFrame
)

# Analyze results
print(f"Average fill rate: {stats['value_fill_rate'].mean():.2%}")
```

### Best Practices
1. Test with a small subset first (like we did with 10 restaurants)
2. Always use try/except blocks for error handling
3. Monitor batch progress via `get_batch()` or the dashboard
4. Use `get_statistics()` to track enrichment quality and performance


In [9]:
# Get Statistics for the Batch
# This demonstrates the new get_statistics convenience method
# which aggregates attribute status and fill-rate statistics

batch_ids = [
    "c180e293-4c1f-495f-a94b-4e08a7e0a371",
    "69cb7ace-3761-4e95-bf09-6a7791d2a397",
    "1dc38020-8d71-48aa-b698-211a796e73d5",
    "74ae0cfc-6403-43f5-ba69-fcd45ce93498",
    "929273c1-850f-42a2-bbc8-60e530da2a7e"
]

print("üîç Getting batch statistics...")
print(f"Analyzing batch: {batch_ids}")

try:
    # Get comprehensive statistics for our batch
    stats_df = client.get_statistics(
        batch_ids=batch_ids,
        exclude_not_run=True,  # Hide attributes that were never run
        return_dataframe=True
    )

    print("‚úÖ Statistics retrieved successfully!")
    print(f"\nBatch Statistics for '{batch_ids}':")
    print("=" * 60)

    if len(stats_df) > 0:
        print(f"üìä Found {len(stats_df)} enriched attributes")

        # Convert rates to percentages for better readability
        display_df = stats_df.copy()
        display_df['run_rate_pct'] = (display_df['run_rate'] * 100).round(1)
        display_df['fill_rate_pct'] = (display_df['value_fill_rate'] * 100).round(1)

        # Select key columns for summary view
        summary_cols = ['attribute', 'total_jobs', 'run_count', 'run_rate_pct',
                       'value_filled', 'fill_rate_pct']
        summary_df = display_df[summary_cols]

        print(f"\nüìã Top Performing Attributes:")
        # Sort by fill rate and show top 10
        top_attributes = summary_df.sort_values('fill_rate_pct', ascending=False)
        print(top_attributes.head(10).to_string(index=False))

        print(f"\nüìä Overall Performance Summary:")
        print(f"  - Total Attributes: {len(stats_df)}")
        print(f"  - Average Run Rate: {stats_df['run_rate'].mean() * 100:.1f}%")
        print(f"  - Average Fill Rate: {stats_df['value_fill_rate'].mean() * 100:.1f}%")
        print(f"  - Best Performing: {top_attributes.iloc[0]['attribute']} ({top_attributes.iloc[0]['fill_rate_pct']:.1f}% fill)")

        # Show attributes with highest value counts
        print(f"\nüî¢ Most Frequently Filled Attributes:")
        by_count = summary_df.sort_values('value_filled', ascending=False)
        print(by_count[['attribute', 'value_filled', 'fill_rate_pct']].head(5).to_string(index=False))

        print(f"\nüìà Full Statistics DataFrame (first 15 rows):")
        print(summary_df.head(15))

        if len(stats_df) > 15:
            print(f"\n... and {len(stats_df) - 15} more attributes")
            print("üí° Tip: Use `stats_df.tail()` or `stats_df[stats_df['fill_rate_pct'] > 50]` to explore further")

    else:
        print("‚ÑπÔ∏è  No enriched attributes found (batch may still be processing)")

except Exception as e:
    print(f"‚ùå Error getting statistics: {e}")
    print("This might happen if:")
    print("  - The batch is still processing")
    print("  - The batch ID doesn't exist")
    print("  - There are no completed jobs yet")


üîç Getting batch statistics...
Analyzing batch: ['c180e293-4c1f-495f-a94b-4e08a7e0a371', '69cb7ace-3761-4e95-bf09-6a7791d2a397', '1dc38020-8d71-48aa-b698-211a796e73d5', '74ae0cfc-6403-43f5-ba69-fcd45ce93498', '929273c1-850f-42a2-bbc8-60e530da2a7e']
‚úÖ Statistics retrieved successfully!

Batch Statistics for '['c180e293-4c1f-495f-a94b-4e08a7e0a371', '69cb7ace-3761-4e95-bf09-6a7791d2a397', '1dc38020-8d71-48aa-b698-211a796e73d5', '74ae0cfc-6403-43f5-ba69-fcd45ce93498', '929273c1-850f-42a2-bbc8-60e530da2a7e']':
üìä Found 9 enriched attributes

üìã Top Performing Attributes:
         attribute  total_jobs  run_count  run_rate_pct  value_filled  fill_rate_pct
closed_permanently        4471       4469         100.0          4469          100.0
      openingHours        4471       4469         100.0          4469          100.0
        categories        4471       4469         100.0          4347           97.2
             names        4471       4469         100.0          4232         

In [14]:
# Advanced Statistics Usage Examples
# Demonstrating different ways to use the get_statistics method

print("üî¨ Advanced Statistics Examples")
print("=" * 40)

# Example 1: Compare multiple batches (if you have more than one)
print("\n1Ô∏è‚É£ Multi-batch Statistics (example)")
print("   Use case: Compare performance across different batches")
BATCH_ID_SET = [
    # "47b1863d-b3a0-44d5-a115-70834e4f6c0a",
    # "59b4ffd8-1c6b-466d-9435-00a80f8855aa",
    "9e8d2005-f0df-40f8-82ef-3421ac370ec6",
    # "525da27c-ab10-4ae5-892c-2563e17b3a22"
]

# Example 2: Include NOT_RUN attributes
print("\n2Ô∏è‚É£ Include NOT_RUN Attributes")
try:
    stats_with_not_run = client.get_statistics(
        batch_ids=BATCH_ID_SET,
        exclude_not_run=True,  # Include all attributes
        return_dataframe=True
    )
    print(f"   Total attributes (including NOT_RUN): {len(stats_with_not_run)}")

    if len(stats_with_not_run) > 0:
        not_run_attrs = stats_with_not_run[stats_with_not_run['run_count'] == 0]
        if len(not_run_attrs) > 0:
            print(f"   Attributes that were NOT_RUN: {list(not_run_attrs['attribute'])}")
        else:
            print("   All attributes were run at least once")

except Exception as e:
    print(f"   Error: {e}")

# Example 3: Return as dictionary instead of DataFrame
print("\n3Ô∏è‚É£ Dictionary Return Format")
try:
    stats_dict = client.get_statistics(
        batch_ids=BATCH_ID_SET,
        return_dataframe=False  # Return dict instead of DataFrame
    )

    if stats_dict and not stats_dict.get('error'):
        print(f"   Available attributes: {list(stats_dict.keys())}")
        # Show details for first attribute
        first_attr = list(stats_dict.keys())[0]
        attr_stats = stats_dict[first_attr]
        print(f"   Example - {first_attr}:")
        print(f"     ‚Ä¢ Total jobs: {attr_stats['total_jobs']}")
        print(f"     ‚Ä¢ Run count: {attr_stats['run_count']}")
        print(f"     ‚Ä¢ Fill rate: {attr_stats['value_fill_rate']:.2%}")
    else:
        print("   No statistics available or error occurred")

except Exception as e:
    print(f"   Error: {e}")

print("\nüí° Pro Tips:")
print("  - Use exclude_not_run=True for cleaner results")
print("  - Compare multiple batches to identify trends")
print("  - Monitor fill rates to assess data quality")
print("  - Use return_dataframe=False for programmatic access")


üî¨ Advanced Statistics Examples

1Ô∏è‚É£ Multi-batch Statistics (example)
   Use case: Compare performance across different batches

2Ô∏è‚É£ Include NOT_RUN Attributes
   Total attributes (including NOT_RUN): 15
   All attributes were run at least once

3Ô∏è‚É£ Dictionary Return Format
   Available attributes: ['address', 'alcohol_service', 'alcohol_service_and_types', 'approximate_user_reviews', 'building_condition', 'building_footprint', 'categories', 'chain', 'closed_permanently', 'coordinates', 'cuisine', 'delivery_or_takeout', 'email_address', 'entrances', 'foursquare_place', 'geometry', 'instagram_statistics', 'menu', 'merchant', 'naics', 'name_translations', 'names', 'one_line_summary', 'openingHours', 'orderFoodLinks', 'overture_place', 'parcel', 'parking_spaces', 'phoneNumbers', 'place_existence', 'placekey', 'price_tier', 'primary_turns', 'reprompt_id', 'school_geofence', 'search_aliases', 'signage', 'socialHandles', 'social_media_profile', 'storefrontImages', 'tiktok', 'tr