# 📄 Document AI Extractor

### AI-Powered Receipt Extraction & Route Optimization

---

**Stack:** Python • Google-GenAI • OR-Tools • Plotly • Folium

```
📄 Invoice PDF → 🤖 LLM Extraction → 📍 Geocoding → 🗺️ Route Optimization
```

In [1]:
# Setup
# Bootstraps the notebook: makes the project's `src` package importable,
# loads environment variables, and opens the database manager used by the
# cells below.
import json
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a folder directly
# under the repository root (e.g. `notebooks/`) - confirm.
project_root = Path.cwd().parent
# Must run before the `from src...` imports below so that `src` resolves.
sys.path.insert(0, str(project_root))

import folium
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv

from src.database import DatabaseManager
from src.extraction import extract_from_pdf, process_all_receipts, get_processing_summary

# Load variables from the project's .env file (presumably the API credentials
# needed by the extraction step - verify against src.extraction).
load_dotenv(project_root / ".env")

# Paths
RECEIPTS_DIR = project_root / "data" / "raw" / "receipts"  # input PDFs
DB_PATH = project_root / "data" / "processed" / "delivery.db"  # database file handed to DatabaseManager
db = DatabaseManager(DB_PATH)

print("Ready!")

Ready!


## 1. Extract Data from Sales Receipts

Using **Gemini 2.5 Flash** to parse variable-format PDF receipts into structured data.

In [2]:
# Find the receipt PDFs and run the extractor on one sample.
# The list is sorted so that "the first PDF" is the same file on every run;
# the order in which Path.glob() yields entries is not guaranteed.
pdf_files = sorted(RECEIPTS_DIR.glob("*.pdf"))
print("=" * 50)
print(f"FOUND {len(pdf_files)} RECEIPTS TO PROCESS")
print("=" * 50)

# Always bind `extraction` so the display cell's `if extraction:` guard works
# (instead of raising NameError) when the receipts folder is empty.
extraction = None

# Extract from first PDF
if pdf_files:
    sample_pdf = pdf_files[0]
    print(f"\nExtracting: {sample_pdf.name}")
    extraction = extract_from_pdf(sample_pdf)
    print(f"Confidence: {extraction.extraction_confidence:.0%}")

FOUND 4 RECEIPTS TO PROCESS

Extracting: Invoice_Coffee_Shop.pdf
Confidence: 98%


In [3]:
# Pretty-print the sample extraction: client block, order header, line items
if extraction:
    heavy_rule = "=" * 50
    light_rule = "-" * 50

    # Client section
    sample_client = extraction.client
    print(heavy_rule)
    print("CLIENT INFORMATION")
    print(heavy_rule)
    print(f"Business: {sample_client.business_name}")
    print(f"Tax ID:   {sample_client.tax_id}")
    print(f"Delivery: {sample_client.delivery_address}")

    # Order header
    sample_document = extraction.document
    print("\n" + heavy_rule)
    print(f"ORDER #{sample_document.document_number}")
    print(heavy_rule)
    print(f"Date: {sample_document.issue_date}")

    # Line items, one table row per extracted item
    print("\n" + light_rule)
    print("ITEMS")
    print(light_rule)
    item_rows = []
    for line_item in extraction.items:
        item_rows.append(
            {
                "Bag Type": line_item.bag_type_normalized.value.upper(),
                "Quantity": f"{line_item.quantity_packs} packs",
            }
        )
    items_df = pd.DataFrame(item_rows)
    display(items_df)

CLIENT INFORMATION
Business: O'connor Coffee Shop
Tax ID:   20-43213567-9
Delivery: Av. Regimiento de Patricios 1030, CABA

ORDER #00002289
Date: 2026-01-10

--------------------------------------------------
ITEMS
--------------------------------------------------


Unnamed: 0,Bag Type,Quantity
0,MEDIUM,160 packs
1,MEDIUM,160 packs


## 2. Batch Processing

Process all receipts → Geocode addresses → Store in SQLite

In [4]:
# Process all receipts
# `results` is bound before the guard so that later cells which test
# `if results:` keep working (rather than raising NameError) when no PDFs
# were found.
results = []
if pdf_files:
    # All receipts are processed inside a single database session.
    with db.get_session() as session:
        results = process_all_receipts(RECEIPTS_DIR, session)

    # Aggregate the per-receipt outcomes into counts and a total duration.
    summary = get_processing_summary(results)
    print("\n" + "=" * 50)
    print("BATCH PROCESSING RESULTS")
    print("=" * 50)
    print(f"Successful: {summary['successful']}")
    print(f"Duplicates: {summary['duplicates']}")
    print(f"Failed:     {summary['failed']}")
    print(f"Total time: {summary['total_processing_time_seconds']:.1f}s")
else:
    print("No receipts found - batch processing skipped.")

Processing: Invoice_Coffee_Shop.pdf
  ✓ - Confidence: 0.98 | New client created
Processing: Invoice_FMartínez.pdf
  ✓ - Confidence: 0.98 | New client created
Processing: Invoice_Toy_Store.pdf
  ✓ - Confidence: 0.95 | New client created
Processing: Receipt_El_Gaucho.pdf
  ✓ - Confidence: 0.88 | Existing client matched

BATCH PROCESSING RESULTS
Successful: 4
Duplicates: 0
Failed:     0
Total time: 66.9s


## 3. Analytics Dashboard

In [5]:
# Load data
# Pull the tables needed by the dashboard into DataFrames:
#   orders_df      - every order, plus the client's business name (LEFT JOIN,
#                    so orders without a matching client are kept)
#   clients_df     - the full clients table
#   order_items_df - one row per order item, tagged with its order's zone
with db.get_session() as session:
    orders_df = pd.read_sql("""
        SELECT o.*, c.business_name
        FROM orders o
        LEFT JOIN clients c ON o.client_id = c.client_id
    """, session.bind)
    clients_df = pd.read_sql("SELECT * FROM clients", session.bind)
    order_items_df = pd.read_sql("""
        SELECT oi.order_id, oi.quantity_packs, oi.pallets, 
               o.delivery_zone_id
        FROM order_items oi
        JOIN orders o ON oi.order_id = o.order_id
    """, session.bind)

# Zone colors
# JSON is UTF-8 by specification; pin the encoding so the file is not decoded
# with the platform default (e.g. cp1252 on Windows), which would corrupt or
# reject non-ASCII content.
with open(project_root / "data" / "geo" / "zones.json", encoding="utf-8") as f:
    zones = json.load(f)
# Map each zone key to the "color" entry of its definition.
zone_colors = {z: info["color"] for z, info in zones.items()}

print(f"Orders: {len(orders_df)} | Clients: {len(clients_df)}")

Orders: 54 | Clients: 33


In [6]:
# Chart: Orders by Zone
# Step 1: per-zone totals of packs and pallets, summed over all order items.
zone_totals = (
    order_items_df
    .groupby("delivery_zone_id")
    .agg({"quantity_packs": "sum", "pallets": "sum"})
    .rename(columns={"quantity_packs": "packs"})
    .reset_index()
)

# Step 2: number of orders per zone, counted on the orders table so that an
# order with several items is only counted once.
order_counts = (
    orders_df
    .groupby("delivery_zone_id")
    .size()
    .rename("orders")
    .reset_index()
)

# Step 3: attach the order counts to the volume totals (reused by the treemap).
by_zone = pd.merge(zone_totals, order_counts, on="delivery_zone_id", how="left")

# One bar per zone, colored with the zone palette and labelled with its count.
fig = px.bar(
    by_zone,
    x="delivery_zone_id",
    y="orders",
    text="orders",
    color="delivery_zone_id",
    color_discrete_map=zone_colors,
    labels={"delivery_zone_id": "Zone", "orders": "Orders"},
    title="Orders by Delivery Zone",
)
fig.update_traces(textposition="outside")
fig.update_layout(showlegend=False, template="plotly_white")
fig.show()

In [7]:
# Chart: Volume by Zone (Treemap)
# Tiles are sized by total packs per zone; the pallet total is shown as a
# secondary figure inside each tile.
tile_template = "%{label}<br>%{value:.0f} packs<br>(%{customdata:.1f} pallets)"

fig = px.treemap(
    by_zone,
    path=["delivery_zone_id"],
    values="packs",
    color="delivery_zone_id",
    color_discrete_map=zone_colors,
    title="Bags Volume Distribution",
)
# NOTE(review): customdata is supplied in by_zone row order; this assumes the
# treemap tiles are generated in that same order - confirm.
fig.update_traces(
    customdata=by_zone["pallets"],
    texttemplate=tile_template,
    textinfo="label+value",
)
fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})
fig.show()

In [8]:
# Chart: Client Types
# Donut chart splitting clients into star, new and regular.
star_count = clients_df["is_star_client"].sum()
new_count = clients_df["is_new_client"].sum()
# "Regular" is whatever remains after removing both flagged groups.
# NOTE(review): this assumes no client is flagged as both star and new;
# any overlap would be subtracted twice - confirm against the schema.
regular_count = len(clients_df) - star_count - new_count

client_stats = pd.DataFrame({
    "Type": ["Star Clients", "New Clients", "Regular"],
    "Count": [star_count, new_count, regular_count],
})

fig = px.pie(
    client_stats,
    names="Type",
    values="Count",
    hole=0.4,
    color_discrete_sequence=["#2ecc71", "#3498db", "#FFD700"],
    title="Clients distribution",
)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()

## 4. Delivery Map

Interactive map showing factory depot and all delivery locations by zone.

In [9]:
import html  # stdlib: escapes extracted text before it is embedded in popup HTML

# Factory location
DEPOT = (-34.7323, -58.2959)


def _first_present(*candidates, default):
    """Return the first candidate that is not None, NaN or empty, else *default*.

    SQL NULLs reach pandas as NaN (or None). NaN is truthy, so a plain
    ``a or b`` chain would pick a NaN value instead of falling through.
    """
    for candidate in candidates:
        if candidate is not None and not pd.isna(candidate) and candidate != "":
            return candidate
    return default


# Create map
m = folium.Map(location=DEPOT, zoom_start=11, tiles="cartodbpositron")

# Depot marker
folium.Marker(
    DEPOT,
    popup="<b>🏭 Eco-Bags Factory</b>",
    tooltip="Factory Depot",
    icon=folium.Icon(color="black", icon="industry", prefix="fa")
).add_to(m)

# Get all successfully extracted order IDs from this run
# (`results or []` tolerates an empty or None batch result).
extracted_in_run = {
    result.order_id
    for result in (results or [])
    if result.success and result.order_id
}

print(f"Added {len(extracted_in_run)} orders extracted in this run")

# Delivery locations
pending = orders_df[orders_df["status"] == "pending"]

for _, order in pending.iterrows():
    lat = order["delivery_latitude"]
    lon = order["delivery_longitude"]
    # Skip orders without usable coordinates. NaN must be tested explicitly:
    # it is truthy, and folium rejects NaN locations. Zero coordinates are
    # skipped as well, matching the original truthiness check.
    if pd.isna(lat) or pd.isna(lon) or not (lat and lon):
        continue

    color = zone_colors.get(order["delivery_zone_id"], "#808080")
    is_new = order["order_id"] in extracted_in_run

    # Build popup with richer information. Every interpolated value is
    # HTML-escaped: names and addresses come from extracted documents and the
    # popup string is rendered as raw HTML.
    order_label = html.escape(str(order["order_id"]))
    client_name = html.escape(str(_first_present(
        order.get("contact_name"), order.get("business_name"), default="Unknown"
    )))
    address = html.escape(str(_first_present(order.get("delivery_address"), default="N/A")))
    zone_label = html.escape(str(order["delivery_zone_id"]))
    raw_packs = order["quantity_packs"]
    packs = int(raw_packs) if pd.notna(raw_packs) else 0  # int(NaN) would raise

    popup_html = f"""
    <div style="width: 250px; font-family: Arial; font-size: 12px;">
        <b>{order_label}</b><br>
        <hr style="margin: 5px 0;">
        <b>Client:</b> {client_name}<br>
        <b>Address:</b> {address}<br>
        <b>Quantity:</b> {packs} packs<br>
        <b>Zone:</b> {zone_label}
    </div>
    """

    if is_new:
        # Orders extracted in this run - gold highlight plus a star tooltip
        marker_style = {
            "color": "#FFD700",
            "fill_color": "#FFD700",
            "fill_opacity": 0.9,
            "weight": 2,
            "tooltip": f"⭐ {order_label}",
        }
    else:
        # Existing orders - zone colors
        marker_style = {"color": color, "fill_color": color, "fill_opacity": 0.7}

    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=folium.Popup(popup_html, max_width=280),
        fill=True,
        **marker_style,
    ).add_to(m)

# Legend
# NOTE(review): the zone swatches below are hard-coded; they are presumably
# meant to match the colors in zones.json - confirm when zones change.
extracted_count = len(extracted_in_run)
legend = f"""
<div style="position:fixed; bottom:50px; left:50px; background:white; 
            padding:15px; border-radius:8px; border:2px solid #666; z-index:1000;">
    <b style="font-size:14px;">🗺️ Delivery Zones & Status</b><br><br>
    <span style="background:#FFD700; padding:3px 10px; border-radius:3px; font-weight:bold;">Extracted This Run ({extracted_count})</span><br><br>
    <span style="background:#FF6B6B; padding:3px 10px; border-radius:3px;">CABA</span><br><br>
    <span style="background:#4ECDC4; padding:3px 10px; border-radius:3px;">North</span><br><br>
    <span style="background:#45B7D1; padding:3px 10px; border-radius:3px;">South</span><br><br>
    <span style="background:#96CEB4; padding:3px 10px; border-radius:3px;">West</span>
</div>
"""
m.get_root().html.add_child(folium.Element(legend))

# Last expression of the cell: renders the map inline in the notebook.
m

Added 4 orders extracted in this run


In [10]:
# Save the map
# Write the interactive map to a standalone HTML file under output/maps,
# creating the folder on first use.
maps_dir = project_root / "output" / "maps"
maps_dir.mkdir(parents=True, exist_ok=True)
map_output_path = maps_dir / "showcase_delivery_locations.html"

m.save(str(map_output_path))

# The "shown" figure is the number of pending orders.
pending_total = len(pending)
extracted_total = len(extracted_in_run)
print(f"Map saved to: {map_output_path}")
print(f"Orders shown: {pending_total} total | {extracted_total} newly extracted")

Map saved to: c:\Users\Santi\Desktop\CV\portafolio\Eco-Bags-Delivery-Optimizer\output\maps\showcase_delivery_locations.html
Orders shown: 41 total | 4 newly extracted
