In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
import os
import json
from pathlib import Path
from dotenv import load_dotenv

In [2]:
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

llm = ChatGoogleGenerativeAI(
    model = 'gemini-2.0-flash'
)

In [3]:
document = Path("documents/sample-contract.md").read_text(encoding="utf-8")

In [4]:
json_schema = """
{
            "type": "object",
            "properties": {
                "document_info": {
                    "type": "object",
                    "properties": {
                        "document_type": {"type": "string"},
                        "contract_number": {"type": ["string", "null"]},
                        "date": {"type": ["string", "null"]},
                        "jurisdiction": {"type": ["string", "null"]},
                        "language": {"type": ["string", "null"]},
                        "pages": {"type": ["integer", "null"]}
                    },
                    "required": ["document_type"]
                },
                "parties": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "party_name": {"type": "string"},
                            "party_type": {
                                "type": "string", 
                                "enum": ["client", "vendor", "service_provider", "contractor", "employee", "employer", "buyer", "seller"]
                            },
                            "country": {"type": ["string", "null"]},
                            "address": {"type": ["string", "null"]},
                            "representative": {"type": ["string", "null"]},
                            "registration_id": {"type": ["string", "null"]}
                        },
                        "required": ["party_name", "party_type"]
                    }
                },
                "financial": {
                    "type": "object",
                    "properties": {
                        "total_value": {"type": ["number", "null"]},
                        "currency": {"type": ["string", "null"]},
                        "payment_terms": {"type": ["string", "null"]},
                        "payment_schedule": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "phase": {"type": "string"},
                                    "amount": {"type": "number"},
                                    "percentage": {"type": ["number", "null"]},
                                    "due_date": {"type": ["string", "null"]}
                                }
                            }
                        }
                    }
                },
                "timeline": {
                    "type": "object",
                    "properties": {
                        "start_date": {"type": ["string", "null"], "format": "date"},
                        "end_date": {"type": ["string", "null"], "format": "date"},
                        "duration": {"type": ["string", "null"]},
                        "key_milestones": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "milestone": {"type": "string"},
                                    "due_date": {"type": ["string", "null"]},
                                    "description": {"type": ["string", "null"]}
                                }
                            }
                        }
                    }
                },
                "services": {
                    "type": "object",
                    "properties": {
                        "description": {"type": ["string", "null"]},
                        "deliverables": {
                            "type": "array",
                            "items": {"type": "string"}
                        },
                        "scope": {"type": ["string", "null"]},
                        "phases": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    }
                },
                "legal_clauses": {
                    "type": "object",
                    "properties": {
                        "confidentiality": {
                            "type": "object",
                            "properties": {
                                "present": {"type": "boolean"},
                                "duration": {"type": ["string", "null"]}
                            }
                        },
                        "intellectual_property": {"type": ["string", "null"]},
                        "termination_clauses": {"type": ["string", "null"]},
                        "liability_limitations": {"type": ["string", "null"]},
                        "dispute_resolution": {"type": ["string", "null"]}
                    }
                },
                "risk_assessment": {
                    "type": "object",
                    "properties": {
                        "complexity_level": {
                            "type": "string",
                            "enum": ["low", "medium", "high"]
                        },
                        "key_risks": {
                            "type": "array",
                            "items": {"type": "string"}
                        },
                        "compliance_requirements": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    }
                }
            },
            "required": ["document_info", "parties"]
        }
"""

In [5]:
template = """You are an expert legal document analyzer. Extract comprehensive metadata from contracts into JSON format.

Follow this JSON schema structure:
{json_schema}

Extract these essential fields:

1. **Document Info**: 
   - document_type, contract_number, date, jurisdiction, language, pages

2. **Parties**: 
   - party_name, party_type (client/vendor/service_provider/contractor/employee/employer/buyer/seller)
   - country, address, representative, registration_id

3. **Financial**: 
   - total_value (number), currency, payment_terms
   - payment_schedule with phases, amounts, percentages, due_dates

4. **Timeline**: 
   - start_date (YYYY-MM-DD), end_date (YYYY-MM-DD), duration
   - key_milestones with milestone names, due_dates, descriptions

5. **Services**: 
   - description, deliverables (array), scope, phases

6. **Legal Clauses**:
   - confidentiality (present: boolean, duration)
   - intellectual_property, termination_clauses, liability_limitations, dispute_resolution

7. **Risk Assessment**:
   - complexity_level (low/medium/high)
   - key_risks (array), compliance_requirements (array)

**Important Instructions**: 
 - Return ONLY raw JSON - NO explanations, NO markdown formatting, NO ```json``` blocks
 - Do NOT include any introductory text, closing remarks, or commentary
 - Output must start with {{ and end with }}
 - Use null for missing data (not "null" as string)
 - Dates must be in YYYY-MM-DD format
 - party_type must be one of the enum values: client/vendor/service_provider/contractor/employee/employer/buyer/seller
 - Be precise with numbers and dates
 - Extract all available information comprehensively
 - INVALID: Any text before or after the JSON object

Document to analyze:
{document}

JSON Output:"""

In [6]:
prompt_template = PromptTemplate(
    template = template,
    input_variables = ['document','json_schema']
)

In [7]:
formatted_prompt = prompt_template.format(
    document=document,
    json_schema=json_schema
)

In [8]:
print(formatted_prompt)

You are an expert legal document analyzer. Extract comprehensive metadata from contracts into JSON format.

Follow this JSON schema structure:

{
            "type": "object",
            "properties": {
                "document_info": {
                    "type": "object",
                    "properties": {
                        "document_type": {"type": "string"},
                        "contract_number": {"type": ["string", "null"]},
                        "date": {"type": ["string", "null"]},
                        "jurisdiction": {"type": ["string", "null"]},
                        "language": {"type": ["string", "null"]},
                        "pages": {"type": ["integer", "null"]}
                    },
                    "required": ["document_type"]
                },
                "parties": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                     

In [9]:
response = llm.invoke(formatted_prompt)

In [12]:
print(response.content)

```json
{
    "document_info": {
        "document_type": "Service Agreement",
        "contract_number": "SA-2024-001",
        "date": "2024-03-15",
        "jurisdiction": "Singapore",
        "language": null,
        "pages": 10
    },
    "parties": [
        {
            "party_name": "PT. Nusantara Data",
            "party_type": "client",
            "country": "Indonesia",
            "address": "Jl. Sudirman Kav. 52-53, Jakarta 12190, Indonesia",
            "representative": "Rizki Ghani",
            "registration_id": "01.234.567.8-901.000"
        },
        {
            "party_name": "Alpha Consulting Pte. Ltd.",
            "party_type": "service_provider",
            "country": "Singapore",
            "address": "1 Marina Bay, Singapore 018989",
            "representative": "Laura Tan",
            "registration_id": "201234567G"
        }
    ],
    "financial": {
        "total_value": 85000,
        "currency": "USD",
        "payment_terms": "Invoices shall 

In [13]:
metadata = json.loads(response.content)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [14]:
from json_extractor import JsonExtractor

In [15]:
metadata = JsonExtractor.extract_valid_json(response.content)

In [16]:
metadata

{'document_info': {'document_type': 'Service Agreement',
  'contract_number': 'SA-2024-001',
  'date': '2024-03-15',
  'jurisdiction': 'Singapore',
  'language': None,
  'pages': 10},
 'parties': [{'party_name': 'PT. Nusantara Data',
   'party_type': 'client',
   'country': 'Indonesia',
   'address': 'Jl. Sudirman Kav. 52-53, Jakarta 12190, Indonesia',
   'representative': 'Rizki Ghani',
   'registration_id': '01.234.567.8-901.000'},
  {'party_name': 'Alpha Consulting Pte. Ltd.',
   'party_type': 'service_provider',
   'country': 'Singapore',
   'address': '1 Marina Bay, Singapore 018989',
   'representative': 'Laura Tan',
   'registration_id': '201234567G'}],
 'financial': {'total_value': 85000,
  'currency': 'USD',
  'payment_terms': 'Invoices shall be submitted upon completion of each phase\n- Payment due within 30 days of invoice receipt\n- All payments in USD via wire transfer\n- Late payments subject to 1.5% monthly interest charge',
  'payment_schedule': [{'phase': 'Phase 1 Comp