# Scalable Contract Generator for AI Testing - COMPLETE IMPLEMENTATION

### Generates 200+ contracts with actual PDF/DOCX content and proper folder structure


# In [2]:
# DEPENDENCIES
import os
import random
from datetime import datetime, timedelta
from pathlib import Path
from faker import Faker
from docx import Document
from docx.shared import Inches
from reportlab.lib.pagesizes import LETTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.units import inch
import json

class ScalableContractGenerator:
    def __init__(self, total_contracts=200, base_dir="../data/sample_data/"):
        """Configure the generator and create its output folders.

        Args:
            total_contracts: Number of contracts a dataset run should produce.
            base_dir: Root directory of the generated dataset. Defaults to the
                location that used to be hard-coded; a relative path is
                resolved against the current working directory.
        """
        self.fake = Faker()
        self.total_contracts = total_contracts
        self.base_dir = Path(base_dir)
        # Create the folders first so every later write has a destination.
        self.setup_folder_structure()

        # Expanded contract types
        self.contract_types = [
            'employment_agreement', 'commercial_lease', 'residential_lease',
            'business_partnership', 'consulting_services', 'loan_agreement',
            'software_license', 'confidentiality_agreement', 'asset_purchase',
            'freelance_contract', 'equipment_lease', 'service_maintenance',
            'construction_contract', 'sales_agreement', 'distribution_agreement',
            'joint_venture', 'professional_services', 'technology_license',
            'marketing_agreement', 'subscription_service', 'independent_contractor',
            'non_compete_agreement', 'intellectual_property', 'project_management',
            'supply_agreement', 'franchise_agreement', 'merger_agreement',
            'severance_agreement', 'settlement_agreement', 'sponsorship_agreement'
        ]

        # Risk levels with distribution weights (share of total_contracts)
        self.risk_levels = {
            'low_risk': 0.4,       # 40% of contracts
            'medium_risk': 0.35,   # 35% of contracts
            'high_risk': 0.25      # 25% of contracts
        }

        # Document formats
        self.formats = ['pdf', 'docx']

        # Clause variations keyed by clause type, then by risk level
        self.risk_clauses = self._initialize_risk_clauses()

        print(f"🎯 Initialized Scalable Generator for {total_contracts} contracts")
        print(f"📁 Dataset structure: {self.base_dir}")

    def setup_folder_structure(self):
        """Create organized folder structure"""
        folders = [
            'contracts/pdf',
            'contracts/docx', 
            'annotations',
            'train_split',
            'test_split',
            'validation_split',
            'reports'
        ]
        
        for folder in folders:
            (self.base_dir / folder).mkdir(parents=True, exist_ok=True)
        
        print("‚úÖ Created organized folder structure")

    def _initialize_risk_clauses(self):
        """Initialize comprehensive risk clause variations"""
        return {
            'liability': {
                'low_risk': [
                    "Liability shall be limited to the amount paid under this agreement.",
                    "Total liability shall not exceed the contract value.",
                    "Liability is capped at direct damages only.",
                    "Maximum liability limited to insurance coverage amounts.",
                    "No liability for indirect or consequential damages."
                ],
                'medium_risk': [
                    "Liability limited to direct damages up to twice the contract value.",
                    "No liability for indirect damages except for gross negligence.",
                    "Liability capped at three times the annual contract value.",
                    "Limited liability for third-party claims.",
                    "Liability exclusions for force majeure events."
                ],
                'high_risk': [
                    "Unlimited liability for all damages arising from this agreement.",
                    "Parties assume full liability for all direct and consequential damages.",
                    "No limitation of liability applies to any claims under this agreement.",
                    "Liability includes punitive damages and all legal expenses.",
                    "Complete assumption of all risks and liabilities."
                ]
            },
            'termination': {
                'low_risk': [
                    "Either party may terminate with 30 days written notice for convenience.",
                    "Termination requires mutual agreement or material breach.",
                    "Parties may terminate for cause with cure period of 30 days.",
                    "Termination only for specified material breaches.",
                    "Mutual termination rights with notice periods."
                ],
                'medium_risk': [
                    "Termination permitted with 15 days notice for any reason.",
                    "Immediate termination for breach of payment terms.",
                    "Termination for convenience with 50% early termination fee.",
                    "Termination for insolvency or change of control.",
                    "Limited termination rights with penalties."
                ],
                'high_risk': [
                    "Either party may terminate immediately without cause.",
                    "Termination at will with no notice requirement.", 
                    "Immediate termination for any breach, however minor.",
                    "Unilateral termination rights without penalty.",
                    "Termination for subjective dissatisfaction."
                ]
            },
            'indemnification': {
                'low_risk': [
                    "Indemnification limited to third-party claims arising from negligence.",
                    "Mutual indemnification for intellectual property infringement.",
                    "Indemnification capped at contract value.",
                    "Proportional indemnification based on fault.",
                    "Standard indemnification for direct damages only."
                ],
                'medium_risk': [
                    "One-way indemnification favoring the client.",
                    "Indemnification includes legal fees and settlement costs.",
                    "Indemnification for all claims related to services provided.",
                    "Broad indemnification with some limitations.",
                    "Indemnification for regulatory violations."
                ],
                'high_risk': [
                    "Unlimited indemnification for all claims and damages.",
                    "Indemnification includes punitive damages and all legal expenses.",
                    "Broad indemnification covering all business activities.",
                    "Indemnification for all losses regardless of cause.",
                    "Complete hold harmless agreement."
                ]
            },
            'warranty': {
                'low_risk': [
                    "Warranties limited to those expressly stated in this agreement.",
                    "No implied warranties, including merchantability or fitness.",
                    "As-is basis with all faults, no additional warranties.",
                    "Limited warranty for workmanship and materials.",
                    "Standard industry warranties apply."
                ],
                'medium_risk': [
                    "Implied warranties limited to 90 days from effective date.",
                    "Warranties exclude normal wear and tear.",
                    "Limited warranty for specific components only.",
                    "Warranty limitations for consumable items.",
                    "Modified warranty terms with exceptions."
                ],
                'high_risk': [
                    "No warranties of any kind, express or implied.",
                    "Services provided 'as-is' without any performance guarantees.",
                    "All warranties disclaimed to maximum extent permitted by law.",
                    "No warranty of fitness for particular purpose.",
                    "Complete disclaimer of all representations and warranties."
                ]
            },
            'confidentiality': {
                'low_risk': [
                    "Confidentiality obligations survive for 2 years post-termination.",
                    "Standard confidentiality with reasonable protection measures.",
                    "Mutual confidentiality with standard exceptions.",
                    "Confidentiality for specifically marked information only.",
                    "Standard non-disclosure terms apply."
                ],
                'medium_risk': [
                    "Confidentiality perpetual for trade secrets, 5 years for other information.",
                    "Heightened confidentiality with specific security requirements.",
                    "One-way confidentiality favoring disclosing party.",
                    "Confidentiality for all business information shared.",
                    "Enhanced protection for sensitive data."
                ],
                'high_risk': [
                    "Perpetual confidentiality for all information.",
                    "No right to use residual knowledge or general skills.",
                    "Confidentiality extends to all business information regardless of marking.",
                    "Lifetime confidentiality obligations.",
                    "Complete prohibition on use of confidential information."
                ]
            }
        }

    def generate_contract_dataset(self):
        """Generate all contracts, their annotations, the splits and the report.

        For every risk level the allocated number of contracts is produced,
        each with a randomly chosen contract type and document format. A
        contract only counts towards the statistics once its document has been
        written and its annotation saved.

        Returns:
            dict: Counters under the keys ``total_contracts``,
            ``by_risk_level``, ``by_format`` and ``by_type``.
        """
        print(f"🚀 Generating {self.total_contracts} contracts...")
        print("=" * 60)

        # Counter keys follow the configured risk levels and formats instead
        # of repeating them as literals.
        dataset_stats = {
            'total_contracts': 0,
            'by_risk_level': dict.fromkeys(self.risk_levels, 0),
            'by_format': dict.fromkeys(self.formats, 0),
            'by_type': {}
        }

        contracts_per_risk = self._calculate_contract_distribution()

        for risk_level, count in contracts_per_risk.items():
            print(f"\n📊 Generating {count} {risk_level} contracts:")
            print("-" * 40)

            for i in range(count):
                contract_type = random.choice(self.contract_types)
                doc_format = random.choice(self.formats)

                # Deliberately best-effort: one failed contract is reported
                # and skipped so it cannot abort the whole batch.
                try:
                    contract_data = self._generate_contract_data(contract_type, risk_level)

                    if doc_format == 'pdf':
                        file_path = self.generate_pdf_contract(contract_type, contract_data)
                    else:
                        file_path = self.generate_docx_contract(contract_type, contract_data)

                    # The writers return None after reporting their own error.
                    if not file_path:
                        continue

                    self._save_annotation(contract_data, file_path)

                    dataset_stats['total_contracts'] += 1
                    dataset_stats['by_risk_level'][risk_level] += 1
                    dataset_stats['by_format'][doc_format] += 1
                    by_type = dataset_stats['by_type']
                    by_type[contract_type] = by_type.get(contract_type, 0) + 1

                    if dataset_stats['total_contracts'] % 50 == 0:
                        print(f"   ✅ Progress: {dataset_stats['total_contracts']}/{self.total_contracts}")

                except Exception as e:
                    print(f"   ❌ Error generating contract {i+1}: {e}")

        # Create dataset splits
        self._create_dataset_splits()

        # Generate comprehensive report
        self._generate_dataset_report(dataset_stats)

        return dataset_stats

    def _calculate_contract_distribution(self):
        """Calculate how many contracts to generate for each risk level"""
        contracts_per_risk = {}
        for risk_level, weight in self.risk_levels.items():
            contracts_per_risk[risk_level] = int(self.total_contracts * weight)
        
        # Adjust for rounding
        total_allocated = sum(contracts_per_risk.values())
        if total_allocated < self.total_contracts:
            contracts_per_risk['medium_risk'] += (self.total_contracts - total_allocated)
        
        return contracts_per_risk

    def _generate_contract_data(self, contract_type, risk_level):
        """Assemble the complete data record for one contract.

        Args:
            contract_type: Contract type name, e.g. ``'loan_agreement'``.
            risk_level: Key selecting the clause wording and score ranges,
                e.g. ``'low_risk'``.

        Returns:
            dict: Identification and dates, one clause per clause type, the
            risk scores, and the type-specific fields (which also supply the
            final ``parties`` list).

        Raises:
            KeyError: If ``risk_level`` is missing from the clause catalogue.
        """
        base_data = {
            # fix_len=True keeps every ID at exactly eight digits; without it
            # Faker may return shorter numbers and IDs vary in width.
            'contract_id': f"CT-{self.fake.unique.random_number(digits=8, fix_len=True)}",
            'effective_date': self.fake.date_between(start_date='-30d', end_date='+30d').strftime('%B %d, %Y'),
            'execution_date': self.fake.date_between(start_date='-60d', end_date='-1d').strftime('%B %d, %Y'),
            'generation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'contract_type': contract_type,
            'risk_level': risk_level,
            'risk_annotations': {},
            'clauses_used': {},
            'parties': [],
            'metadata': {}
        }

        # One randomly chosen wording per clause type, at the requested level
        for clause_type, variations in self.risk_clauses.items():
            base_data['clauses_used'][clause_type] = random.choice(variations[risk_level])

        # Add risk annotations
        base_data['risk_annotations'] = self._calculate_risk_scores(risk_level)

        # Type-specific fields; this replaces the empty 'parties' placeholder
        base_data.update(self._generate_contract_specific_data(contract_type))

        return base_data

    def _calculate_risk_scores(self, risk_level):
        """Calculate detailed risk scores"""
        risk_base_scores = {
            'low_risk': {
                'liability_risk': random.uniform(0.1, 0.3),
                'termination_risk': random.uniform(0.2, 0.4),
                'indemnification_risk': random.uniform(0.1, 0.3),
                'warranty_risk': random.uniform(0.3, 0.5),
                'confidentiality_risk': random.uniform(0.2, 0.4)
            },
            'medium_risk': {
                'liability_risk': random.uniform(0.4, 0.6),
                'termination_risk': random.uniform(0.5, 0.7),
                'indemnification_risk': random.uniform(0.6, 0.8),
                'warranty_risk': random.uniform(0.4, 0.6),
                'confidentiality_risk': random.uniform(0.5, 0.7)
            },
            'high_risk': {
                'liability_risk': random.uniform(0.7, 0.9),
                'termination_risk': random.uniform(0.6, 0.8),
                'indemnification_risk': random.uniform(0.7, 0.95),
                'warranty_risk': random.uniform(0.6, 0.8),
                'confidentiality_risk': random.uniform(0.7, 0.9)
            }
        }
        
        scores = risk_base_scores[risk_level].copy()
        scores['overall_risk'] = sum(scores.values()) / len(scores)
        
        return scores

    def _generate_contract_specific_data(self, contract_type):
        """Route to the data generator matching ``contract_type``.

        Employment agreements are matched by exact name, leases and loans by
        substring; every other type gets the general business data.
        """
        if contract_type == 'employment_agreement':
            return self._generate_employment_data()

        if 'lease' in contract_type:
            return self._generate_lease_data(contract_type)

        if 'loan' in contract_type:
            return self._generate_loan_data()

        return self._generate_general_business_data(contract_type)

    def _generate_employment_data(self):
        """Produce the fields of an employment agreement.

        Returns:
            dict: Employer and employee names, position details, compensation
            and the two-entry ``parties`` list.
        """
        # Values are drawn in a fixed sequence, one statement per field.
        company_name = self.fake.company()
        legal_form = random.choice(['Inc.', 'LLC', 'Corp.', 'Ltd.'])
        company = f"{company_name} {legal_form}"
        employee = self.fake.name()

        position = random.choice([
            "Senior Software Engineer", "Marketing Director", "Financial Analyst",
            "Operations Manager", "Product Manager", "Sales Executive"
        ])
        department = random.choice(["Technology", "Marketing", "Finance", "Operations", "Sales"])
        salary = f"${random.randint(60000, 180000):,} per annum"
        first_day = self.fake.date_between(start_date='+5d', end_date='+30d')
        duration = random.choice(["One year", "Two years", "Three years", "At-will"])
        benefits = random.choice([
            "Standard health insurance and 401(k) matching",
            "Comprehensive benefits package including stock options",
            "Full medical, dental, vision, and retirement benefits"
        ])
        employer_signatory = f"{self.fake.name()}, CEO"

        return {
            'company': company,
            'employee': employee,
            'position': position,
            'department': department,
            'salary': salary,
            'start_date': first_day.strftime('%B %d, %Y'),
            'duration': duration,
            'benefits': benefits,
            'parties': [
                {'name': company, 'role': 'Employer', 'signatory': employer_signatory},
                {'name': employee, 'role': 'Employee', 'signatory': employee}
            ]
        }

    def _generate_lease_data(self, lease_type):
        """Produce the fields of a lease agreement.

        Args:
            lease_type: Contract type name; a name containing ``'commercial'``
                selects the commercial property label and rent range.

        Returns:
            dict: Property details, landlord, tenant, rent, term and the
            two-entry ``parties`` list.
        """
        landlord = f"{self.fake.company()} Properties"
        tenant_name = self.fake.company()
        tenant_kind = random.choice(['Retail', 'Office', 'Industrial', 'Commercial'])
        tenant = f"{tenant_name} {tenant_kind}"
        is_commercial = 'commercial' in lease_type

        # Address parts are drawn left to right, as they appear in the text.
        house_number = random.randint(100, 999)
        street = random.choice(['Main', 'Broadway', 'Market', 'Commerce'])
        city = self.fake.city()
        state = self.fake.state_abbr()
        zip_code = self.fake.zipcode()
        address = f"{house_number} {street} Street, {city}, {state} {zip_code}"

        if is_commercial:
            property_type = "Commercial Space"
        else:
            property_type = "Residential Property"

        area = random.randint(800, 5000)

        if is_commercial:
            rent = random.randint(1500, 15000)
        else:
            rent = random.randint(1000, 5000)

        months = random.randint(12, 60)
        landlord_signatory = f"{self.fake.name()}, Property Manager"
        tenant_signatory = f"{self.fake.name()}, President"

        return {
            'property_address': address,
            'property_type': property_type,
            'square_footage': f"{area} square feet",
            'landlord': landlord,
            'tenant': tenant,
            'monthly_rent': f"${rent:,}",
            'lease_term': f"{months} months",
            'parties': [
                {'name': landlord, 'role': 'Landlord', 'signatory': landlord_signatory},
                {'name': tenant, 'role': 'Tenant', 'signatory': tenant_signatory}
            ]
        }

    def _generate_loan_data(self):
        """Produce the fields of a loan agreement.

        Returns:
            dict: Lender, borrower, amount, interest rate, term in months,
            purpose and the two-entry ``parties`` list.
        """
        lender_name = self.fake.company()
        lender_kind = random.choice(['Bank', 'Credit', 'Financial', 'Capital'])
        lender = f"{lender_name} {lender_kind}"
        borrower = self.fake.name()

        amount = random.randint(25000, 500000)
        rate = random.uniform(3.5, 12.5)
        months = random.randint(12, 84)
        purpose = random.choice([
            "Business expansion and working capital",
            "Equipment purchase and facility upgrade",
            "Debt consolidation and operational funding",
            "Real estate investment and development"
        ])
        lender_signatory = f"{self.fake.name()}, Vice President"

        return {
            'lender': lender,
            'borrower': borrower,
            'loan_amount': f"${amount:,}",
            'interest_rate': f"{rate:.2f}%",
            'term_months': months,
            'purpose': purpose,
            'parties': [
                {'name': lender, 'role': 'Lender', 'signatory': lender_signatory},
                {'name': borrower, 'role': 'Borrower', 'signatory': borrower}
            ]
        }

    def _generate_general_business_data(self, contract_type):
        """Produce the fields shared by all generic business contracts.

        Args:
            contract_type: Contract type name; underscores become spaces in
                the purpose sentence.

        Returns:
            dict: Both party names, the purpose, the term and the two-entry
            ``parties`` list.
        """
        first_name = self.fake.company()
        first_form = random.choice(['Inc.', 'LLC', 'Corp.'])
        party1 = f"{first_name} {first_form}"

        second_name = self.fake.company()
        second_form = random.choice(['Solutions', 'Services', 'Group', 'Partners'])
        party2 = f"{second_name} {second_form}"

        readable_type = contract_type.replace('_', ' ')
        months = random.randint(6, 36)

        signatories = []
        for name, role in ((party1, 'First Party'), (party2, 'Second Party')):
            signatories.append({
                'name': name,
                'role': role,
                'signatory': f"{self.fake.name()}, Authorized Signatory"
            })

        return {
            'party1': party1,
            'party2': party2,
            'agreement_purpose': f"Business collaboration for {readable_type}",
            'term': f"{months} months",
            'parties': signatories
        }

    def generate_docx_contract(self, contract_type, data):
        """Write one contract as a DOCX file.

        Args:
            contract_type: Contract type name; selects the type-specific
                section and is part of the file name.
            data: Contract record with the keys the sections read
                (``contract_id``, dates, ``risk_level``, ``parties``,
                ``clauses_used`` and the type-specific fields).

        Returns:
            Path of the written file, or ``None`` if anything failed (the
            error is printed, not raised).
        """
        try:
            doc = Document()

            # Title and header block
            doc.add_heading(f"{contract_type.replace('_', ' ').title()}", 0)
            doc.add_paragraph(f"Contract ID: {data['contract_id']}")
            doc.add_paragraph(f"Effective Date: {data['effective_date']}")
            doc.add_paragraph(f"Execution Date: {data['execution_date']}")
            doc.add_paragraph(f"Risk Level: {data['risk_level'].replace('_', ' ').title()}")

            doc.add_paragraph()  # Empty line

            # Parties Section
            doc.add_heading("PARTIES", level=1)
            for party in data['parties']:
                doc.add_paragraph(f"{party['role']}: {party['name']}")

            doc.add_paragraph()  # Empty line

            # Recitals
            doc.add_heading("RECITALS", level=1)
            doc.add_paragraph("WHEREAS, the Parties desire to enter into this Agreement to set forth the terms and conditions of their relationship;")
            doc.add_paragraph("WHEREAS, each Party has the requisite power and authority to enter into this Agreement;")
            doc.add_paragraph("WHEREAS, the Parties intend to be legally bound by the terms herein;")
            doc.add_paragraph("NOW, THEREFORE, in consideration of the mutual covenants contained herein, the Parties agree as follows:")

            doc.add_paragraph()  # Empty line

            # Agreement Terms
            doc.add_heading("AGREEMENT", level=1)

            # The branch conditions mirror _generate_contract_specific_data so
            # the section always reads the fields that were generated.
            if contract_type == 'employment_agreement':
                self._add_employment_content_docx(doc, data)
            elif 'lease' in contract_type:
                self._add_lease_content_docx(doc, data)
            elif 'loan' in contract_type:
                self._add_loan_content_docx(doc, data)
            else:
                self._add_general_content_docx(doc, data)

            # Risk clauses
            self._add_risk_clauses_docx(doc, data)

            # Standard provisions
            self._add_standard_provisions_docx(doc, data)

            # Signature section
            self._add_signature_section_docx(doc, data)

            # Save document; the path is passed as str, as the PDF writer does
            filename = self.base_dir / f"contracts/docx/{contract_type}_{data['risk_level']}_{data['contract_id']}.docx"
            doc.save(str(filename))

            return filename

        except Exception as e:
            # Best-effort: report and let the caller skip this contract.
            print(f"❌ Error generating DOCX: {e}")
            return None

    def generate_pdf_contract(self, contract_type, data):
        """Write one contract as a PDF file.

        Args:
            contract_type: Contract type name; selects the type-specific
                section and is part of the file name.
            data: Contract record with the keys the sections read
                (``contract_id``, dates, ``risk_level``, ``parties``,
                ``clauses_used`` and the type-specific fields).

        Returns:
            Path of the written file, or ``None`` if anything failed (the
            error is printed, not raised).
        """
        try:
            filename = self.base_dir / f"contracts/pdf/{contract_type}_{data['risk_level']}_{data['contract_id']}.pdf"
            doc = SimpleDocTemplate(str(filename), pagesize=LETTER,
                                  topMargin=1*inch, bottomMargin=1*inch,
                                  leftMargin=1*inch, rightMargin=1*inch)
            styles = getSampleStyleSheet()
            story = []

            # Title
            title_style = ParagraphStyle(
                'CustomTitle',
                parent=styles['Heading1'],
                fontSize=16,
                spaceAfter=30,
                alignment=1,
                textColor=colors.darkblue
            )

            story.append(Paragraph(f"{contract_type.replace('_', ' ').title()}", title_style))
            story.append(Paragraph(f"Contract ID: {data['contract_id']}", styles["Normal"]))
            story.append(Paragraph(f"Effective Date: {data['effective_date']}", styles["Normal"]))
            # Printed in the DOCX header as well; keeps both formats equal.
            story.append(Paragraph(f"Execution Date: {data['execution_date']}", styles["Normal"]))
            story.append(Paragraph(f"Risk Level: {data['risk_level'].replace('_', ' ').title()}", styles["Normal"]))
            story.append(Spacer(1, 0.25*inch))

            # Parties
            story.append(Paragraph("PARTIES", styles["Heading2"]))
            for party in data['parties']:
                story.append(Paragraph(f"{party['role']}: {party['name']}", styles["Normal"]))

            story.append(Spacer(1, 0.2*inch))

            # Recitals
            story.append(Paragraph("RECITALS", styles["Heading2"]))
            recitals = [
                "WHEREAS, the Parties desire to enter into this Agreement to set forth the terms and conditions of their relationship;",
                "WHEREAS, each Party has the requisite power and authority to enter into this Agreement;",
                "WHEREAS, the Parties intend to be legally bound by the terms herein;",
                "NOW, THEREFORE, in consideration of the mutual covenants contained herein, the Parties agree as follows:"
            ]
            for recital in recitals:
                story.append(Paragraph(recital, styles["Normal"]))

            story.append(Spacer(1, 0.2*inch))

            # Agreement
            story.append(Paragraph("AGREEMENT", styles["Heading2"]))

            # The branch conditions mirror _generate_contract_specific_data so
            # the section always reads the fields that were generated.
            if contract_type == 'employment_agreement':
                self._add_employment_content_pdf(story, data, styles)
            elif 'lease' in contract_type:
                self._add_lease_content_pdf(story, data, styles)
            elif 'loan' in contract_type:
                self._add_loan_content_pdf(story, data, styles)
            else:
                self._add_general_content_pdf(story, data, styles)

            # Risk clauses
            self._add_risk_clauses_pdf(story, data, styles)

            # Standard provisions
            self._add_standard_provisions_pdf(story, data, styles)

            # Signature section
            self._add_signature_section_pdf(story, data, styles)

            doc.build(story)
            return filename

        except Exception as e:
            # Best-effort: report and let the caller skip this contract.
            print(f"❌ Error generating PDF: {e}")
            return None

    # DOCX Content Methods
    def _add_employment_content_docx(self, doc, data):
        """Append the employment terms section to the DOCX document."""
        doc.add_heading("EMPLOYMENT TERMS", level=2)
        labelled_fields = (
            ("Position", 'position'),
            ("Department", 'department'),
            ("Salary", 'salary'),
            ("Start Date", 'start_date'),
            ("Duration", 'duration'),
            ("Benefits", 'benefits'),
        )
        for label, key in labelled_fields:
            doc.add_paragraph(f"{label}: {data[key]}")

    def _add_lease_content_docx(self, doc, data):
        """Append the lease terms section to the DOCX document."""
        doc.add_heading("LEASE TERMS", level=2)
        labelled_fields = (
            ("Property Address", 'property_address'),
            ("Property Type", 'property_type'),
            ("Square Footage", 'square_footage'),
            ("Monthly Rent", 'monthly_rent'),
            ("Lease Term", 'lease_term'),
        )
        for label, key in labelled_fields:
            doc.add_paragraph(f"{label}: {data[key]}")

    def _add_loan_content_docx(self, doc, data):
        """Append the loan terms section to the DOCX document."""
        doc.add_heading("LOAN TERMS", level=2)
        amount_line = f"Loan Amount: {data['loan_amount']}"
        doc.add_paragraph(amount_line)
        rate_line = f"Interest Rate: {data['interest_rate']}"
        doc.add_paragraph(rate_line)
        # The term is stored as a bare number of months, so the unit is added.
        term_line = f"Term: {data['term_months']} months"
        doc.add_paragraph(term_line)
        purpose_line = f"Purpose: {data['purpose']}"
        doc.add_paragraph(purpose_line)

    def _add_general_content_docx(self, doc, data):
        """Append the generic terms-and-conditions section to the DOCX document."""
        doc.add_heading("TERMS AND CONDITIONS", level=2)
        for label, key in (("Agreement Purpose", 'agreement_purpose'), ("Term", 'term')):
            doc.add_paragraph(f"{label}: {data[key]}")

    def _add_risk_clauses_docx(self, doc, data):
        """Append one paragraph per selected clause, with a bold clause label."""
        doc.add_heading("STANDARD PROVISIONS", level=2)
        selected = data['clauses_used']
        for clause_type in selected:
            label = clause_type.replace('_', ' ').title()
            paragraph = doc.add_paragraph()
            label_run = paragraph.add_run(f"{label}: ")
            label_run.bold = True
            paragraph.add_run(selected[clause_type])

    def _add_standard_provisions_docx(self, doc, data):
        """Append the boilerplate provisions as a bulleted list.

        Args:
            doc: Document the section is appended to.
            data: Contract record; not read here, kept so all section helpers
                share one signature.
        """
        doc.add_heading("ADDITIONAL PROVISIONS", level=2)
        provisions = [
            "This Agreement constitutes the entire understanding between the Parties.",
            "No modification shall be effective unless in writing signed by both Parties.",
            "The failure to enforce any provision shall not constitute a waiver.",
            "If any provision is invalid, the remaining provisions shall continue in effect.",
            "This Agreement may be executed in counterparts."
        ]
        for provision in provisions:
            # The 'List Bullet' style already draws the bullet; a glyph in the
            # text as well would show up as a second (garbled) bullet.
            doc.add_paragraph(provision, style='List Bullet')

    def _add_signature_section_docx(self, doc, data):
        """Append the closing statement and one signature block per party."""
        doc.add_heading("IN WITNESS WHEREOF", level=2)
        doc.add_paragraph("The Parties have executed this Agreement as of the date first written above.")
        doc.add_paragraph()

        blank_line = "_________________________"
        for party in data['parties']:
            block_lines = (
                blank_line,
                party['signatory'],
                party['role'],
                "Date: " + blank_line,
            )
            for text in block_lines:
                doc.add_paragraph(text)
            # Empty paragraph separates consecutive signature blocks.
            doc.add_paragraph()

    # PDF Content Methods
    def _add_employment_content_pdf(self, story, data, styles):
        """Append the employment terms section to the PDF story.

        Lists the same six fields as the DOCX version so both formats of the
        dataset carry identical contract terms.

        Args:
            story: List of flowables the section is appended to.
            data: Contract record holding the employment fields.
            styles: Style sheet providing ``Heading2`` and ``Normal``.
        """
        story.append(Paragraph("EMPLOYMENT TERMS", styles["Heading2"]))
        story.append(Paragraph(f"Position: {data['position']}", styles["Normal"]))
        story.append(Paragraph(f"Department: {data['department']}", styles["Normal"]))
        story.append(Paragraph(f"Salary: {data['salary']}", styles["Normal"]))
        story.append(Paragraph(f"Start Date: {data['start_date']}", styles["Normal"]))
        story.append(Paragraph(f"Duration: {data['duration']}", styles["Normal"]))
        story.append(Paragraph(f"Benefits: {data['benefits']}", styles["Normal"]))
        story.append(Spacer(1, 12))

    def _add_lease_content_pdf(self, story, data, styles):
        """Append the lease terms section to the PDF story.

        Lists the same five fields as the DOCX version so both formats of the
        dataset carry identical contract terms.

        Args:
            story: List of flowables the section is appended to.
            data: Contract record holding the lease fields.
            styles: Style sheet providing ``Heading2`` and ``Normal``.
        """
        story.append(Paragraph("LEASE TERMS", styles["Heading2"]))
        story.append(Paragraph(f"Property Address: {data['property_address']}", styles["Normal"]))
        story.append(Paragraph(f"Property Type: {data['property_type']}", styles["Normal"]))
        story.append(Paragraph(f"Square Footage: {data['square_footage']}", styles["Normal"]))
        story.append(Paragraph(f"Monthly Rent: {data['monthly_rent']}", styles["Normal"]))
        story.append(Paragraph(f"Lease Term: {data['lease_term']}", styles["Normal"]))
        story.append(Spacer(1, 12))

    def _add_loan_content_pdf(self, story, data, styles):
        """Append the loan terms section to the PDF story.

        Lists the same four fields as the DOCX version, including the loan
        purpose, so both formats carry identical contract terms.

        Args:
            story: List of flowables the section is appended to.
            data: Contract record holding the loan fields.
            styles: Style sheet providing ``Heading2`` and ``Normal``.
        """
        story.append(Paragraph("LOAN TERMS", styles["Heading2"]))
        story.append(Paragraph(f"Loan Amount: {data['loan_amount']}", styles["Normal"]))
        story.append(Paragraph(f"Interest Rate: {data['interest_rate']}", styles["Normal"]))
        story.append(Paragraph(f"Term: {data['term_months']} months", styles["Normal"]))
        story.append(Paragraph(f"Purpose: {data['purpose']}", styles["Normal"]))
        story.append(Spacer(1, 12))

    def _add_general_content_pdf(self, story, data, styles):
        """Append the generic terms-and-conditions section to the PDF story.

        Prints the agreement term alongside the purpose, as the DOCX version
        does, so both formats carry identical contract terms.

        Args:
            story: List of flowables the section is appended to.
            data: Contract record holding ``agreement_purpose`` and ``term``.
            styles: Style sheet providing ``Heading2`` and ``Normal``.
        """
        story.append(Paragraph("TERMS AND CONDITIONS", styles["Heading2"]))
        story.append(Paragraph(f"Agreement Purpose: {data['agreement_purpose']}", styles["Normal"]))
        story.append(Paragraph(f"Term: {data['term']}", styles["Normal"]))
        story.append(Spacer(1, 12))

    def _add_risk_clauses_pdf(self, story, data, styles):
        """Append one paragraph per selected clause, with a bold clause label."""
        story.append(Paragraph("STANDARD PROVISIONS", styles["Heading2"]))
        selected = data['clauses_used']
        for clause_type in selected:
            label = clause_type.replace('_', ' ').title()
            markup = f"<b>{label}:</b> {selected[clause_type]}"
            story.extend([Paragraph(markup, styles["Normal"]), Spacer(1, 6)])

    def _add_standard_provisions_pdf(self, story, data, styles):
        """Append the boilerplate provisions as bulleted paragraphs.

        Carries the same five provisions as the DOCX version, including the
        counterparts clause.

        Args:
            story: List of flowables the section is appended to.
            data: Contract record; not read here, kept so all section helpers
                share one signature.
            styles: Style sheet providing ``Heading2`` and ``Normal``.
        """
        story.append(Paragraph("ADDITIONAL PROVISIONS", styles["Heading2"]))
        provisions = [
            "This Agreement constitutes the entire understanding between the Parties.",
            "No modification shall be effective unless in writing signed by both Parties.",
            "The failure to enforce any provision shall not constitute a waiver.",
            "If any provision is invalid, the remaining provisions shall continue in effect.",
            "This Agreement may be executed in counterparts."
        ]
        for provision in provisions:
            # A real bullet character replaces the mis-encoded byte sequence.
            story.append(Paragraph(f"• {provision}", styles["Normal"]))
            story.append(Spacer(1, 3))

    def _add_signature_section_pdf(self, story, data, styles):
        """Append the closing signature block to a PDF story.

        Adds the "IN WITNESS WHEREOF" heading and execution sentence, then
        for every entry in ``data['parties']`` a signature rule, the
        signatory, the role and a date rule, each group followed by a spacer.

        Args:
            story: List of flowables being built; appended to in place.
            data: Contract data whose ``parties`` is an iterable of mappings
                with ``signatory`` and ``role`` entries.
            styles: Style lookup providing ``"Heading2"`` and ``"Normal"``.
        """
        story.append(Spacer(1, 24))
        story.append(Paragraph("IN WITNESS WHEREOF", styles["Heading2"]))
        execution_sentence = "The Parties have executed this Agreement as of the date first written above."
        story.append(Paragraph(execution_sentence, styles["Normal"]))
        story.append(Spacer(1, 24))

        for party in data['parties']:
            # Blank rule to sign on, then who signs and in what capacity.
            story.append(Paragraph("_________________________", styles["Normal"]))
            for field in ('signatory', 'role'):
                story.append(Paragraph(party[field], styles["Normal"]))
            story.append(Paragraph("Date: _________________________", styles["Normal"]))
            story.append(Spacer(1, 24))

    def _save_annotation(self, contract_data, file_path):
        """Save annotation JSON file"""
        annotation_data = {
            'contract_id': contract_data['contract_id'],
            'filename': file_path.name,
            'file_path': str(file_path),
            'contract_type': contract_data['contract_type'],
            'risk_level': contract_data['risk_level'],
            'risk_scores': contract_data['risk_annotations'],
            'clauses_used': contract_data['clauses_used'],
            'parties': contract_data['parties'],
            'generation_date': contract_data['generation_date'],
            'metadata': contract_data.get('metadata', {})
        }
        
        annotation_file = self.base_dir / f"annotations/{contract_data['contract_id']}.json"
        with open(annotation_file, 'w') as f:
            json.dump(annotation_data, f, indent=2)

    def _create_dataset_splits(self):
        """Create train/test/validation splits"""
        print("\nüìä Creating dataset splits...")
        
        # Get all annotation files
        annotation_files = list((self.base_dir / 'annotations').glob('*.json'))
        random.shuffle(annotation_files)
        
        # Split ratios
        train_ratio, test_ratio, val_ratio = 0.7, 0.2, 0.1
        n_total = len(annotation_files)
        
        n_train = int(n_total * train_ratio)
        n_test = int(n_total * test_ratio)
        n_val = n_total - n_train - n_test
        
        splits = {
            'train': annotation_files[:n_train],
            'test': annotation_files[n_train:n_train + n_test],
            'validation': annotation_files[n_train + n_test:]
        }
        
        # Create split files
        for split_name, files in splits.items():
            split_file = self.base_dir / f"{split_name}_split/split.json"
            file_list = [f.name for f in files]
            with open(split_file, 'w') as f:
                json.dump(file_list, f, indent=2)
            
            print(f"   ‚úÖ {split_name}: {len(files)} contracts")

    def _generate_dataset_report(self, stats):
        """Generate comprehensive dataset report"""
        report = {
            'generation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_contracts': stats['total_contracts'],
            'risk_distribution': stats['by_risk_level'],
            'format_distribution': stats['by_format'],
            'contract_type_distribution': stats['by_type'],
            'folder_structure': {
                'contracts_pdf': len(list((self.base_dir / 'contracts/pdf').glob('*.pdf'))),
                'contracts_docx': len(list((self.base_dir / 'contracts/docx').glob('*.docx'))),
                'annotations': len(list((self.base_dir / 'annotations').glob('*.json'))),
            }
        }
        
        report_file = self.base_dir / 'reports/dataset_report.json'
        with open(report_file, 'w') as f:
            json.dump(report, f, indent=2)
        
        # Print summary
        print("\n" + "=" * 60)
        print("üìä DATASET GENERATION COMPLETE!")
        print("=" * 60)
        print(f"üìÅ Dataset Location: {self.base_dir}")
        print(f"üìÑ Total Contracts: {stats['total_contracts']}")
        print(f"üéØ Risk Distribution:")
        for risk_level, count in stats['by_risk_level'].items():
            print(f"   ‚Ä¢ {risk_level}: {count} contracts")
        print(f"üìù Format Distribution:")
        for format_type, count in stats['by_format'].items():
            print(f"   ‚Ä¢ {format_type.upper()}: {count} contracts")
        print(f"üìë Contract Types: {len(stats['by_type'])} unique types")
        print(f"üîÄ Dataset Splits:")
        print(f"   ‚Ä¢ Training: 70%")
        print(f"   ‚Ä¢ Testing: 20%") 
        print(f"   ‚Ä¢ Validation: 10%")
        print(f"üìà Perfect for supervised AI training! üöÄ")

def main():
    """Generate the 200-contract sample dataset and print where it was written."""
    print("🎯 Scalable Contract Dataset Generator for AI Testing")
    print("=" * 60)

    # Generate 200 contracts for proper supervised learning
    generator = ScalableContractGenerator(total_contracts=200)

    # Generate the complete dataset; the returned statistics are not used here.
    generator.generate_contract_dataset()

    print("\n✅ Dataset ready for AI training!")
    print(f"📍 Location: {generator.base_dir}")

# Run the generator only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
    

🎯 Scalable Contract Dataset Generator for AI Testing
✅ Created organized folder structure
🎯 Initialized Scalable Generator for 200 contracts
📁 Dataset structure: ../data/sample_data
🚀 Generating 200 contracts...

📊 Generating 80 low_risk contracts:
----------------------------------------
   ✅ Progress: 50/200

📊 Generating 70 medium_risk contracts:
----------------------------------------
   ✅ Progress: 100/200
   ✅ Progress: 150/200

📊 Generating 50 high_risk contracts:
----------------------------------------
   ✅ Progress: 200/200

📊 Creating dataset splits...
   ✅ train: 140 contracts
   ✅ test: 40 contracts
   ✅ validation: 20 contracts

📊 DATASET GENERATION COMPLETE!
📁 Dataset Location: ../data/sample_data
📄 Total Contracts: 200
🎯 Risk Distribution:
   • low_risk: 80 contracts
   • medium_risk: 70 contracts
   • high_risk: 50 contracts
📝 Format Distribution:
   • PDF: 103 contracts
   • DOCX: 97 contracts
üìë Cont