#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Exported from a notebook cell ("In [None]:").

import os
import json
import time
import pandas as pd
from pydantic import BaseModel, Field
from typing import Literal, get_args
from openpyxl import Workbook, load_workbook
import re
from bs4 import BeautifulSoup

# new SDK:
from google import genai
from google.genai import types

# progress bar
from tqdm import tqdm

# ─── CONFIG ───────────────────────────────────────────────────────────────────

# The API key is taken from the GOOGLE_API_KEY environment variable; the
# script refuses to start when it is unset or empty.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise RuntimeError("Missing environment variable: GOOGLE_API_KEY")

# Single module-level client reused for every request.
client = genai.Client(api_key=GEMINI_API_KEY)

# ─── TYPES & PROMPT ───────────────────────────────────────────────────────────

# Closed set of sector labels accepted in a result.
SectorType = Literal[
    "Government/Public Sector",
    "Finance and Insurance",
    "Technology",
    "Telecommunications",
    "Real Estate",
    "Healthcare",
    "Retail",
    "Manufacturing",
    "Entertainment",
    "Education",
    "Energy",
    "Automotive",
    "Hospitality",
    "Transportation and Logistics",
    "Food and Beverage",
    "Nonprofit/NGO",
    "Agriculture",
    "Other",
]
# Closed set of judgment outcomes accepted in a result.
JudgmentOutcomeType = Literal["Plaintiff", "Defendant", "Undecided"]

# Comma-separated sector names, interpolated into the prompt below.
sector_list = ", ".join(get_args(SectorType))

# Instruction text sent ahead of every case; assembled line by line so the
# embedded double quotes need no escaping.
_prompt_lines = [
    'You are a legal and business analyst.',
    'Analyze legal case texts and provide answers in JSON format with the keys "sector" and "judgment_outcome".',
    '',
    f'1. Identify which USA tertiary sector the case belongs to from the following list: {sector_list}.',
    '   If none apply, respond with "Other".',
    '',
    '2. Determine the judgment outcome: Plaintiff, Defendant, or Undecided.',
    '',
    'Respond only with a JSON object. Example:',
    '{"sector": "Technology", "judgment_outcome": "Plaintiff"}',
]
system_prompt = "\n".join(_prompt_lines)

# ─── CLEANING ─────────────────────────────────────────────────────────────────

def clean_text(text: str) -> str:
    """Strip markup from *text* and return the remaining text.

    The input is parsed with BeautifulSoup's ``lxml`` parser; the text nodes
    are stripped and joined with single spaces.

    Args:
        text: Raw case text, possibly containing HTML. ``None`` or an empty
            string is tolerated.

    Returns:
        The extracted text, or ``""`` when there is nothing to parse.
    """
    # A JSON null (or missing/empty field) would make BeautifulSoup raise;
    # treat it as "no text" so one bad record cannot abort a whole run.
    if not text:
        return ""
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text(separator=" ", strip=True)

# ─── MODEL OUTPUT PARSING ─────────────────────────────────────────────────────

class LegalAnalysisResult(BaseModel):
    """Validated form of the model's JSON reply: one sector and one outcome."""

    # Required; must be one of the SectorType literals.
    sector: SectorType = Field(..., description="The USA tertiary sector the case belongs to")
    # Required; must be one of the JudgmentOutcomeType literals.
    judgment_outcome: JudgmentOutcomeType = Field(..., description="Plaintiff, Defendant, or Undecided")

def extract_sector_and_outcome(
    case_text: str,
    max_retries: int = 5,
    retry_delay: float = 30,
) -> LegalAnalysisResult | None:
    """Classify one case with Gemini and parse the JSON reply.

    The prompt and the case text are sent together; the reply is requested as
    JSON and validated against ``LegalAnalysisResult``.

    Args:
        case_text: Cleaned text of the case to analyze.
        max_retries: How many times to retry after a rate-limit error
            (an exception whose message contains ``RESOURCE_EXHAUSTED``).
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The validated result, or ``None`` when the request fails, the reply
        cannot be validated, or the rate limit persists after all retries.
    """
    total_attempts = max(0, max_retries) + 1
    # Iterative, bounded retry: the previous recursive retry never terminated
    # on a permanently exhausted quota and could exceed the recursion limit.
    for attempt in range(1, total_attempts + 1):
        try:
            resp = client.models.generate_content(
                model="gemini-1.5-flash-latest",
                contents=[system_prompt, case_text],
                config=types.GenerateContentConfig(
                    temperature=0.0,
                    top_k=1,
                    top_p=0.0,
                    max_output_tokens=512,
                    response_mime_type="application/json",
                ),
            )
            reply = resp.text
            # NOTE(review): resp.text is presumably None/empty when the reply
            # carries no text parts — confirm against the SDK.
            if not reply:
                print("Gemini error processing case: empty response")
                return None
            # model_validate_json is the pydantic v2 replacement for the
            # deprecated parse_raw.
            return LegalAnalysisResult.model_validate_json(reply)
        except Exception as e:
            rate_limited = "RESOURCE_EXHAUSTED" in str(e)
            if rate_limited and attempt < total_attempts:
                print(f"Rate limit hit. Sleeping for {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
                continue
            print(f"Gemini error processing case: {e}")
            return None
    return None

# ─── EXCEL UTIL ───────────────────────────────────────────────────────────────

def write_result_to_excel(file_path: str, row: list) -> None:
    """Append *row* to the active sheet of the workbook at *file_path*.

    When the file does not exist yet, a new workbook is started with the
    header ``id, sector, judgment_outcome``. The workbook is saved after the
    row is appended.

    Args:
        file_path: Path of the ``.xlsx`` file to create or extend.
        row: Cell values for the new row.
    """
    if os.path.exists(file_path):
        wb = load_workbook(file_path)
        ws = wb.active
    else:
        # Build header and first row in memory and save once, instead of
        # saving the header-only file and immediately reloading it.
        wb = Workbook()
        ws = wb.active
        ws.append(["id", "sector", "judgment_outcome"])

    ws.append(row)
    wb.save(file_path)

# ─── MAIN ─────────────────────────────────────────────────────────────────────

def _load_cases(input_file: str):
    """Return the parsed JSON content of *input_file*, or None if it cannot be parsed."""
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    try:
        return json.loads(data)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")

    # Second attempt: strict=False lets the decoder accept raw control
    # characters (e.g. unescaped newlines) inside strings. Blindly replacing
    # every newline would also corrupt the newlines between JSON tokens.
    try:
        return json.loads(data, strict=False)
    except json.JSONDecodeError as e2:
        print(f"Could not fix JSON: {e2}")
        return None


def main(
    input_file: str = "/content/remaining.json",
    output_file: str = "/content/analysis(remaining).xlsx",
) -> None:
    """Classify every case in *input_file* and append results to *output_file*.

    Each case is read via ``case.get("id")`` and ``case.get("plain_text")``.
    Cases whose cleaned text is empty are skipped; cases for which the model
    yields no result are reported but not written.

    Args:
        input_file: Path to the JSON file holding the cases.
        output_file: Path to the Excel file that receives one row per result.
    """
    cases = _load_cases(input_file)
    if cases is None:
        return

    # Wrap with tqdm, show total, update every record
    for idx, case in enumerate(tqdm(cases, desc="Processing cases", unit="case"), start=1):
        cid = case.get("id")
        # `or ""` also covers an explicit JSON null, which .get's default does not.
        text = clean_text(case.get("plain_text") or "")
        if not text:
            tqdm.write(f"Skipping {cid}: empty text")
            continue

        analysis = extract_sector_and_outcome(text)
        if analysis:
            tqdm.write(f"{cid} → Sector: {analysis.sector}, Outcome: {analysis.judgment_outcome}")
            write_result_to_excel(output_file, [cid, analysis.sector, analysis.judgment_outcome])
        else:
            tqdm.write(f"{cid} → No result")

        # Print a special notice every 500 records
        if idx % 500 == 0:
            tqdm.write(f"✅ {idx} records processed so far")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
