In [43]:
import re
import json
from pypdf import PdfReader

# Input agreement PDF and the JSON file the extracted articles are written to.
PDF_PATH = "./Vistara_AirAsia_Master_License_Agreement_30PlusPages_Final.pdf"
OUTPUT_JSON = "agreement_articles.json"

# Regex to detect ARTICLE headings such as "ARTICLE 2 — OBLIGATIONS":
# the word ARTICLE (case-insensitive), a number, an em dash, then the title text.
# Only the em dash "—" is accepted as the separator, and the title capture ends
# at the end of the line it is matched against.
# NOTE(review): the extracted heading "ARTICLE 3 — CONFIDENTIALITY, PUBLIC" looks
# like a title that wraps onto a second line in the PDF — confirm against the document.
ARTICLE_PATTERN = re.compile(r"(ARTICLE\s+\d+\s+—\s+.+)", re.IGNORECASE)

def extract_articles_from_pdf(pdf_path):
    """
    Split a PDF into articles keyed by their heading line.

    Every page's extracted text is broken into lines and each line is trimmed.
    A line that ARTICLE_PATTERN matches starts a new article; every other line
    is collected as body text of the article currently open. Lines that appear
    before the first heading are discarded.

    Args:
        pdf_path: Path of the PDF file, passed to PdfReader.

    Returns:
        dict: {heading: body}, where body is the article's trimmed lines joined
        with single spaces. A heading that occurs more than once keeps only the
        body collected for its last occurrence.
    """
    pdf = PdfReader(pdf_path)

    def iter_trimmed_lines():
        # Pages without extractable text contribute no lines at all.
        for pdf_page in pdf.pages:
            page_text = pdf_page.extract_text()
            if page_text:
                for raw_line in page_text.split("\n"):
                    yield raw_line.strip()

    sections = {}
    heading = None
    body_parts = []

    for entry in iter_trimmed_lines():
        hit = ARTICLE_PATTERN.match(entry)

        if hit is None:
            # Ordinary line: belongs to the open article, if there is one.
            if heading:
                body_parts.append(entry)
            continue

        # A new heading closes the article that was being collected.
        if heading:
            sections[heading] = " ".join(body_parts).strip()
            body_parts = []
        heading = hit.group(1).strip()

    # The final article has no following heading to close it.
    if heading:
        sections[heading] = " ".join(body_parts).strip()

    return sections





In [44]:
# Run the heading-based extraction and display the resulting {heading: text} mapping.
article_json = extract_articles_from_pdf(PDF_PATH)
article_json

{'ARTICLE 2 — OBLIGATIONS': '2.1 General Obligations. Each Party shall perform its obligations under this Agreement in good faith, exercising reasonable best efforts and acting in compliance with all applicable aviation, corporate, labor, and competition laws. The obligations contained herein are continuing in nature and shall apply throughout the Transition Period. 2.2 Asset and Route Transfer Obligations. AirAsia shall identify, document, and make available for transfer all aircraft, routes, landing slots, parking rights, and operational permissions approved for transition. Vistara shall assume operational control only upon certification of regulatory readiness and completion of applicable circuit milestones. 2.3 Transitional Cooperation. The Parties shall cooperate fully to ensure uninterrupted operations, including transfer of manuals, maintenance logs, crew rosters, training records, and safety documentation. No Party shall unreasonably withhold information required for integratio

In [4]:
# Re-run the heading-based extraction and persist it as pretty-printed UTF-8 JSON.
# ensure_ascii=False keeps characters such as the em dash in headings unescaped in the file.
if __name__ == "__main__":
    article_json = extract_articles_from_pdf(PDF_PATH)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(article_json, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(article_json)} articles.")
    print(f"Saved to {OUTPUT_JSON}")

Extracted 8 articles.
Saved to agreement_articles.json


In [45]:
# Use the extracted article headings (the dict keys, in insertion order) as a table of contents.
# TOC is a live keys view of article_json; toc_list is an independent list copy of it.
TOC = article_json.keys()
toc_list = list(TOC)
toc_list

['ARTICLE 2 — OBLIGATIONS',
 'ARTICLE 3 — CONFIDENTIALITY, PUBLIC',
 'ARTICLE 4 — INTELLECTUAL PROPERTY',
 'ARTICLE 5 — LIABILITY',
 'ARTICLE 6 — GOVERNANCE',
 'ARTICLE 7 — MILESTONES & PAYMENTS',
 'ARTICLE 8 — EXIT, TERMINATION & CONSEQUENCES',
 'ARTICLE 9 — AMENDMENTS, EVOLUTION & CHANGE']

## Method 2 — Extract the full text, then split it on the known article titles

In [49]:
from pypdf import PdfReader

def extract_document_text(pdf_path: str) -> str:
    """
    Read a PDF and return the text of all its pages as one string.

    Pages for which no text is extracted are left out. For every remaining
    page, non-breaking spaces are replaced by ordinary spaces and surrounding
    whitespace is trimmed before the pages are joined.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Page texts joined by a blank line ("\\n\\n")
    """
    raw_pages = (pdf_page.extract_text() for pdf_page in PdfReader(pdf_path).pages)

    # The filter runs on the raw text, so a whitespace-only page still
    # contributes an (empty) entry to the joined result.
    return "\n\n".join(
        raw_text.replace("\xa0", " ").strip()
        for raw_text in raw_pages
        if raw_text
    )


In [50]:
# Full text of the agreement as a single string (page texts separated by blank lines).
document_text = extract_document_text(PDF_PATH)

In [51]:
document_text

'ARTICLE 2 — OBLIGATIONS\n2.1 General Obligations. Each Party shall perform its obligations under this Agreement in good faith,\nexercising reasonable best efforts and acting in compliance with all applicable aviation, corporate, labor,\nand competition laws. The obligations contained herein are continuing in nature and shall apply\nthroughout the Transition Period.\n2.2 Asset and Route Transfer Obligations. AirAsia shall identify, document, and make available for\ntransfer all aircraft, routes, landing slots, parking rights, and operational permissions approved for\ntransition. Vistara shall assume operational control only upon certification of regulatory readiness and\ncompletion of applicable circuit milestones.\n2.3 Transitional Cooperation. The Parties shall cooperate fully to ensure uninterrupted operations,\nincluding transfer of manuals, maintenance logs, crew rosters, training records, and safety\ndocumentation. No Party shall unreasonably withhold information required for int

In [52]:
import re
import json

def build_article_json_from_toc(document_text: str, toc: list[str]) -> dict:
    """
    Slice the document into articles using an ordered list of headings.

    Each heading is matched literally (regex-escaped) and case-insensitively.
    An article's text is everything between the first occurrence of its heading
    and the next occurrence of the following heading in `toc`; the last heading
    runs to the end of the document.

    Args:
        document_text (str): Full document text to search.
        toc (list[str]): Article headings in document order.

    Returns:
        dict: {heading: article text with surrounding whitespace trimmed}.
        A heading maps to "" when the heading itself, or the heading that is
        supposed to follow it, cannot be found.
    """
    search_flags = re.DOTALL | re.IGNORECASE

    # Literal-match form of every heading, paired with its successor;
    # the final heading has no successor (None).
    literal_headings = list(map(re.escape, toc))
    successors = literal_headings[1:] + [None]

    sections = {}
    for heading, opener, closer in zip(toc, literal_headings, successors):
        if closer is None:
            # Last article: greedy capture through the end of the document.
            body_regex = f"{opener}(.*)"
        else:
            # Lazy capture stops at the first occurrence of the next heading.
            body_regex = f"{opener}(.*?){closer}"

        found = re.search(body_regex, document_text, search_flags)
        sections[heading] = found.group(1).strip() if found else ""

    return sections



In [53]:
# Re-slice the full document text using the headings collected in toc_list.
# NOTE: this rebinds article_json, replacing the result of the heading-based extraction.
article_json = build_article_json_from_toc(document_text, toc_list)


In [54]:
article_json 

{'ARTICLE 2 — OBLIGATIONS': '2.1 General Obligations. Each Party shall perform its obligations under this Agreement in good faith,\nexercising reasonable best efforts and acting in compliance with all applicable aviation, corporate, labor,\nand competition laws. The obligations contained herein are continuing in nature and shall apply\nthroughout the Transition Period.\n2.2 Asset and Route Transfer Obligations. AirAsia shall identify, document, and make available for\ntransfer all aircraft, routes, landing slots, parking rights, and operational permissions approved for\ntransition. Vistara shall assume operational control only upon certification of regulatory readiness and\ncompletion of applicable circuit milestones.\n2.3 Transitional Cooperation. The Parties shall cooperate fully to ensure uninterrupted operations,\nincluding transfer of manuals, maintenance logs, crew rosters, training records, and safety\ndocumentation. No Party shall unreasonably withhold information required for 

## Synopsis Generation

In [None]:
# Prompt template for a single article's synopsis. The {article_name} and
# {article_text} placeholders are filled with str.format in generate_article_synopsis.
GENERIC_SYNOPSIS_PROMPT = """
You are a legal analyst.

Your task is to write a concise, neutral synopsis of the given Article from a legal agreement.

Rules:
- Do NOT add new information
- Do NOT interpret intent beyond the text
- Do NOT use legal advice language
- Use clear, professional, business-friendly wording
- Keep the synopsis between 4–6 sentences
- Focus on purpose, scope, key obligations, and consequences

Article Name:
{article_name}

Article Text:
{article_text}

Return ONLY the synopsis text.
"""


from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Shared chat model client used by the synopsis and refinement helpers in this notebook.
# NOTE(review): no API key is passed here — presumably it is read from the environment; confirm.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

def generate_article_synopsis(article_name: str, article_text: str) -> str:
    """
    Ask the chat model for a synopsis of one article.

    Args:
        article_name (str): Heading of the article, inserted into the prompt.
        article_text (str): Body text of the article, inserted into the prompt.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    prompt = GENERIC_SYNOPSIS_PROMPT.format(
        article_name=article_name,
        article_text=article_text
    )

    # Calling the model object directly (`llm([...])`) goes through the
    # deprecated BaseChatModel.__call__ and emits a deprecation warning;
    # `invoke` is the supported entry point and returns the same message object.
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content.strip()


def generate_document_synopsis(article_json: dict) -> dict:
    """
    Produce a synopsis for every article in the mapping.

    Args:
        article_json (dict): {article heading: article text}

    Returns:
        dict: {article heading: synopsis}, in the same order as the input.
        Articles whose text is empty or whitespace-only map to "" and are
        not sent to generate_article_synopsis.
    """
    return {
        heading: (
            generate_article_synopsis(article_name=heading, article_text=body)
            if body.strip()
            else ""
        )
        for heading, body in article_json.items()
    }




In [26]:
# article_json already created earlier.
# NOTE(review): article_json is assigned by both the heading-based extraction cells
# and the TOC-based cell, so which version is summarized here depends on the order
# in which the cells were executed.
synopsis_json = generate_document_synopsis(article_json)

# Persist the per-article synopses as pretty-printed UTF-8 JSON.
import json
with open("agreement_synopsis.json", "w", encoding="utf-8") as f:
    json.dump(synopsis_json, f, indent=2, ensure_ascii=False)

print("Synopsis generation complete")

  response = llm([HumanMessage(content=prompt)])


Synopsis generation complete


## Synopsis Creation

In [55]:
def concatenate_article_synopses(synopsis_json: dict) -> str:
    """
    Merge per-article synopses into one text block.

    Each non-blank synopsis becomes a section of the form
    "=== <article> ===" followed by the trimmed synopsis on the next line;
    sections are separated by a blank line. Articles whose synopsis is empty
    or whitespace-only are omitted.

    Args:
        synopsis_json (dict): {Article Name: Synopsis}

    Returns:
        str: Consolidated text ("" when there is nothing to include)
    """
    # Trim once, then drop the entries that end up empty.
    trimmed = ((name, body.strip()) for name, body in synopsis_json.items())
    sections = (f"=== {name} ===\n{body}" for name, body in trimmed if body)
    return "\n\n".join(sections)
    
# Merge the per-article synopses into one text block; it is displayed and then refined by the cells that follow.
consolidated_synopses = concatenate_article_synopses(synopsis_json)


In [56]:
consolidated_synopses

'=== ARTICLE 2 — OBLIGATIONS ===\nArticle 2 outlines the obligations of both Parties during the Transition Period. Each Party is required to act in good faith and comply with relevant laws while fulfilling their ongoing responsibilities. AirAsia must facilitate the transfer of specified assets and operational controls to Vistara, which will only assume control after meeting regulatory requirements. The Article details the structured transition of employees, specifying eligibility criteria and compensation arrangements, as well as the need for compliance with Vistara\'s policies. Additionally, it establishes audit rights for Vistara and outlines consequences for non-performance, including the potential suspension of integration activities or termination of the Agreement.\n\n=== ARTICLE 3 — CONFIDENTIALITY, PUBLIC ===\nArticle 3 outlines the obligations related to confidentiality, public communications, ethical conduct, and insider trading in the context of a merger and its subsequent op

In [57]:
from IPython.display import display, Markdown

# Render a section heading and the consolidated synopses as Markdown in the cell output.
display(Markdown("## Consolidated Article Synopses"))
display(Markdown(consolidated_synopses))

## Consolidated Article Synopses

=== ARTICLE 2 — OBLIGATIONS ===
Article 2 outlines the obligations of both Parties during the Transition Period. Each Party is required to act in good faith and comply with relevant laws while fulfilling their ongoing responsibilities. AirAsia must facilitate the transfer of specified assets and operational controls to Vistara, which will only assume control after meeting regulatory requirements. The Article details the structured transition of employees, specifying eligibility criteria and compensation arrangements, as well as the need for compliance with Vistara's policies. Additionally, it establishes audit rights for Vistara and outlines consequences for non-performance, including the potential suspension of integration activities or termination of the Agreement.

=== ARTICLE 3 — CONFIDENTIALITY, PUBLIC ===
Article 3 outlines the obligations related to confidentiality, public communications, ethical conduct, and insider trading in the context of a merger and its subsequent operations. It defines "Confidential Information" and specifies exclusions, emphasizing that such information must only be used for the purposes of the agreement and not for personal gain. Access to this information is restricted on a need-to-know basis, and a structured framework for public communications is established, categorizing permissible, restricted, and prohibited actions. The article also addresses the handling of Material Non-Public Information (MNPI) and imposes insider trading restrictions during and after employment. Breaches of these obligations may lead to significant consequences, including termination and legal action, with confidentiality obligations surviving for five years post-termination.

=== ARTICLE 4 — INTELLECTUAL PROPERTY ===
Article 4 addresses the management of intellectual property (IP) related to the merger and subsequent operations. It establishes that each party retains ownership of their pre-existing IP while granting Vistara a non-exclusive, royalty-free license to use AirAsia's IP for specific operational purposes. Any IP created during the integration process will belong to Vistara, and AirAsia's branding will have limited use post-merger. The article also outlines the handling of operational data and imposes restrictions on AirAsia regarding the use of Vistara's IP. Additionally, it mandates that both parties notify each other of any suspected IP infringement, with Vistara holding the primary enforcement rights. The provisions of this article will remain in effect even after the agreement's termination.

=== ARTICLE 5 — LIABILITY ===
Article 5 outlines the allocation of liabilities and responsibilities between the Parties concerning pre-merger, transition, and post-merger activities. AirAsia retains responsibility for all liabilities incurred before the Effective Date, including employee claims and regulatory penalties. Vistara will not assume any liabilities related to vendors providing catering and food services, but will take on airport-related fees and charges associated with AirAsia operations. Employee claims will be managed based on the timing of their transition, with Vistara assuming responsibility for claims arising after the transition date. The Article also includes indemnification provisions for both Parties and establishes a claims process, with a limitation on liability for indirect damages, and specifies that these provisions will survive for seven years post-termination of the Agreement.

=== ARTICLE 6 — GOVERNANCE ===
Article 6 outlines the governance structure for the merged operations, detailing the composition of the Board of Directors, decision-making authority, and oversight mechanisms. The Board will consist of ten directors, with seven appointed by the Parent Company and three by AirAsia, and the Chairperson will be nominated by the Parent Company, holding a casting vote in case of ties. Directors will serve a two-year term, with specific matters requiring the affirmative approval of Parent Company nominees. The Article also establishes the formation of Board Committees, mandates the disclosure of conflicts of interest, and emphasizes compliance with relevant laws and regulations. Breaches of this Article are considered material, allowing for remedies, and governance provisions will survive termination to ensure the enforcement of decisions made during the Agreement's term.

=== ARTICLE 7 — MILESTONES & PAYMENTS ===
Article 7 outlines the milestone-driven payment structure between Vistara and AirAsia related to the phased transfer of routes, assets, and employee integration. Payments are contingent upon the successful completion of specified integration phases, which are divided into three circuits, and are released in five tranches based on the achievement of defined milestones. The completion of each phase must be certified by the Joint Integration Committee, and employee transitions must follow a specified order. Delays caused by either party may affect the timing of payments, but do not automatically accelerate payment obligations. Additionally, Vistara retains the right to audit relevant records, and all payments are subject to applicable tax withholdings.

=== ARTICLE 8 — EXIT, TERMINATION & CONSEQUENCES ===
Article 8 outlines the terms regarding the exit, termination, and consequences of the Agreement. It establishes that the Agreement commences on the Effective Date and continues until all integration activities and payments are completed, unless terminated for material breach or by Vistara or AirAsia under specified conditions. Vistara may exit the transaction, resulting in the cessation of employee transitions and forfeiture of unpaid payment tranches, while AirAsia can exit only due to a material breach by Vistara, leading to forfeiture of unpaid tranches and board representation rights. The Article also clarifies that transferred assets and routes do not revert upon termination, and payments made prior to termination are non-refundable, except in cases of fraud or willful misconduct. Confidentiality and intellectual property obligations will survive termination, and the Parties are required to cooperate with regulators for an orderly transition or unwinding of operations.

=== ARTICLE 9 — AMENDMENTS, EVOLUTION & CHANGE ===
Article 9 outlines the evolution and amendment process of the Agreement, serving as a reference for interpretation and analysis. It details the incremental introduction of amendments aimed at clarifying scope, reallocating risk, and codifying operational decisions across three drafting rounds. Each round established specific principles and obligations, impacting various articles related to employee policies, confidentiality, operational structuring, and governance. The Article emphasizes that amendments do not imply waivers of rights and mandates that future changes must be documented in writing and approved according to governance provisions. Additionally, it clarifies that this Article will remain effective even after the Agreement's termination.

In [37]:
def build_prompt(consolidated_markdown: str) -> str:
    """
    Build the refinement prompt sent to the chat model.

    The prompt instructs the model to return JSON only, with the keys
    "key_areas_discussed", "clauses_to_read_together" and
    "overall_refined_synopsis", and appends the supplied text after the
    "Input:" marker.

    Args:
        consolidated_markdown (str): Text to refine; inserted verbatim at the
            end of the prompt.

    Returns:
        str: The complete prompt. The doubled braces in the template are
        f-string escapes and appear as single literal braces in the result.
    """
    return f"""
You are a legal analyst refining a consolidated synopsis of a legal agreement.

Tasks:
1. Identify key thematic areas discussed
2. Identify Articles that should be read together
3. Produce a refined, high-level synopsis

Rules:
- Do NOT add new information
- Use ONLY the provided content
- Be neutral and factual
- Output VALID JSON ONLY
- No explanations, no markdown

Required JSON structure:
{{
  "key_areas_discussed": [
    {{
      "area": "string",
      "description": "string",
      "related_articles": ["ARTICLE X — TITLE"]
    }}
  ],
  "clauses_to_read_together": [
    {{
      "theme": "string",
      "articles": ["ARTICLE X — TITLE"],
      "reason": "string"
    }}
  ],
  "overall_refined_synopsis": "string"
}}

Input:
{consolidated_markdown}
"""

# 4️⃣ Safe JSON extraction: pull the JSON object out of the raw LLM response text
def extract_json(text: str) -> dict:
    """
    Extracts the first JSON object found in text.

    The text is scanned for "{" characters and a JSON object is decoded
    starting at each one in turn; the first object that decodes successfully
    is returned. Anything before or after that object (explanations, code
    fences, further objects, stray braces) is ignored.

    Args:
        text (str): Raw text that is expected to contain a JSON object.

    Returns:
        dict: The first decodable JSON object.

    Raises:
        ValueError: If the text contains no "{" at all.
        json.JSONDecodeError: (a ValueError subclass) If the text contains "{"
            but no position decodes to a valid object; the error from the
            first attempt is raised.
    """
    decoder = json.JSONDecoder()
    first_error = None

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # tolerates trailing content, unlike json.loads.
            parsed, _end = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
        start = text.find("{", start + 1)

    if first_error is not None:
        raise first_error
    raise ValueError("❌ No JSON object found in LLM output")

# 5️⃣ Refinement function
def refine_consolidated_synopsis(consolidated_markdown: str) -> dict:
    """
    Send the consolidated synopses to the chat model and parse its JSON reply.

    Args:
        consolidated_markdown (str): Consolidated synopsis text; embedded in
            the prompt produced by build_prompt.

    Returns:
        dict: The JSON object extracted from the model's reply by extract_json.

    Raises:
        ValueError: Propagated from extract_json when the reply contains no
            parseable JSON object.
    """
    prompt = build_prompt(consolidated_markdown)
    # `llm([...])` relies on the deprecated BaseChatModel.__call__ and emits a
    # deprecation warning; `invoke` is the supported equivalent.
    response = llm.invoke([HumanMessage(content=prompt)])
    return extract_json(response.content)

# 6️⃣ Run the refinement on the consolidated synopses (this sends a request to the chat model).
refined_output = refine_consolidated_synopsis(consolidated_synopses)


In [41]:
print(refined_output)

{'key_areas_discussed': [{'area': 'Obligations and Responsibilities', 'description': 'Details the obligations of both Parties during the Transition Period, including good faith actions, asset transfers, employee transitions, and compliance requirements.', 'related_articles': ['ARTICLE 2 — OBLIGATIONS', 'ARTICLE 5 — LIABILITY', 'ARTICLE 7 — MILESTONES & PAYMENTS']}, {'area': 'Confidentiality and Ethical Conduct', 'description': 'Outlines the obligations related to confidentiality, public communications, and insider trading, emphasizing the handling of confidential information and consequences for breaches.', 'related_articles': ['ARTICLE 3 — CONFIDENTIALITY, PUBLIC', 'ARTICLE 8 — EXIT, TERMINATION & CONSEQUENCES']}, {'area': 'Intellectual Property Management', 'description': 'Addresses the ownership and licensing of intellectual property, including the rights and responsibilities of both Parties regarding pre-existing and newly created IP.', 'related_articles': ['ARTICLE 4 — INTELLECTUA

In [42]:
print(json.dumps(refined_output, indent=2))

{
  "key_areas_discussed": [
    {
      "area": "Obligations and Responsibilities",
      "description": "Details the obligations of both Parties during the Transition Period, including good faith actions, asset transfers, employee transitions, and compliance requirements.",
      "related_articles": [
        "ARTICLE 2 \u2014 OBLIGATIONS",
        "ARTICLE 5 \u2014 LIABILITY",
        "ARTICLE 7 \u2014 MILESTONES & PAYMENTS"
      ]
    },
    {
      "area": "Confidentiality and Ethical Conduct",
      "description": "Outlines the obligations related to confidentiality, public communications, and insider trading, emphasizing the handling of confidential information and consequences for breaches.",
      "related_articles": [
        "ARTICLE 3 \u2014 CONFIDENTIALITY, PUBLIC",
        "ARTICLE 8 \u2014 EXIT, TERMINATION & CONSEQUENCES"
      ]
    },
    {
      "area": "Intellectual Property Management",
      "description": "Addresses the ownership and licensing of intellectual pr