In [73]:
import json

# Load the CUAD dataset.
# The encoding is passed explicitly: JSON text is expected to be UTF-8, and
# without it open() falls back to the platform's locale encoding, which can
# mis-decode (or fail on) non-ASCII characters in the contract text.
with open('../data/CUADv1.json', 'r', encoding='utf-8') as f:
    cuad_data = json.load(f)

# Check the structure: top-level keys and the number of entries under 'data'
print(f"Keys in dataset: {cuad_data.keys()}")
print(f"Number of entries: {len(cuad_data['data'])}")

Keys in dataset: dict_keys(['version', 'data'])
Number of entries: 510


In [74]:
# Peek at the first record under 'data': its title and its paragraph list
first_entry = cuad_data['data'][0]
print(f"Title: {first_entry['title']}")

first_entry_paragraphs = first_entry['paragraphs']
print(f"\nNumber of paragraphs: {len(first_entry_paragraphs)}")
print(f"\nFirst paragraph keys: {first_entry_paragraphs[0].keys()}")

Title: LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT

Number of paragraphs: 1

First paragraph keys: dict_keys(['qas', 'context'])


In [75]:
# Inspect the first paragraph: the contract text ('context') and its question list ('qas')
first_para = first_entry['paragraphs'][0]

context_length = len(first_para['context'])
print(f"Contract text length: {context_length} characters")

question_count = len(first_para['qas'])
print(f"\nNumber of questions (clause types): {question_count}")

# Show the first question together with its annotated answers
print("\nFirst question example:")
first_qa = first_para['qas'][0]
print(f"Question: {first_qa['question']}")
print(f"Answer: {first_qa['answers']}")

Contract text length: 54290 characters

Number of questions (clause types): 41

First question example:
Question: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
Answer: [{'text': 'DISTRIBUTOR AGREEMENT', 'answer_start': 44}]


In [76]:
# Count how many contracts have answers for each clause type.
# A contract is counted at most once per question, even if its text is split
# across several paragraphs that each carry an answer for that question.
clause_counts = {}

for entry in cuad_data['data']:
    # Questions answered anywhere in this contract
    answered_questions = set()
    for para in entry['paragraphs']:
        for qa in para['qas']:
            question = qa['question']
            # Register every question so clause types with no answers show up as 0
            clause_counts.setdefault(question, 0)
            if qa['answers']:
                answered_questions.add(question)

    for question in answered_questions:
        clause_counts[question] += 1

# Show top 10 most common clauses (sorted() is stable, so ties keep first-seen order)
sorted_clauses = sorted(clause_counts.items(), key=lambda x: x[1], reverse=True)
print("Top 10 most common clause types:\n")
for clause, count in sorted_clauses[:10]:
    print(f"{count} contracts: {clause}")


Top 10 most common clause types:

510 contracts: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
509 contracts: Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract
470 contracts: Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract
437 contracts: Highlight the parts (if any) of this contract related to "Governing Law" that should be reviewed by a lawyer. Details: Which state/country's law governs the interpretation of the contract?
413 contracts: Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?
390 contracts: Highlight the parts (if any) of this contract related to "Effe

In [87]:
# Define target clause types.
# All six questions share one sentence template; only the quoted clause name and
# the text after "Details:" differ. The detail strings are kept exactly as
# written, including the trailing space after "effective" and the typographic
# apostrophe in "party’s".
_QUESTION_TEMPLATE = (
    'Highlight the parts (if any) of this contract related to "{name}" '
    'that should be reviewed by a lawyer. Details: {details}'
)

_CLAUSE_DETAILS = [
    ("Governing Law",
     "Which state/country's law governs the interpretation of the contract?"),
    ("Expiration Date",
     "On what date will the contract's initial term expire?"),
    ("Effective Date",
     "The date when the contract is effective "),
    ("Anti-Assignment",
     "Is consent or notice required of a party if the contract is assigned to a third party?"),
    ("Cap On Liability",
     "Does the contract include a cap on liability upon the breach of a party’s obligation? "
     "This includes time limitation for the counterparty to bring claims or maximum amount for recovery."),
    ("License Grant",
     "Does the contract contain a license granted by one party to its counterparty?"),
]

target_clauses = [
    _QUESTION_TEMPLATE.format(name=name, details=details)
    for name, details in _CLAUSE_DETAILS
]

print(f"Selected {len(target_clauses)} clause types for extraction")


Selected 6 clause types for extraction


In [97]:
# Extract data using partial matching on clause names: a question is kept when
# it contains one of the names below wrapped in double quotes.
clause_names = ["Governing Law", "Expiration Date", "Effective Date",
                "Anti-Assignment", "Cap On Liability", "License Grant"]


def mentions_target_clause(question_text):
    """Return True if the question contains any name from clause_names in double quotes."""
    return any(f'"{name}"' in question_text for name in clause_names)


filtered_data = []

for entry in cuad_data['data']:
    contract_id = entry['title']
    for para in entry['paragraphs']:
        context = para['context']

        for qa in para['qas']:
            if not mentions_target_clause(qa['question']):
                continue
            # One record per matching question, regardless of how many names it matches
            record = {
                'contract_id': contract_id,
                'context': context,
                'question': qa['question'],
                'answers': qa['answers'],
            }
            filtered_data.append(record)

print(f"Total examples: {len(filtered_data)}")


Total examples: 3060


In [98]:
# Tally the filtered examples by their question text (one question per clause type)
from collections import Counter

clause_distribution = Counter(item['question'] for item in filtered_data)

print("Examples per clause type:\n")
for question, count in clause_distribution.items():
    # The clause name is the text between the first pair of double quotes
    question_parts = question.split('"', 2)
    clause_name = question_parts[1]
    print(f"{clause_name}: {count} examples")

Examples per clause type:

Effective Date: 510 examples
Expiration Date: 510 examples
Governing Law: 510 examples
Anti-Assignment: 510 examples
License Grant: 510 examples
Cap On Liability: 510 examples


In [99]:
# Count examples with vs without answers
total_examples = len(filtered_data)
has_answer = sum(1 for item in filtered_data if len(item['answers']) > 0)
no_answer = total_examples - has_answer

# Guard the division: an empty filtered_data (e.g. no question matched the
# clause names) would otherwise raise ZeroDivisionError here.
answer_percentage = has_answer / total_examples * 100 if total_examples else 0.0

print(f"Examples WITH answers: {has_answer}")
print(f"Examples WITHOUT answers: {no_answer}")
print(f"Percentage with answers: {answer_percentage:.1f}%")

Examples WITH answers: 2144
Examples WITHOUT answers: 916
Percentage with answers: 70.1%


In [101]:
# Show one example with an answer.
# next() stops at the first match instead of building the full list of answered
# examples; the None default covers the case where no example has an answer.
example_with_answer = next(
    (item for item in filtered_data if len(item['answers']) > 0),
    None,
)

if example_with_answer is None:
    print("No examples with answers found.")
else:
    # The clause name is the text between the first pair of double quotes
    clause_name = example_with_answer['question'].split('"')[1]
    first_answer = example_with_answer['answers'][0]
    answer_text = first_answer['text']
    answer_start = first_answer['answer_start']

    print(f"Clause: {clause_name}")
    print(f"\nExtracted text: {answer_text}")
    print(f"Position in contract: character {answer_start}")


Clause: Effective Date

Extracted text: The term of this  Agreement  shall be ten (10)                            years (the "Term")  which shall  commence on the date                            upon which the Company  delivers to  Distributor  the                            last Sample, as defined  hereinafter.
Position in contract: character 5268
