In [11]:
import json

# Load the CUAD dataset from its JSON file.
# The encoding is pinned to UTF-8: JSON interchange text is UTF-8, whereas open()
# without an explicit encoding falls back to the platform locale (e.g. cp1252 on
# Windows), which can raise UnicodeDecodeError or mis-decode non-ASCII characters.
with open('../data/CUADv1.json', 'r', encoding='utf-8') as f:
    cuad_data = json.load(f)

# Check the structure: top-level keys and the number of items under 'data'
print(f"Keys in dataset: {cuad_data.keys()}")
print(f"Number of entries: {len(cuad_data['data'])}")

Keys in dataset: dict_keys(['version', 'data'])
Number of entries: 510


In [12]:
# Peek at the first dataset entry: its title, how many paragraphs it holds,
# and which keys the first paragraph exposes.
first_entry = cuad_data['data'][0]
print(
    f"Title: {first_entry['title']}",
    f"\nNumber of paragraphs: {len(first_entry['paragraphs'])}",
    f"\nFirst paragraph keys: {first_entry['paragraphs'][0].keys()}",
    sep="\n",
)

Title: LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT

Number of paragraphs: 1

First paragraph keys: dict_keys(['qas', 'context'])


In [13]:
# Inspect the first paragraph of the first entry: size of its 'context' text,
# number of attached questions, and the first question with its answers.
first_para = first_entry['paragraphs'][0]

print(
    f"Contract text length: {len(first_para['context'])} characters",
    f"\nNumber of questions (clause types): {len(first_para['qas'])}",
    "\nFirst question example:",
    f"Question: {first_para['qas'][0]['question']}",
    f"Answer: {first_para['qas'][0]['answers']}",
    sep="\n",
)

Contract text length: 54290 characters

Number of questions (clause types): 41

First question example:
Question: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
Answer: [{'text': 'DISTRIBUTOR AGREEMENT', 'answer_start': 44}]


In [15]:
# Tally, per question (clause type), how many paragraphs carry at least one answer.
# A question is registered the first time it is seen, even when it has no answers,
# so every clause type ends up in the table (possibly with a count of 0).
clause_counts = {}

for entry in cuad_data['data']:
    for para in entry['paragraphs']:
        for qa in para['qas']:
            question = qa['question']
            has_answer = len(qa['answers']) > 0
            # int(True) == 1 and int(False) == 0, so unanswered questions add nothing
            clause_counts[question] = clause_counts.get(question, 0) + int(has_answer)

# Rank clause types by count, highest first; the sort is stable, so ties keep
# the order in which the questions were first encountered.
sorted_clauses = list(clause_counts.items())
sorted_clauses.sort(key=lambda item: item[1], reverse=True)

# Show the ten most frequent clause types
print("Top 10 most common clause types:\n")
for clause, count in sorted_clauses[:10]:
    print(f"{count} contracts: {clause}")


Top 10 most common clause types:

510 contracts: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
509 contracts: Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract
470 contracts: Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract
437 contracts: Highlight the parts (if any) of this contract related to "Governing Law" that should be reviewed by a lawyer. Details: Which state/country's law governs the interpretation of the contract?
413 contracts: Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?
390 contracts: Highlight the parts (if any) of this contract related to "Effe