In [1]:
import pandas as pd
import sqlite3
from collections import Counter
import json

In [3]:
# Read the CUAD master-clauses table into a DataFrame.
cuad_df = pd.read_csv(filepath_or_buffer="CUAD_v1/master_clauses.csv")

# List the column labels of the loaded table.
print(cuad_df.columns)

Index(['Filename', 'Document Name', 'Document Name-Answer', 'Parties',
       'Parties-Answer', 'Agreement Date', 'Agreement Date-Answer',
       'Effective Date', 'Effective Date-Answer', 'Expiration Date',
       'Expiration Date-Answer', 'Renewal Term', 'Renewal Term-Answer',
       'Notice Period To Terminate Renewal',
       'Notice Period To Terminate Renewal- Answer', 'Governing Law',
       'Governing Law-Answer', 'Most Favored Nation',
       'Most Favored Nation-Answer', 'Competitive Restriction Exception',
       'Competitive Restriction Exception-Answer', 'Non-Compete',
       'Non-Compete-Answer', 'Exclusivity', 'Exclusivity-Answer',
       'No-Solicit Of Customers', 'No-Solicit Of Customers-Answer',
       'No-Solicit Of Employees', 'No-Solicit Of Employees-Answer',
       'Non-Disparagement', 'Non-Disparagement-Answer',
       'Termination For Convenience', 'Termination For Convenience-Answer',
       'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn-Answer', 'Change Of Control',
      

In [5]:
# Preview the first five rows of the master-clauses table as plain text.
print(cuad_df.head(n=5))

                                            Filename  \
0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
1  EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
2  FulucaiProductionsLtd_20131223_10-Q_EX-10.9_83...   
3  GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10...   
4  IdeanomicsInc_20160330_10-K_EX-10.26_9512211_E...   

                                    Document Name  \
0               ['MARKETING AFFILIATE AGREEMENT']   
1   ['VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT']   
2  ['CONTENT DISTRIBUTION AND LICENSE AGREEMENT']   
3           ['WEBSITE CONTENT LICENSE AGREEMENT']   
4                   ['CONTENT LICENSE AGREEMENT']   

                         Document Name-Answer  \
0               MARKETING AFFILIATE AGREEMENT   
1   VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT   
2  CONTENT DISTRIBUTION AND LICENSE AGREEMENT   
3           WEBSITE CONTENT LICENSE AGREEMENT   
4                   CONTENT LICENSE AGREEMENT   

                                             Part

In [7]:

# Load the raw CUAD JSON file (update path accordingly)
json_path = "CUAD_v1/CUAD_v1.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the top-level keys
print("Top-level keys:", data.keys())

# Display structure of the first entry under "data"
print(json.dumps(data["data"][0], indent=4))


Top-level keys: dict_keys(['version', 'data'])
{
    "title": "LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT",
    "paragraphs": [
        {
            "qas": [
                {
                    "answers": [
                        {
                            "text": "DISTRIBUTOR AGREEMENT",
                            "answer_start": 44
                        }
                    ],
                    "id": "LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT__Document Name",
                    "question": "Highlight the parts (if any) of this contract related to \"Document Name\" that should be reviewed by a lawyer. Details: The name of the contract",
                    "is_impossible": false
                },
                {
                    "answers": [
                        {
                            "text": "Distributor",
                            "answer_start": 244
                        },
                        {
                            "t

In [9]:
# Load the original CUAD JSON file
with open("CUAD_v1/CUAD_v1.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)


def _title_key(contract):
    """Sort key for a contract entry: its "title", or "" when the key is absent."""
    return contract.get("title", "")


def _assign_doc_ids(contracts):
    """Sort `contracts` in place by title and number them 0..n-1 under "DocID"."""
    contracts.sort(key=_title_key)
    for i, contract in enumerate(contracts):
        contract["DocID"] = i


# Locate the list of contracts, sort it, and add DocID.
if isinstance(original_data, list):
    # Sort the list itself (not a temporary copy) so the saved file is in the
    # same order as the DocIDs.
    _assign_doc_ids(original_data)
    sorted_data = original_data

elif isinstance(original_data, dict):
    # Use the first top-level value that is a list as the document list.
    for key, value in original_data.items():
        if isinstance(value, list):
            # `value` is the same list object stored in the dict, so the
            # in-place sort and DocID assignment are visible via original_data.
            _assign_doc_ids(value)
            sorted_data = original_data
            break
    else:
        # Raise instead of calling exit(): exit() would stop the kernel, and
        # an exception also prevents the output file from being written.
        raise ValueError("❌ Error: No list found in the JSON structure.")

else:
    # Fail before opening the output file; otherwise it would be truncated
    # and `sorted_data` would be undefined.
    raise TypeError(
        f"Unsupported top-level JSON type: {type(original_data).__name__}"
    )

# Save the updated JSON with DocID
with open("sorted_MasterCUAD.json", "w", encoding="utf-8") as f:
    json.dump(sorted_data, f, indent=4)

print("✅ Sorting and DocID addition completed successfully!")


✅ Sorting and DocID addition completed successfully!


In [11]:

# Load the sorted JSON file (update path accordingly)
json_path = "sorted_MasterCUAD.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the top-level keys
print("Top-level keys:", data.keys())

# Display structure of the second entry (index 1) under "data"
print(json.dumps(data["data"][1], indent=4))

Top-level keys: dict_keys(['version', 'data'])
{
    "title": "ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT",
    "paragraphs": [
        {
            "qas": [
                {
                    "answers": [
                        {
                            "text": "Services Agreement",
                            "answer_start": 26176
                        }
                    ],
                    "id": "ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT__Document Name",
                    "question": "Highlight the parts (if any) of this contract related to \"Document Name\" that should be reviewed by a lawyer. Details: The name of the contract",
                    "is_impossible": false
                },
                {
                    "answers": [
                        {
                            "text": "Each of the foregoing parties is referred to herein as a \"Party\" and together as the \"Parties\".",
                            "answer_start": 676
 

In [None]:
# Step 1: read the master-clauses CSV.
csv_file_path = "CUAD_v1/master_clauses.csv"   # Update this if needed
csv_data = pd.read_csv(csv_file_path)

# Step 2: standardize column labels (trim, lower-case, spaces -> underscores).
csv_data.columns = csv_data.columns.str.strip().str.lower().str.replace(" ", "_")

# Step 3: the key column must exist after standardization.
expected_col = "filename"
if expected_col not in csv_data.columns:
    raise ValueError(f"CSV file is missing the expected '{expected_col}' column!")

# Step 4: grouping key. This is currently a plain copy of "filename"; no
# normalization is applied.
csv_data["normalized_filename"] = csv_data["filename"]

# Step 5: build one contract entry per filename, one paragraph per CSV row,
# and one QA per remaining column.
non_question_columns = ("filename", "normalized_filename", "clause_text")
empty_answer_markers = ("[]", "nan", "NaN", "", "None")
question_columns = [
    column for column in csv_data.columns if column not in non_question_columns
]

csv_json_structure = {"data": []}
grouped_by_file = csv_data.groupby("normalized_filename")
for doc_id, (filename, rows) in enumerate(grouped_by_file):
    paragraphs = []

    for _, record in rows.iterrows():
        qas = []

        for column in question_columns:
            answer_text = str(record[column]).strip()
            # A cell whose stripped string form is one of the markers counts
            # as "no answer".
            has_answer = answer_text not in empty_answer_markers

            qas.append({
                "id": f"{filename}__{column}",
                "question": f"Highlight the parts (if any) of this contract related to \"{column}\".",
                "is_impossible": not has_answer,
                # answer_start is always 0 here; no offset is computed.
                "answers": [{"text": answer_text, "answer_start": 0}] if has_answer else [],
            })

        paragraphs.append({
            "qas": qas,
            "context": record["clause_text"] if "clause_text" in record else "",
        })

    csv_json_structure["data"].append({
        "DocID": doc_id,
        "title": filename,
        "paragraphs": paragraphs,
    })

# Step 6: write the converted structure to disk.
json_output_path = "converted_csv5.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(csv_json_structure, f, indent=4)

print(f"✅ CSV successfully converted and saved as {json_output_path}")

In [151]:
import json

# Load the converted CSV JSON file (update path accordingly)
json_path = "converted_csv5.json"

# Decode explicitly as UTF-8, matching the encoding the file was written with.
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the top-level keys
print("Top-level keys:", data.keys())

# Display structure of the first entry under "data"
print(json.dumps(data["data"][0], indent=4))

Top-level keys: dict_keys(['data'])
{
    "DocID": 0,
    "title": "2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.pdf",
    "paragraphs": [
        {
            "qas": [
                {
                    "id": "2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.pdf__document_name",
                    "question": "Highlight the parts (if any) of this contract related to \"document_name\".",
                    "is_impossible": false,
                    "answers": [
                        {
                            "text": "['CO-BRANDING AND ADVERTISING AGREEMENT']",
                            "answer_start": 0
                        }
                    ]
                },
                {
                    "id": "2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.pdf__document_name-answer",
                    "question": "High

In [13]:
import json

# Load the original (DocID-annotated) JSON file.
# NOTE(review): no visible cell writes "sorted_with_is.json" (the sorting cell
# writes "sorted_MasterCUAD.json") — confirm where this file comes from.
with open("sorted_with_is.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)

# Load the JSON converted from the CSV
with open("converted_csv5.json", "r", encoding="utf-8") as f:
    converted_data = json.load(f)

# Unwrap the document list when the file is a {"data": [...]} object
if isinstance(converted_data, dict):
    converted_data = converted_data.get("data", [])  # Adjust based on actual key

if isinstance(original_data, dict):
    original_data = original_data.get("data", [])  # Adjust based on actual key

# Map DocID -> converted paragraphs for fast lookup.
# NOTE(review): DocIDs on the two sides are positions in two independently
# sorted lists (JSON "title" vs. CSV "filename"), so equal DocIDs presumably do
# not guarantee the same contract — verify before relying on the merge.
converted_qas_map = {
    doc["DocID"]: doc.get("paragraphs", [])
    for doc in converted_data
    if "DocID" in doc
}

# Attach the converted QAs to every paragraph of the matching original document
for original_doc in original_data:
    doc_id = original_doc.get("DocID")
    # Compare against None rather than truthiness: DocID 0 is a valid ID and
    # must not be skipped.
    if doc_id is None or doc_id not in converted_qas_map:
        continue

    converted_paragraphs = converted_qas_map[doc_id]
    if not converted_paragraphs:
        continue

    # Collect the QAs of all converted paragraphs; assigning them one
    # paragraph at a time would keep only the last paragraph's QAs.
    csv_qas = [
        qa
        for converted_para in converted_paragraphs
        for qa in converted_para.get("qas", [])
    ]

    for original_para in original_doc.get("paragraphs", []):
        # Stored under 'CSVqas', at the same level as 'qas'; each paragraph
        # gets its own list object.
        original_para["CSVqas"] = list(csv_qas)

# Save the merged output
with open("merged_outputv5.json", "w", encoding="utf-8") as f:
    json.dump({"documents": original_data}, f, indent=4)

print("✅ Merging completed! 'CSVqas' added successfully to the original JSON.")


✅ Merging completed! 'CSVqas' added successfully to the original JSON.
