In [1]:
import ast
import difflib
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE(review): `context` is kept ahead of `cont_gen`; presumably it adjusts
# sys.path so the project package can be imported - confirm before reordering.
import context
from cont_gen.utils import load_json, load_jsonl

In [2]:
# Single place to point the notebook at the CUAD v1 dataset directory.
CUAD_DIR = Path('/storage/rhshui/workspace/datasets/legal/CUAD_v1')

# Contract records: the 'data' entry of CUAD_v1.json (each record carries a 'title').
# str() keeps the argument type identical to what load_json received before.
all_data = load_json(str(CUAD_DIR / 'CUAD_v1.json'))['data']
# Clause annotation table, one row per contract file.
master = pd.read_csv(CUAD_DIR / 'master_clauses.csv')

In [3]:
class MatchTitle:
    """Map a query contract title to the position of its best match in `titles`.

    An exact match wins; otherwise the title with the highest
    `difflib.SequenceMatcher` similarity ratio to the query is chosen.
    """
    def __init__(self, titles):
        """Store the candidate titles (a list of strings) to match against."""
        self.titles = titles

    def __call__(self, query):
        """Return the position in `self.titles` of the title matching `query`.

        Args:
            query: the title string to look up.

        Returns:
            A plain Python `int`: the position of the exact match if there is
            one, otherwise the position of the most similar title (the first
            one on ties).

        Raises:
            ValueError: if there are no candidate titles.
        """
        if query in self.titles:
            return self.titles.index(query)
        if len(self.titles) == 0:
            raise ValueError('cannot match a title against an empty list of titles')
        scores = [difflib.SequenceMatcher(a = query, b = tt).ratio() for tt in self.titles]
        # np.argmax yields a numpy integer; convert so both branches return the
        # same type (numpy integers are, for example, not JSON serializable).
        return int(np.argmax(scores))

In [4]:
# Some filenames in the master table do not exactly match a contract title in the JSON data; map each filename to its closest title.
def rm_fn_suffix(s):
    """Strip a trailing '.PDF' and then a trailing '.pdf' from the string `s`."""
    for ext in ('.PDF', '.pdf'):
        if s.endswith(ext):
            s = s[:-len(ext)]
    return s
# Contract titles in dataset order; a row's 'Index' refers to a position in this list.
titles = [k['title'] for k in all_data]
title2index = {d['title']:i for i, d in enumerate(all_data)}

# Filenames from the master table with the PDF extension removed.
filenames = master['Filename'].apply(rm_fn_suffix)
# How many filenames have no exact title match and therefore need fuzzy matching.
nb_miss = sum(k not in title2index for k in filenames)
print(f'Number of missing filenames: {nb_miss}')

matcher = MatchTitle(titles)
indexes = [matcher(k) for k in filenames]
# Number of distinct contracts hit; fewer than len(master) would mean two rows
# were mapped to the same contract.
print(len(set(indexes)))
# DataFrame.insert raises ValueError if the column already exists, so drop a
# stale 'Index' column first to keep this cell re-runnable.
if 'Index' in master.columns:
    master = master.drop(columns='Index')
master.insert(0, 'Index', indexes)

master.head(3)

Number of missing filenames: 11
510


Unnamed: 0,Index,Filename,Document Name,Document Name-Answer,Parties,Parties-Answer,Agreement Date,Agreement Date-Answer,Effective Date,Effective Date-Answer,...,Liquidated Damages,Liquidated Damages-Answer,Warranty Duration,Warranty Duration-Answer,Insurance,Insurance-Answer,Covenant Not To Sue,Covenant Not To Sue-Answer,Third Party Beneficiary,Third Party Beneficiary-Answer
0,374,CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...,['MARKETING AFFILIATE AGREEMENT'],MARKETING AFFILIATE AGREEMENT,"['BIRCH FIRST GLOBAL INVESTMENTS INC.', 'MA', ...","Birch First Global Investments Inc. (""Company""...","['8th day of May 2014', 'May 8, 2014']",5/8/14,['This agreement shall begin upon the date of ...,,...,[],No,"[""COMPANY'S SOLE AND EXCLUSIVE LIABILITY FOR T...",Yes,[],No,[],No,[],No
1,191,EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...,['VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT'],VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT,"['EuroMedia Holdings Corp.', 'Rogers', 'Rogers...","Rogers Cable Communications Inc. (""Rogers""); E...","['July 11 , 2006']",7/11/06,"['July 11 , 2006']",7/11/06,...,[],No,[],No,[],No,[],No,[],No
2,230,FulucaiProductionsLtd_20131223_10-Q_EX-10.9_83...,['CONTENT DISTRIBUTION AND LICENSE AGREEMENT'],CONTENT DISTRIBUTION AND LICENSE AGREEMENT,"['Producer', 'Fulucai Productions Ltd.', 'Conv...","CONVERGTV, INC. (“ConvergTV”); Fulucai Product...","['November 15, 2012']",11/15/12,"['November 15, 2012']",11/15/12,...,[],No,[],No,[],No,[],No,[],No


In [5]:
# Show the column labels of `master`. Judging by the captured output, the
# clause columns come in pairs: '<clause type>' and '<clause type>-Answer'.
print(master.columns)

Index(['Index', 'Filename', 'Document Name', 'Document Name-Answer', 'Parties',
       'Parties-Answer', 'Agreement Date', 'Agreement Date-Answer',
       'Effective Date', 'Effective Date-Answer', 'Expiration Date',
       'Expiration Date-Answer', 'Renewal Term', 'Renewal Term-Answer',
       'Notice Period To Terminate Renewal',
       'Notice Period To Terminate Renewal- Answer', 'Governing Law',
       'Governing Law-Answer', 'Most Favored Nation',
       'Most Favored Nation-Answer', 'Competitive Restriction Exception',
       'Competitive Restriction Exception-Answer', 'Non-Compete',
       'Non-Compete-Answer', 'Exclusivity', 'Exclusivity-Answer',
       'No-Solicit Of Customers', 'No-Solicit Of Customers-Answer',
       'No-Solicit Of Employees', 'No-Solicit Of Employees-Answer',
       'Non-Disparagement', 'Non-Disparagement-Answer',
       'Termination For Convenience', 'Termination For Convenience-Answer',
       'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn-Answer', 'Change Of Control

In [6]:
def filter_nonempty_list(df, col):
    """Return the rows of `df` whose `col` cell holds a non-empty list.

    Cells are expected to be strings containing a Python list literal (for
    example "['some clause text']"). They are parsed with `ast.literal_eval`
    rather than `eval`, so file contents are never executed as code.

    Args:
        df: the DataFrame to filter.
        col: name of the column holding the stringified lists.

    Returns:
        A DataFrame with the rows of `df` where the parsed cell has at least
        one element. Missing cells (None / NaN) count as empty.

    Raises:
        ValueError or SyntaxError: if a string cell is not a valid literal.
    """
    def _has_items(cell):
        if isinstance(cell, str):
            return len(ast.literal_eval(cell)) > 0
        if cell is None or (isinstance(cell, float) and cell != cell):
            # Missing value (None or NaN): nothing was annotated for this row.
            return False
        # Already a parsed container.
        return len(cell) > 0

    # astype(bool) keeps the mask boolean even when `df` has no rows, so the
    # result always retains all of df's columns.
    mask = df[col].apply(_has_items).astype(bool)
    return df[mask]

In [7]:
# Clause type (a column of `master`) to inspect. Edit this value to browse
# another type; others tried here were 'Renewal Term',
# 'Notice Period To Terminate Renewal' (original note: 'Notice to Terminate
# Renewal' - presumably an alternate spelling of the name, confirm) and
# 'Governing Law'.
ctype = 'Most Favored Nation'

# Rows where at least one clause of this type was annotated.
part = filter_nonempty_list(master, ctype)
print(len(part))
part[['Index', 'Filename', ctype]].head(10)

28


Unnamed: 0,Index,Filename,Most Favored Nation
1,191,EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...,['In the event that Licensor grants to another...
7,338,IntegrityMediaInc_20010329_10-K405_EX-10.17_23...,"['If for any reason, Integrity and TL are subj..."
9,185,TomOnlineInc_20060501_20-F_EX-4.46_749700_EX-4...,"[""The Company will, and Online BVI will cause ..."
15,345,GentechHoldingsInc_20190808_1-A_EX1A-6 MAT CTR...,"[""Such Prices and Volume Discount Prices shall..."
94,403,TheglobeComInc_19990503_S-1A_EX-10.20_5416126_...,['All Users shall be treated at least as favor...
124,444,LiquidmetalTechnologiesInc_20200205_8-K_EX-10....,['Eutectix agrees that in the event any Licens...
130,27,ChinaRealEstateInformationCorp_20090929_F-1_EX...,['In the event E-House Research and Training I...
153,368,LejuHoldingsLtd_20140121_DRS (on F-1)_EX-10.26...,['In the event E-House Research and Training I...
158,489,PlayboyEnterprisesInc_20090220_10-QA_EX-10.2_4...,['The parties agree that Client will not be c...
213,319,GAINSCOINC_01_21_2010-EX-10.41-SPONSORSHIP AGR...,"[""The Sponsor acknowledges that Racing has arr..."


In [11]:
# Clause type (a column of `master`) to inspect. Edit this value to browse
# another type; others tried here were 'Non-Compete', 'Exclusivity' and
# 'No-Solicit Of Employees' (original note: "No-Solicit of Employees" -
# presumably an alternate spelling of the name, confirm).
ctype = 'No-Solicit Of Customers'
# Rows where at least one clause of this type was annotated.
part = filter_nonempty_list(master, ctype)
print(len(part))
part[['Index', 'Filename', ctype]].head(10)

34


Unnamed: 0,Index,Filename,No-Solicit Of Customers
15,345,GentechHoldingsInc_20190808_1-A_EX1A-6 MAT CTR...,"[""The Company shall not contact any of Distrib..."
87,428,InvendaCorp_20000828_S-1A_EX-10.2_2588206_EX-1...,"[""In the event that the Agreement is terminate..."
93,286,RandWorldwideInc_20010402_8-KA_EX-10.2_2102464...,"['During the Term of this Agreement, and for a..."
97,129,DigitalCinemaDestinationsCorp_20111220_S-1_EX-...,"[""During the Term and for a period of twelve (..."
100,484,SteelVaultCorp_20081224_10-K_EX-10.16_3074935_...,['Marketing Affiliate shall not directly or in...
103,165,UsioInc_20040428_SB-2_EX-10.11_1723988_EX-10.1...,"['Additionally, upon termination of this contr..."
123,52,LegacyEducationAllianceInc_20200330_10-K_EX-10...,"['Further, during the Term, except as otherwis..."
128,149,RitterPharmaceuticalsInc_20200313_S-4A_EX-10.5...,"['Sekisui shall not, and shall cause its subdi..."
156,13,PfHospitalityGroupInc_20150923_10-12G_EX-10.1_...,['Member covenants and agrees that during the ...
157,30,PfHospitalityGroupInc_20150923_10-12G_EX-10.1_...,"[""You may not enter into any relationship with..."
