# Installs and Configuration

In [1]:
# Install packages, set configuration as needed
# gensim, wordcloud, python-Levenshtein


In [47]:
# Import packages
# general purpose use throughout
import numpy as np
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import matplotlib.colors as col
from importlib import reload
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex

# feature extraction/selection methods
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# model methods
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# custom files to keep notebook clean
import auxiliary as aux
import nlp_functions

import random


%matplotlib inline

# Global Variables

In [23]:
# Seed every random number generator used in the notebook so reruns are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only reaches processes launched afterwards, not this kernel -- confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Names of the columns holding the Yes/No answers, one per label category.
YN_ANS_COLS = [
    'Most Favored Nation-Answer',
    'Competitive Restriction Exception-Answer',
    'Non-Compete-Answer',
    'Exclusivity-Answer',
    'No-Solicit Of Customers-Answer',
    'No-Solicit Of Employees-Answer',
    'Non-Disparagement-Answer',
    'Termination For Convenience-Answer',
    'Rofr/Rofo/Rofn-Answer',
    'Change Of Control-Answer',
    'Anti-Assignment-Answer',
    'Revenue/Profit Sharing-Answer',
    'Price Restrictions-Answer',
    'Minimum Commitment-Answer',
    'Volume Restriction-Answer',
    'Ip Ownership Assignment-Answer',
    'Joint Ip Ownership-Answer',
    'License Grant-Answer',
    'Non-Transferable License-Answer',
    'Affiliate License-Licensor-Answer',
    'Affiliate License-Licensee-Answer',
    'Unlimited/All-You-Can-Eat-License-Answer',
    'Irrevocable Or Perpetual License-Answer',
    'Source Code Escrow-Answer',
    'Post-Termination Services-Answer',
    'Audit Rights-Answer',
    'Uncapped Liability-Answer',
    'Cap On Liability-Answer',
    'Liquidated Damages-Answer',
    'Insurance-Answer',
    'Covenant Not To Sue-Answer',
    'Third Party Beneficiary-Answer',
]

# Names of the matching contract-text columns: the answer column name without its
# trailing '-Answer', in the same order as YN_ANS_COLS.
YN_TXT_COLS = [name[:-len('-Answer')] for name in YN_ANS_COLS]

# Load and Arrange Data

CUAD_v1.zip was downloaded from https://zenodo.org/record/4595826#.YJp6hqhKiUk (checksum md5:c38f490a984420b8a62600db401fafd5) on 5 May 2021. The file of interest is master_clauses.csv, but I do not know of a way to access/download only this file.

In [3]:
# Placeholder strings that stand for blank/empty cells, collected by manually
# inspecting the csv; pandas reads every one of them as a null value.
# NOTE(review): 'â—' looks like a mis-decoded bullet character -- confirm that this
# entry really matches the text pandas sees when it reads the file.
na_values = {
    '[\'[â— ]\']',
    '[]',
    '[]/[]/[]',
    '[\'[*]\']',
    '[]/[]/[][]',
    '',
}

# Load the label spreadsheet; its first row supplies the column names.
df_master = pd.read_csv(
    "master_clauses.csv",
    header=0,
    na_values=na_values,
)


The first of 83 columns of the dataframe is the text filename from the corpus. Each of the 41 labels is represented by two columns: the extracted portion of contract text relevant to that label category, and the answer to the question posed by the category. For the first 8 categories and #38 Warranty Duration, the answers are relevant dates, durations, or names. The remaining 32 answers are Yes/No to indicate whether such a clause exists in the contract. The description of each of the 41 labels can be found at https://www.atticusprojectai.org/atticus-labels.

In [4]:
# display(df_master)
# df_master.head()
# df_master.info()
# df_master.columns
# df_master.shape

# Transforming Data

Next we will process, transform, and analyze the data.
For the first exploration, we will isolate the Yes/No answers and view the correlations between them.

## Transforming the Answer Columns into 0s and 1s

In [6]:
# Encode the Yes/No answer columns as 1/0, caching the result on disk.
ans_pkl = 'ans01.pkl'
if os.path.exists(ans_pkl):
    # NOTE: the cache is not refreshed when master_clauses.csv changes -- delete the
    # pickle to force a rebuild. Only unpickle files this notebook wrote itself.
    df_01_ans = pd.read_pickle(ans_pkl)
else:
    # 'Yes' -> 1; every other value (including 'No' and nulls) -> 0
    df_01_ans = (df_master[YN_ANS_COLS] == 'Yes').astype(int)
    df_01_ans.to_pickle(ans_pkl)

# Report the contract count from the data instead of hard-coding it.
print(f'Nonzeros (of {len(df_01_ans)} contracts) in each column')
print(df_01_ans.astype(bool).sum(axis=0))
# NOTE(review): correlation_views is a project helper; the displayed matrix shows -2.0
# on the diagonal, so diag presumably sets the diagonal fill value -- confirm in auxiliary.py.
corr = aux.correlation_views(df=df_01_ans, print_top=20, diag=-2)

Nonzeros (of 510 contracts) in each column
Most Favored Nation-Answer                   28
Competitive Restriction Exception-Answer     76
Non-Compete-Answer                          119
Exclusivity-Answer                          180
No-Solicit Of Customers-Answer               34
No-Solicit Of Employees-Answer               59
Non-Disparagement-Answer                     38
Termination For Convenience-Answer          183
Rofr/Rofo/Rofn-Answer                        85
Change Of Control-Answer                    121
Anti-Assignment-Answer                      374
Revenue/Profit Sharing-Answer               166
Price Restrictions-Answer                    15
Minimum Commitment-Answer                   165
Volume Restriction-Answer                    82
Ip Ownership Assignment-Answer              124
Joint Ip Ownership-Answer                    46
License Grant-Answer                        255
Non-Transferable License-Answer             138
Affiliate License-Licensor-Answer            

Unnamed: 0,Most Favored Nation-Answer,Competitive Restriction Exception-Answer,Non-Compete-Answer,Exclusivity-Answer,No-Solicit Of Customers-Answer,No-Solicit Of Employees-Answer,Non-Disparagement-Answer,Termination For Convenience-Answer,Rofr/Rofo/Rofn-Answer,Change Of Control-Answer,Anti-Assignment-Answer,Revenue/Profit Sharing-Answer,Price Restrictions-Answer,Minimum Commitment-Answer,Volume Restriction-Answer,Ip Ownership Assignment-Answer,Joint Ip Ownership-Answer,License Grant-Answer,Non-Transferable License-Answer,Affiliate License-Licensor-Answer,Affiliate License-Licensee-Answer,Unlimited/All-You-Can-Eat-License-Answer,Irrevocable Or Perpetual License-Answer,Source Code Escrow-Answer,Post-Termination Services-Answer,Audit Rights-Answer,Uncapped Liability-Answer,Cap On Liability-Answer,Liquidated Damages-Answer,Insurance-Answer,Covenant Not To Sue-Answer,Third Party Beneficiary-Answer
Most Favored Nation-Answer,-2.0,0.02,0.05,0.15,-0.03,-0.01,-0.0,0.05,0.05,0.05,0.13,0.07,0.01,0.13,0.15,0.06,0.1,0.09,0.05,0.07,0.07,0.1,0.05,-0.04,0.11,0.11,0.06,0.12,-0.01,0.05,0.03,0.01
Competitive Restriction Exception-Answer,0.02,-2.0,0.38,0.35,0.24,0.24,0.13,-0.03,0.27,0.22,0.19,0.19,0.02,0.19,0.13,0.19,0.04,0.23,0.18,0.07,0.07,-0.05,0.06,0.07,0.19,0.21,0.15,0.14,0.15,0.16,0.14,0.12
Non-Compete-Answer,0.05,0.38,-2.0,0.25,0.26,0.25,0.11,0.07,0.25,0.2,0.22,0.33,-0.04,0.14,0.14,0.23,0.18,0.27,0.25,0.08,0.06,0.0,0.17,0.06,0.29,0.25,0.16,0.15,0.14,0.15,0.24,0.11
Exclusivity-Answer,0.15,0.35,0.25,-2.0,0.05,0.02,0.13,-0.06,0.25,0.19,0.22,0.26,0.07,0.28,0.22,0.19,0.21,0.39,0.23,0.02,0.17,0.05,0.1,-0.02,0.28,0.27,0.11,0.18,0.07,0.23,0.23,-0.0
No-Solicit Of Customers-Answer,-0.03,0.24,0.26,0.05,-2.0,0.35,0.07,0.03,0.11,0.13,0.09,0.1,0.05,0.03,0.01,0.18,0.05,0.02,0.03,0.02,0.0,-0.05,-0.02,0.01,0.11,0.11,0.03,0.03,0.12,0.13,0.15,0.09
No-Solicit Of Employees-Answer,-0.01,0.24,0.25,0.02,0.35,-2.0,0.08,0.22,0.17,0.16,0.12,0.08,0.05,0.03,-0.04,0.24,-0.05,0.07,0.06,0.04,-0.02,-0.03,0.11,0.06,0.14,0.04,0.02,0.03,0.17,0.13,0.11,0.13
Non-Disparagement-Answer,-0.0,0.13,0.11,0.13,0.07,0.08,-2.0,-0.06,0.05,0.03,0.1,0.04,-0.05,0.09,0.04,0.14,0.01,0.15,0.13,-0.03,-0.01,-0.05,0.02,-0.05,0.07,0.02,-0.04,-0.02,0.03,0.14,0.22,0.02
Termination For Convenience-Answer,0.05,-0.03,0.07,-0.06,0.03,0.22,-0.06,-2.0,0.02,0.08,0.19,-0.01,-0.01,-0.02,-0.07,0.14,0.04,0.09,0.1,0.09,0.13,-0.07,0.18,0.14,0.13,0.13,0.09,0.24,0.1,0.19,-0.02,0.01
Rofr/Rofo/Rofn-Answer,0.05,0.27,0.25,0.25,0.11,0.17,0.05,0.02,-2.0,0.18,0.19,0.21,0.02,0.16,0.02,0.18,0.19,0.17,0.15,0.08,0.13,0.0,0.16,-0.01,0.21,0.2,0.11,0.13,0.06,0.23,0.11,0.1
Change Of Control-Answer,0.05,0.22,0.2,0.19,0.13,0.16,0.03,0.08,0.18,-2.0,0.32,0.18,0.07,0.07,0.02,0.3,0.15,0.26,0.26,0.15,0.17,-0.0,0.14,0.11,0.23,0.31,0.14,0.25,0.18,0.18,0.22,0.08


Top Absolute Correlations (excluding diagonals)
License Grant-Answer                      Non-Transferable License-Answer            0.591417
Uncapped Liability-Answer                 Cap On Liability-Answer                    0.487576
Affiliate License-Licensor-Answer         Affiliate License-Licensee-Answer          0.423597
Audit Rights-Answer                       Cap On Liability-Answer                    0.411370
Post-Termination Services-Answer          Audit Rights-Answer                        0.403331
Anti-Assignment-Answer                    License Grant-Answer                       0.399059
Exclusivity-Answer                        License Grant-Answer                       0.393893
Affiliate License-Licensee-Answer         Irrevocable Or Perpetual License-Answer    0.390179
Ip Ownership Assignment-Answer            Irrevocable Or Perpetual License-Answer    0.384941
Competitive Restriction Exception-Answer  Non-Compete-Answer                         0.381004
License Gran

Very few of the columns had significant correlation with each other. The highest correlation values suggest a moderate relationship between some of the columns.

## Transforming the Text Columns into One 'Sentence' per Observation

In [8]:
# Flatten the clause-text columns into one row per extracted clause.
# `data` maps 'sent' -> raw clause text and 'Category' -> the column it came from;
# rows are grouped by category, in YN_TXT_COLS order.
data = {'sent': [], 'Category': []}
# The loop variable is deliberately not called `col`: that name is the
# `matplotlib.colors` alias imported at the top of the notebook.
for category in YN_TXT_COLS:
    clauses = df_master[category].dropna()
    clauses = clauses[clauses != '']  # skip empty strings as well as nulls
    data['sent'].extend(clauses)
    data['Category'].extend([category] * len(clauses))
df_yn_tmp = pd.DataFrame(data=data, columns=['sent', 'Category'])
display(df_yn_tmp)

{'sent': [], 'Category': []}

Unnamed: 0,sent,Category
0,['In the event that Licensor grants to another...,Most Favored Nation
1,"['If for any reason, Integrity and TL are subj...",Most Favored Nation
2,"[""The Company will, and Online BVI will cause ...",Most Favored Nation
3,"[""Such Prices and Volume Discount Prices shall...",Most Favored Nation
4,['All Users shall be treated at least as favor...,Most Favored Nation
...,...,...
3606,['Member hereby acknowledges and agrees that F...,Third Party Beneficiary
3607,['Lessor and Lessee expressly agree that Franc...,Third Party Beneficiary
3608,"['Except as expressly provided in Section 8, t...",Third Party Beneficiary
3609,"['Changepoint, Inc. (""Changepoint"") shall be a...",Third Party Beneficiary


## Apply Text Preprocessing to Sentences
- Remove special characters, punctuation, non-ASCII characters
- Expand contractions
- Normalize to lowercase
- Tokenization
- Lemmatization

In [88]:
# Preprocessed sentences are cached in this pickle so the cleaning and tokenizing
# only has to run once.
token_file = 'sentences.pkl'
if not os.path.exists(token_file):
    # No cache yet: clean each raw clause, then tokenize the cleaned text.
    df_yn_tmp['Clean'] = df_yn_tmp['sent'].apply(nlp_functions.clean_up_text)
    df_yn_tmp['Sentence'] = df_yn_tmp['Clean'].apply(nlp_functions.tokenize)
    # Keep only the tokenized sentence and its category label.
    df_yn_txt = df_yn_tmp[['Sentence', 'Category']]
    df_yn_txt.to_pickle(token_file)
    print(f'{token_file} written.')
else:
    # Cache present: reuse the stored result.
    df_yn_txt = pd.read_pickle(token_file)

## Text forms of x and y
Will need to convert/reduce strings to numbers for classification algorithms

In [91]:
# Show the preprocessed frame, then split it into text-valued features and labels.
display(df_yn_txt)
# No `.squeeze()` here: selecting one column already returns a Series, and squeezing
# a one-element Series would collapse it to a bare scalar.
x_txt = df_yn_txt['Sentence']
y_txt = df_yn_txt['Category']

Unnamed: 0,Sentence,Category
0,"[event, licensor, grant, another, vod, pay, pe...",Most Favored Nation
1,"[reason, integrity, subject, lower, free, good...",Most Favored Nation
2,"[company, online, bvi, cause, company, use, co...",Most Favored Nation
3,"[price, volume, discount, price, shall, subjec...",Most Favored Nation
4,"[user, shall, treat, least, favorable, respect...",Most Favored Nation
...,...,...
3606,"[member, hereby, acknowledge, agree, franchiso...",Third Party Beneficiary
3607,"[lessor, lessee, expressly, agree, franchisor,...",Third Party Beneficiary
3608,"[except, expressly, provide, section, agreemen...",Third Party Beneficiary
3609,"[changepoint, inc, changepoint, shall, direct,...",Third Party Beneficiary


## Converting the y_txt to y_num

In [100]:
# Encode each category name as its position in YN_TXT_COLS.
category_codes = {name: code for code, name in enumerate(YN_TXT_COLS)}
y_num = y_txt.map(category_codes)
# A label missing from YN_TXT_COLS maps to NaN; fail loudly with ValueError instead.
if y_num.isna().any():
    unknown = sorted(set(y_txt[y_num.isna()]))
    raise ValueError(f'Categories not found in YN_TXT_COLS: {unknown}')
print(y_num)

0        0
1        0
2        0
3        0
4        0
        ..
3606    31
3607    31
3608    31
3609    31
3610    31
Name: Category, Length: 3611, dtype: int64


## Converting the x_txt to word counts

Here we investigate different methods of feature extraction and word embedding for use in classification algorithms.

In [152]:
# Build word-count vocabularies from the tokenized sentences (x_txt) and their numeric
# class labels (y_num) with the project helper in auxiliary.py.
# NOTE(review): judging by how the results are printed in the next cell, total_vocab maps
# a word to an array of per-class counts and class_vocab[k] is a Counter-like tally for
# class k -- confirm against auxiliary.py.
total_vocab, class_vocab = aux.build_vocab_word_counts(x_txt, y_num)

In [156]:
# Summarise the vocabularies: overall size, class-0 size, the entries stored for the
# word 'event', and the most frequent class-0 words.
print(f"There are {len(total_vocab)} unique tokenized words "
      f"from {len(x_txt)} sentences in the x_txt vocabulary")
print(f"There are {len(class_vocab[0])} unique tokenized words for class 0 vocabulary")
print(f"Array from total_vocab['event'] : {total_vocab['event']}")
print(f"Count from class_vocab[0]['event'] : {class_vocab[0]['event']}")
print("The 10 most common words and counts in class 0 are: ")
print(class_vocab[0].most_common(10))

There are 6249 unique tokenized words from 3611 sentences in the x_txt vocabulary
There are 476 unique tokenized words for class 0 vocabulary
Array from total_vocab['event'] : [  9.  30.  21.  27.   1.   6.   5.   7.  60.  88.  47.  35.   2.  80.
  15.  16.   7.  39.  16.   2.   3.   0.  10.  18.  98.  21.  81. 267.
  40.  30.   9.   2.]
Count from class_vocab[0]['event'] : 9
The 10 most common words in class 0 are: 
[('shall', 46), ('agreement', 45), ('terms', 36), ('provide', 26), ('product', 26), ('party', 25), ('license', 23), ('price', 23), ('fee', 16), ('favorable', 16)]


In [151]:
# Development aid: run this after editing auxiliary.py so the already-imported `aux`
# module picks up the changes without restarting the kernel.
reload(aux)

<module 'auxiliary' from 'C:\\Users\\raind\\WORK\\AFIT\\21SP\\CSCE623\\Project\\auxiliary.py'>

## Splitting the data into train and test sets before further processing

In [None]:
# Hold out a test set before any further feature processing.
# Call train_test_split: binding the bare function to four names fails on unpacking.
# random_state=SEED makes the split repeatable; stratify=y_num keeps the class
# proportions the same in both splits.
# NOTE(review): test_size=0.2 is a conventional choice, not taken from the notebook -- adjust as needed.
X_train, X_test, y_train, y_test = train_test_split(
    x_txt, y_num, test_size=0.2, random_state=SEED, stratify=y_num
)

### Method 2: Using CountVectorizer

In [98]:
# Bag-of-words features built from the unprocessed clause strings in data['sent']
# (not from the tokenized x_txt).
vectorizer = CountVectorizer(
    max_features=2500,  # keep at most the 2500 most frequent terms
    min_df=5,           # ignore terms that appear in fewer than 5 clauses
    max_df=0.7,         # ignore terms that appear in more than 70% of clauses
)
x_count = vectorizer.fit_transform(data['sent']).toarray()

In [95]:
# Vocabulary column indices that are non-zero for the first clause ...
display(x_count[0].nonzero())
# ... and how many such columns there are.
display(x_count[0].nonzero()[0].size)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

(array([  35,  150,  151,  170,  182,  191,  194,  201,  216,  225,  231,
         270,  282,  294,  301,  302,  363,  369,  394,  414,  549,  585,
         633,  747,  750,  777,  782,  783,  793,  844,  845,  848,  879,
         914,  955,  958,  986, 1060, 1064, 1080, 1093, 1095, 1138, 1139,
        1146, 1162, 1166, 1230, 1259, 1296, 1324, 1325, 1330, 1339, 1365,
        1404, 1417, 1467, 1520, 1530, 1541, 1551, 1569, 1577, 1606, 1652,
        1659, 1664, 1670, 1679, 1741, 1789, 1791, 1818, 1819, 1820, 2003,
        2009, 2013, 2066, 2069, 2080, 2154, 2200, 2205, 2208, 2220, 2254,
        2259, 2261, 2265, 2266, 2272, 2280, 2281, 2288, 2290, 2291, 2410,
        2418, 2446, 2452, 2453, 2468, 2472, 2487, 2492], dtype=int64),)

107

# Classification Methods

## Supervised Methods

### Random Forest Classifier

In [35]:
# aux.randomforest_demo(sentences=x_txt, y=y_txt, ynames=YN_TXT_COLS, seed=SEED)

classifier classes


array(['Affiliate License-Licensee', 'Affiliate License-Licensor',
       'Anti-Assignment', 'Audit Rights', 'Cap On Liability',
       'Change Of Control', 'Competitive Restriction Exception',
       'Covenant Not To Sue', 'Exclusivity', 'Insurance',
       'Ip Ownership Assignment', 'Irrevocable Or Perpetual License',
       'Joint Ip Ownership', 'License Grant', 'Liquidated Damages',
       'Minimum Commitment', 'Most Favored Nation',
       'No-Solicit Of Customers', 'No-Solicit Of Employees',
       'Non-Compete', 'Non-Disparagement', 'Non-Transferable License',
       'Post-Termination Services', 'Price Restrictions',
       'Revenue/Profit Sharing', 'Rofr/Rofo/Rofn', 'Source Code Escrow',
       'Termination For Convenience', 'Third Party Beneficiary',
       'Uncapped Liability', 'Unlimited/All-You-Can-Eat-License',
       'Volume Restriction'], dtype=object)

[[ 1  1  1 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0 84 ...  0  0  0]
 ...
 [ 0  0  0 ...  3  0  0]
 [ 0  0  0 ...  0  0  3]
 [ 0  0  1 ...  0  0  6]]
                                   precision    recall  f1-score   support

       Affiliate License-Licensee       1.00      0.06      0.12        16
       Affiliate License-Licensor       0.00      0.00      0.00         2
                  Anti-Assignment       0.79      0.97      0.87        87
                     Audit Rights       0.84      0.97      0.90        38
                 Cap On Liability       0.64      0.78      0.70        50
                Change Of Control       0.67      0.43      0.53        23
Competitive Restriction Exception       0.71      0.26      0.38        19
              Covenant Not To Sue       1.00      0.86      0.92        21
                      Exclusivity       0.44      0.33      0.37        43
                        Insurance       0.88      1.00      0.93        35
          Ip Owner

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier(n_estimators=1000,
                       random_state=RandomState(MT19937) at 0x29E22369040)

## Unsupervised Methods

### Word2Vec

In [38]:
# NOTE: random.seed() returns None, so pass the integer SEED itself as the seed.
# aux.word2vec_demo(names=y_txt, sentences=x_txt, vector_size=100, workers=1, seed=SEED)

Rows of vectorized array: 3611, Length of first vector: 100
For n_clusters = 50
Silhouette coefficient: 0.02
Inertia:40.243440395216815
Silhouette values:
    Cluster 19: Size:4 | Avg:0.37 | Min:0.27 | Max: 0.45
    Cluster 41: Size:38 | Avg:0.11 | Min:0.00 | Max: 0.23
    Cluster 48: Size:113 | Avg:0.09 | Min:0.01 | Max: 0.22
    Cluster 20: Size:93 | Avg:0.09 | Min:-0.03 | Max: 0.23
    Cluster 28: Size:116 | Avg:0.08 | Min:-0.01 | Max: 0.22
    Cluster 46: Size:134 | Avg:0.08 | Min:-0.05 | Max: 0.19
    Cluster 47: Size:103 | Avg:0.07 | Min:-0.02 | Max: 0.19
    Cluster 23: Size:48 | Avg:0.07 | Min:-0.09 | Max: 0.21
    Cluster 44: Size:47 | Avg:0.07 | Min:-0.04 | Max: 0.19
    Cluster 8: Size:121 | Avg:0.06 | Min:-0.01 | Max: 0.19
    Cluster 1: Size:142 | Avg:0.06 | Min:-0.02 | Max: 0.19
    Cluster 29: Size:110 | Avg:0.05 | Min:-0.03 | Max: 0.16
    Cluster 13: Size:72 | Avg:0.04 | Min:-0.08 | Max: 0.19
    Cluster 45: Size:61 | Avg:0.04 | Min:-0.06 | Max: 0.15
    Cluster 43: Si

Unnamed: 0,text,tokens,cluster
0,Most Favored Nation,"['event', 'licensor', 'grant', 'another', 'vod...",15
1,Most Favored Nation,"['reason', 'integrity', 'subject', 'lower', 'f...",40
2,Most Favored Nation,"['company', 'online', 'bvi', 'cause', 'company...",14
3,Most Favored Nation,"['price', 'volume', 'discount', 'price', 'shal...",25
4,Most Favored Nation,"['user', 'shall', 'treat', 'least', 'favorable...",32
...,...,...,...
3606,Third Party Beneficiary,"['member', 'hereby', 'acknowledge', 'agree', '...",34
3607,Third Party Beneficiary,"['lessor', 'lessee', 'expressly', 'agree', 'fr...",40
3608,Third Party Beneficiary,"['except', 'expressly', 'provide', 'section', ...",15
3609,Third Party Beneficiary,"['changepoint', 'inc', 'changepoint', 'shall',...",38


Most representative terms per cluster (based on centroids):
Cluster 0: ,   ' [ i 
Cluster 1: ,   ' i [ 
Cluster 2: ,   ' [ i 
Cluster 3: ,   ' [ i 
Cluster 4: ,   ' [ i 
Cluster 5: ,   ' i [ 
Cluster 6: ,   ' i [ 
Cluster 7: ,   ' [ i 
Cluster 8: ,   ' i [ 
Cluster 9: ,   ' i [ 
Cluster 10: ,   ' [ i 
Cluster 11: ,   ' i [ 
Cluster 12: ,   ' i [ 
Cluster 13: ,   ' [ i 
Cluster 14: ,   ' [ i 
Cluster 15: ,   ' [ i 
Cluster 16: ,   ' [ i 
Cluster 17: ,   ' i [ 
Cluster 18: ,   ' i [ 
Cluster 19: ,   ' [ i 
Cluster 20: ,   ' i [ 
Cluster 21: ,   i [ ' 
Cluster 22: ,   ' i [ 
Cluster 23: ,   ' [ i 
Cluster 24: ,   ' [ i 
Cluster 25: ,   ' i [ 
Cluster 26: ,   ' [ i 
Cluster 27: ,   ' i [ 
Cluster 28: ,   ' i [ 
Cluster 29: ,   ' i [ 
Cluster 30: ,   ' i [ 
Cluster 31: ,   ' [ ] 
Cluster 32: ,   ' i [ 
Cluster 33: ,   ' [ i 
Cluster 34: ,   ' [ i 
Cluster 35: ,   ' [ i 
Cluster 36: ,   ' i [ 
Cluster 37: ,   ' [ i 
Cluster 38: ,   ' [ ] 
Cluster 39: ,   ' [ i 
Cluster 40: ,   ' i [ 
Cluster

Unnamed: 0,text,tokens,cluster
0,Most Favored Nation,"['event', 'licensor', 'grant', 'another', 'vod...",15
1,Most Favored Nation,"['reason', 'integrity', 'subject', 'lower', 'f...",40
2,Most Favored Nation,"['company', 'online', 'bvi', 'cause', 'company...",14
3,Most Favored Nation,"['price', 'volume', 'discount', 'price', 'shal...",25
4,Most Favored Nation,"['user', 'shall', 'treat', 'least', 'favorable...",32
...,...,...,...
3606,Third Party Beneficiary,"['member', 'hereby', 'acknowledge', 'agree', '...",34
3607,Third Party Beneficiary,"['lessor', 'lessee', 'expressly', 'agree', 'fr...",40
3608,Third Party Beneficiary,"['except', 'expressly', 'provide', 'section', ...",15
3609,Third Party Beneficiary,"['changepoint', 'inc', 'changepoint', 'shall',...",38
