# Step 2: Dataset Generation

In [8]:
import pandas as pd
from faker import Faker

In [16]:
# Lookup table from integer label id to human-readable clause category.
# Ids are assigned by position, starting at 0.
classes = dict(enumerate((
    "Services Provided",
    "Payment",
    "Term",
    "Confidentiality",
    "Termination",
    "Governing Law",
    "Signatures",
)))

In [12]:
fake: Faker = Faker()

# Creating Contract Text using Faker
def generate_contract_text() -> pd.DataFrame:
    """Build one synthetic contract as a table of labelled clauses.

    Random party names, an amount, two dates, a state and a notice period are
    drawn from the module-level ``fake`` generator and substituted into fixed
    clause templates.

    Returns:
        pd.DataFrame: eight rows and two columns with default integer names
        (0 and 1). Column 0 holds the clause text, column 1 the integer class
        id (0-6). Class 6 appears twice: one row per party name.
    """
    # generate random details
    service_provider_name = fake.company()
    client_name = fake.company()
    amount = fake.random_number(digits=5)
    # Two independent draws can come out in either order; sort them so the
    # contract never ends before it starts.
    start_date, end_date = sorted((fake.date_this_year(), fake.date_this_year()))
    state = fake.state()
    # Used both as the payment deadline and as the termination notice period.
    notice_days = fake.random_int(min=30, max=90)

    # Contract text template: one [text, class id] pair per clause
    rows = [
        [f"{service_provider_name} agrees to provide the following services to {client_name}. services are service1 service2, service3.", 0],
        [f"{client_name} agrees to pay {service_provider_name} the amount of ${amount} for the services described above. Payment shall be made within {notice_days} days of receiving an invoice from {service_provider_name}.", 1],
        [f"This contract will commence on {start_date} and will continue until {end_date} unless terminated earlier in accordance with the Termination clause.", 2],
        # No placeholders in this clause, so a plain string is enough.
        ["Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.", 3],
        [f"Either party may terminate this contract with {notice_days} days written notice to the other party. In the event of termination, {service_provider_name} will be compensated for all services performed up to the date of termination.", 4],
        [f"This contract shall be governed by and construed in accordance with the laws of the State of {state}.", 5],
        [f"{service_provider_name}", 6],
        [f"{client_name}", 6]
    ]

    return pd.DataFrame(rows)

In [14]:
# Smoke test: build a single synthetic contract and keep the frame for inspection.
df = generate_contract_text()

In [15]:
# Second call to the generator; the result is kept separately as df1.
df1 = generate_contract_text()

In [30]:
# Generate several synthetic contracts and stack them into one labelled table.
N_CONTRACTS = 10  # number of contracts to generate

all_df = [generate_contract_text() for _ in range(N_CONTRACTS)]
# ignore_index=True: each per-contract frame carries its own default index, so
# a plain concat would leave duplicate index labels in df_concat.
df_concat = pd.concat(all_df, ignore_index=True)
df_concat.columns = ["features", "label"]
# index=False keeps the row index out of the CSV.
# NOTE(review): this writes "test.csv", while the fine-tuning step reads
# "data/train.csv" — confirm how that file is produced.
df_concat.to_csv("test.csv", index=False)

# Step 3: Fine-Tune the BERT Model to Classify Clauses

In [17]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Read the dataset
# NOTE(review): the generation step writes "test.csv"; how "data/train.csv"
# was produced is not shown in this notebook — confirm.
df = pd.read_csv("data/train.csv")
# Preview the first rows.
df.head()

Unnamed: 0,features,label
0,Lozano-Ellis agrees to provide the following s...,0
1,"Day, Morrison and Vega agrees to pay Lozano-El...",1
2,This contract will commence on 2024-06-08 and ...,2
3,Both parties agree to maintain the confidentia...,3
4,Either party may terminate this contract with ...,4


In [2]:
df.shape

(400, 2)

In [3]:
# Hold out 20% of the rows for validation. The split is stratified on the
# label column and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
# Peek at the training part
train_df.head()

Unnamed: 0,features,label
331,Both parties agree to maintain the confidentia...,3
212,Either party may terminate this contract with ...,4
301,This contract shall be governed by and constru...,5
235,Both parties agree to maintain the confidentia...,3
19,Both parties agree to maintain the confidentia...,3


In [4]:
train_df['label'].value_counts()

label
6    80
3    40
4    40
5    40
1    40
0    40
2    40
Name: count, dtype: int64

In [5]:
val_df['label'].value_counts()

label
6    20
2    10
4    10
3    10
1    10
5    10
0    10
Name: count, dtype: int64

### Tokenize the Text

In [6]:
from transformers import BertTokenizer

# Load Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the input text
def tokenize_function(texts, max_length=128):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Args:
        texts: a single string, or a collection of strings. Objects exposing
            ``.tolist()`` (for example NumPy arrays or pandas Series) are
            converted with it; any other iterable is materialised with
            ``list()``.
        max_length: token length beyond which sequences are truncated.
            Defaults to 128.

    Returns:
        Whatever ``tokenizer`` returns for the batch. It is called with
        ``padding=True``, ``truncation=True`` and ``return_tensors='pt'``.
    """
    if isinstance(texts, str):
        # A bare string must become a one-item batch; list() would split it
        # into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

In [7]:
# Pull the raw clause strings out of each split.
train_texts = train_df['features'].values
val_texts = val_df['features'].values

# Tokenize each split with its own call to tokenize_function.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

In [8]:
train_encodings['input_ids'][0]

tensor([  101,  2119,  4243,  5993,  2000,  5441,  1996, 18777,  3012,  1997,
         2151, 16350,  2030, 18777,  2592, 21362,  2076,  1996,  2744,  1997,
         2023,  3206,  1012,  2023, 14987,  2097,  3613,  3458,  1996, 18287,
         1997,  2023,  3206,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class ContractDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer clause labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable batch, one row per sample
        # labels: indexable sequence of class ids, one per sample
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx``: each encoding field at that row plus a 'label' tensor."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = rows[idx]
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

# Integer class ids for each split, taken from the same frames as the texts.
train_labels = train_df['label'].values
val_labels = val_df['label'].values

# Wrap encodings and labels so each split can be indexed sample by sample.
train_dataset = ContractDataset(train_encodings, train_labels)
val_dataset = ContractDataset(val_encodings, val_labels)

In [10]:
# Example Dataset
# Pull the first sample through the iterator protocol to see what one item looks like.
example = iter(train_dataset)
example_item = next(example)
example_item

{'input_ids': tensor([  101,  2119,  4243,  5993,  2000,  5441,  1996, 18777,  3012,  1997,
          2151, 16350,  2030, 18777,  2592, 21362,  2076,  1996,  2744,  1997,
          2023,  3206,  1012,  2023, 14987,  2097,  3613,  3458,  1996, 18287,
          1997,  2023,  3206,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'label': tensor(3)}

### Fine-Tune the BERT Model 

In [11]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler, AdamW

# Load the pre-trained BERT model with a sequence-classification head
# num_labels=7: one output per label id (the dataset uses ids 0-6).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

# Training batches of 8, shuffled.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True
)
# Validation batches of 8, in fixed order (no shuffling).
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
# StepLR with step_size=1 and gamma=0.1 multiplies the learning rate by 0.1 on
# every scheduler.step() call.
# NOTE(review): the training loop steps the scheduler once per epoch, so the
# rate goes 5e-5 -> 5e-6 -> 5e-7 -> ... and is 100x smaller by the third
# epoch. The logged training loss barely moves after epoch 2 — confirm this
# aggressive decay is intended.
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [13]:
# Pick the compute device (CUDA when available, otherwise CPU) and move the model onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# Training Loop
# Each epoch makes one full pass over train_loader, then steps the LR scheduler
# and prints the mean batch loss for that epoch.
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0  # running sum of per-batch losses for this epoch
    for batch in train_loader:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # Passing labels makes the model return the loss along with its outputs.
        # The batch's token_type_ids are not passed to the model.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # The scheduler is stepped once per epoch, not once per batch.
    scheduler.step()
    avg_train_loss = total_loss/len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")
        

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Loss: 1.0072647355496884
Epoch 2, Loss: 0.21494141407310963
Epoch 3, Loss: 0.17988903261721134
Epoch 4, Loss: 0.17756499648094176
Epoch 5, Loss: 0.17544155176728965


In [20]:
# Validation Loop
# Count how many validation samples get their label predicted exactly.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # Index of the largest logit in each row is the predicted class id.
        predicted = output.logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Validation Accuracy:{accuracy:.4f}")
    
    

Validation Accuracy:1.0000


In [21]:
# Save the fine-tuned model and tokenizer
# Both go into the same directory so they can be reloaded from one path.
model_save_path = './fine_tuned_bert'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to ./fine_tuned_bert


# Step 4: Use the Fine-Tuned BERT Model for Clause Prediction

In [1]:
# Import Dependencies 
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [2]:
# Load the fine-tuned model and its tokenizer
# The path is the directory the fine-tuning step saved both of them to.
model_load_path = "./fine_tuned_bert"
model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(model_load_path)

In [3]:
# Pick the compute device (CUDA when available, otherwise CPU) and move the model onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
def predict(text):
    """Return the predicted class id (a Python int) for one piece of text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.
    """
    # Tokenize the input text
    encoded = tokenizer(text, truncation=True, max_length=128, return_tensors='pt', padding=True)

    # Move every encoded tensor to the device the model lives on
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    # Perform inference without tracking gradients
    model.eval()
    with torch.no_grad():
        logits = model(**batch).logits

    # The index of the largest logit is the predicted class id.
    return torch.argmax(logits, dim=1).item()

In [15]:

# Example text for inference: a "services provided" style sentence
example_text = "Cole LLC agrees to provide the following services to Hines, Munoz and Dennis. services are service1 service2, service3."
predicted_class = predict(example_text)
# Prints the numeric class id returned by predict().
print(f"Predicted class: {predicted_class}")

Predicted class: 0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [53]:
# Draw one random row from df and show its text and its label id.
random_sample = df.sample(1)
print(random_sample['features'].iloc[0])
print(random_sample['label'].iloc[0])

Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.
3


In [47]:
# Spot-check: compare the true and predicted category for five random rows.
# Relies on `df` and the `classes` id -> name mapping from earlier cells.
# NOTE(review): presumably `df` is the full frame read from data/train.csv,
# which includes the rows the model was trained on — confirm; sampling from
# val_df would give an unseen-data check.
for i in range(5):
    random_sample = df.sample(1)
    example_text_2 = random_sample['features'].iloc[0]
    example_class_2 = random_sample['label'].iloc[0]
    predicted_class = predict(example_text_2)
    print(f"Actual :{classes[example_class_2]}\nPredicted : {classes[predicted_class]}\n\n")
    

Actual :Governing Law
Predicted : Governing Law


Actual :Governing Law
Predicted : Governing Law


Actual :Termination
Predicted : Termination


Actual :Term
Predicted : Term


Actual :Signatures
Predicted : Signatures




In [11]:
# Scratch example: a dict with keys "a" and "b", each mapping to a list of ints.
k = {"a":[2, 243, 2],"b":[43, 23, 1]}

In [12]:
def summa(a, b):
    """Return the two arguments unchanged, packed as the tuple ``(a, b)``."""
    pair = (a, b)
    return pair

In [13]:
summa(**k)

([2, 243, 2], [43, 23, 1])

In [None]:
import fitz  # PyMuPDF
import re
from transformers import pipeline

# Load the document
# NOTE(review): 'pdf' looks like a placeholder — point this at the real PDF file.
document_path = 'pdf'
doc = fitz.open(document_path)

# Extract text from PDF
def extract_text(doc):
    """Concatenate the text of every page of ``doc``, in page order.

    ``doc`` must support ``len()`` and ``load_page(index)``; each page must
    provide ``get_text()``. Returns an empty string for a document with no pages.
    """
    page_texts = (doc.load_page(index).get_text() for index in range(len(doc)))
    return "".join(page_texts)

# Parse the text into clauses and sub-clauses
def parse_clauses(text):
    """Split contract text into numbered clauses and lettered sub-clauses.

    Clauses are separated on numbered headings ("1. ", "2. ", ...); anything
    before the first heading is discarded. Each clause is then split on
    lettered markers ("a) ", "b) ", ...).

    Args:
        text: the full contract text.

    Returns:
        dict: ``"Clause <n>"`` (numbered from 1 in order of appearance) mapped
        to a list of sub-clause strings. Fragments are stripped of surrounding
        whitespace, and empty fragments are dropped so no blank text is handed
        on for classification. A clause with no remaining text maps to ``[]``.
    """
    # NOTE(review): this pattern matches any "<digits>. " in the text, so a
    # sentence that ends in a number is also treated as a heading.
    clause_bodies = re.split(r'\b\d+\.\s', text)[1:]  # Splitting by numbered headings
    parsed_clauses = {}
    for i, clause in enumerate(clause_bodies, start=1):
        fragments = re.split(r'\b[a-z]\)\s', clause)  # Splitting sub-clauses
        # Splitting leaves an empty piece before a leading "a) " marker; drop
        # those and trim the rest.
        parsed_clauses[f'Clause {i}'] = [part.strip() for part in fragments if part.strip()]
    return parsed_clauses

# Classify the content using a pre-trained text classification model
def classify_clauses(parsed_clauses, classifier=None, labels=None):
    """Assign a category label to every sub-clause.

    Args:
        parsed_clauses: dict mapping a clause name to a list of sub-clause
            strings.
        classifier: optional callable invoked as
            ``classifier(text, candidate_labels=labels)`` and returning a
            mapping with a ``'labels'`` list. When omitted, a zero-shot
            classification pipeline with ``facebook/bart-large-mnli`` is
            created. Pass an existing one to avoid loading the model again on
            every call.
        labels: optional list of candidate category names. Defaults to the
            seven contract clause categories.

    Returns:
        dict: the same clause names, each mapped to a list of
        ``(sub_clause, label)`` tuples in input order, where ``label`` is the
        first entry of the classifier's ``'labels'`` list.
    """
    if classifier is None:
        classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    if labels is None:
        labels = ["Services Provided", "Payment", "Term", "Confidentiality", "Termination", "Governing Law", "Signatures"]

    classified_clauses = {}
    for clause, sub_clauses in parsed_clauses.items():
        classified_clauses[clause] = [
            (sub_clause, classifier(sub_clause, candidate_labels=labels)['labels'][0])
            for sub_clause in sub_clauses
        ]
    return classified_clauses

# Main script execution
# Pipeline: PDF -> raw text -> clauses and sub-clauses -> (sub-clause, label) pairs.
text = extract_text(doc)
parsed_clauses = parse_clauses(text)
classified_clauses = classify_clauses(parsed_clauses)

# Print the parsed and classified clauses
# One header line per clause, then one indented "label: text" line per sub-clause.
for clause, sub_clauses in classified_clauses.items():
    print(f"{clause}:")
    for sub_clause, classification in sub_clauses:
        print(f"  {classification}: {sub_clause}")

