In [4]:
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Contract clause labels. Order matters: classify_clause() indexes this list
# with the class id predicted by the model.
clauses = [
    "Services Provided",
    "Payment",
    "Term",
    "Confidentiality",
    "Termination",
    "Governing Law",
    "Signatures"
]

# Load pre-trained model and tokenizer.
# NOTE: loading 'bert-base-uncased' into BertForSequenceClassification emits a
# warning that classifier.weight / classifier.bias are newly initialised, so
# predictions from this model are not meaningful until it has been fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Size the classification head from the label list (instead of a hard-coded 7)
# so the two cannot drift apart when clauses are added or removed.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(clauses))

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any extra separator.
    """
    # Open through the context manager so the document handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

def classify_clause(text, clauses):
    """Predict which clause label best matches a piece of contract text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to classify (a single string).
        clauses: Sequence of label names; the predicted class index is used
            to index into it, so it must line up with the model's outputs.

    Returns:
        The element of ``clauses`` at the arg-max of the model's logits.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping so no graph is built and kept
    # alive for every classified line.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return clauses[predictions.item()]

def find_deviations(template_text, contract_text):
    """List the contract lines that do not appear verbatim in the template.

    Both texts are split on ``'\\n'`` and compared by exact string equality
    (no whitespace or case normalisation).

    Args:
        template_text: Full text of the reference template.
        contract_text: Full text of the contract under review.

    Returns:
        Contract lines, in their original order (duplicates kept), that match
        no template line.
    """
    # A set gives O(1) membership tests; the previous list lookup made the
    # comparison quadratic in document length.
    template_lines = set(template_text.split('\n'))
    return [line for line in contract_text.split('\n') if line not in template_lines]

def main(template_path='template.pdf', contract_path='contract.pdf'):
    """Classify every contract line and report lines that deviate from the template.

    Args:
        template_path: PDF holding the reference template. Defaults to
            ``'template.pdf'``.
        contract_path: PDF holding the contract to analyse. Defaults to
            ``'contract.pdf'``.

    Prints each non-blank contract line with its predicted clause label,
    followed by the contract lines that are absent from the template.
    """
    template_text = extract_text_from_pdf(template_path)
    contract_text = extract_text_from_pdf(contract_path)

    # Blank / whitespace-only lines carry nothing to classify, so skip them.
    entity_label_pairs = [
        (line, classify_clause(line, clauses))
        for line in contract_text.split('\n')
        if line.strip()
    ]

    deviations = find_deviations(template_text, contract_text)

    print("Classified Entities and Labels:")
    for entity, label in entity_label_pairs:
        print(f"Entity: {entity}, Label: {label}")

    print("\nDeviations from Template:")
    for deviation in deviations:
        print(deviation)

# Run the pipeline when this code is executed as a script. Inside a notebook
# kernel __name__ is also "__main__", so executing this cell runs main() at once.
if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Classified Entities and Labels:
Entity: BUSINESS CONTRACT, Label: Services Provided
Entity: This Business Contract ("Contract") is made and entered into as of May 30, 2024, by and between:, Label: Confidentiality
Entity: Party A:, Label: Confidentiality
Entity: Name: ABC Marketing Solutions, Label: Services Provided
Entity: Address: 123 Market St, Springfield, IL 62701, Label: Services Provided
Entity: Contact: (555) 123-4567, contact@abcmarketing.com, Label: Services Provided
Entity: Party B:, Label: Confidentiality
Entity: Name: XYZ Retailers Inc., Label: Services Provided
Entity: Address: 456 Commerce Blvd, Springfield, IL 62702, Label: Services Provided
Entity: Contact: (555) 987-6543, info@xyzretailers.com, Label: Services Provided
Entity: 1. Services Provided:, Label: Term
Entity: ABC Marketing Solutions agrees to provide the following services to XYZ Retailers Inc.:, Label: Services Provided
Entity: - Digital marketing strategy development, Label: Services Provided
Entity: - Soc

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
import torch
from torch.utils.data import Dataset, DataLoader

# Custom Dataset Class
class ContractDataset(Dataset):
    """Map-style dataset that tokenizes one contract text per item.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; invoked as ``tokenizer(text, **options)``
            and expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early with a clear message rather than with an IndexError in
        # the middle of a training epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its label for ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same options.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away
            # so each item is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Data: training and evaluation splits, read from CSV --------------------
train_df = pd.read_csv('train.csv')
eval_df = pd.read_csv('eval.csv')

# --- Tokenizer and a fresh classifier, one output per entry in `clauses` -----
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(clauses),
)

# --- Wrap each split in a ContractDataset (train first, then eval) -----------
train_dataset, eval_dataset = (
    ContractDataset(
        texts=frame.text.to_numpy(),
        labels=frame.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=128,
    )
    for frame in (train_df, eval_df)
)

# --- Training configuration --------------------------------------------------
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
)

# --- Fine-tune ---------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

# --- Persist the fine-tuned weights together with the tokenizer files --------
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')

# --- Reload both from disk for inference -------------------------------------
fine_tuned_model = BertForSequenceClassification.from_pretrained('./fine-tuned-bert')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [8]:
!pip install accelerate -U

