# Business Contract Validation

In [None]:
#Install Dependencies and download spacy model
!pip install torch torchvision torchaudio
!pip install pdfplumber spacy difflib
!python -m spacy download en_core_web_sm

In [1]:
#Import Dependencies
import pdfplumber
import spacy
from difflib import unified_diff
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# spaCy pipeline whose entity recognizer is used by perform_ner()
nlp = spacy.load(name='en_core_web_sm')

In [3]:
# Custom dataset for PyTorch
class ContractDataset(Dataset):
    """Map-style dataset pairing text snippets with integer class labels.

    Each example is tokenized lazily on access and padded or truncated to
    ``max_length`` tokens.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Callable tokenizer (invoked the same way as in
            ``classify_text_sections``) returning a mapping that contains
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Token length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize and return the example at index ``item``.

        Returns:
            dict: ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (flattened 1-D tensors) and ``'labels'``
            (a ``torch.long`` tensor built from the label).
        """
        text = self.texts[item]
        label = self.labels[item]
        # Call the tokenizer directly rather than through the deprecated
        # ``encode_plus`` method; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# Load the pre-trained BERT tokenizer and sequence-classification model (dummy).
# NOTE: the classification head is newly initialized for this checkpoint (see the
# warning printed below), so predictions are placeholders until it is fine-tuned.
BERT_CHECKPOINT = 'bert-base-uncased'  # one name shared by tokenizer and model so they cannot drift apart
NUM_LABELS = 2
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=NUM_LABELS)
# The model is only used for inference in this notebook; keep dropout disabled explicitly.
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Example Classification Model
class SimpleClassifier(nn.Module):
    """Single linear layer mapping feature vectors to class logits.

    Args:
        in_features: Size of each input feature vector. Defaults to 768
            (presumably the hidden size of the BERT checkpoint used in this
            notebook -- confirm before wiring the two together).
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, in_features=768, num_classes=2):
        super().__init__()
        self.linear = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the resulting logits."""
        return self.linear(x)


# Default-sized instance (768 -> 2), same as before the sizes were parameterized.
classifier = SimpleClassifier()

In [6]:
# Example function to parse and extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The page texts in page order, joined with newlines. A page whose
        ``extract_text()`` result is ``None`` or empty contributes an empty
        string; a PDF with no pages yields ``''``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page without extractable
        # text (e.g. image-only pages on some pdfplumber releases); `or ''`
        # keeps such a page from raising TypeError during concatenation.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next, which would corrupt the line-based diff downstream.
    return '\n'.join(page_texts)

In [7]:
# Function to classify text sections (dummy implementation)
def classify_text_sections(text):
    """Predict a class index for ``text`` with the module-level BERT model.

    The text is tokenized with the module-level ``tokenizer`` and padded or
    truncated to 512 tokens, so anything beyond that length is ignored.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the largest logit produced by ``model``.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    # Inference only: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted = torch.max(outputs.logits, dim=1).indices
    return predicted.item()

In [8]:
# Function to perform NER (Named Entity Recognition)
def perform_ner(text):
    """Run the module-level spaCy pipeline on ``text`` and list its entities.

    Args:
        text: The string to analyse.

    Returns:
        list[tuple[str, str]]: ``(entity_text, entity_label)`` pairs, in the
        order they occur in the parsed document's ``ents``.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

In [9]:
# Function to compare texts and highlight differences
def compare_texts(text1, text2):
    """Return a unified diff of two texts, compared line by line.

    Args:
        text1: The "from" side of the diff.
        text2: The "to" side of the diff.

    Returns:
        str: The unified-diff lines joined with newlines; an empty string when
        both texts split into identical lines.
    """
    from_lines = text1.splitlines()
    to_lines = text2.splitlines()
    # lineterm='' keeps the header/hunk lines free of trailing newlines so the
    # join below is the only place line breaks are introduced.
    diff_lines = unified_diff(from_lines, to_lines, lineterm='')
    return '\n'.join(diff_lines)

In [10]:
# Main function
def main(pdf_path, template_path):
    """Analyse a contract PDF against a template PDF and print the results.

    Extracts the text of both PDFs, then prints, in order: the class index
    predicted for the whole contract text, the named entities found in the
    contract, and the unified diff from the contract text to the template text.

    Args:
        pdf_path: Path of the contract PDF.
        template_path: Path of the template PDF.
    """
    contract_text, template_text = [
        extract_text_from_pdf(path) for path in (pdf_path, template_path)
    ]

    # The whole contract is classified as a single section (example usage).
    print("Classification Result:", classify_text_sections(contract_text))

    # Named entities found in the contract text.
    print("Named Entities:", perform_ner(contract_text))

    # Line-by-line differences between the contract and the template.
    print("Differences:\n", compare_texts(contract_text, template_text))

    


In [12]:

if __name__ == "__main__":
    # Contract under review and the template PDF it is compared against.
    contract_pdf_path, template_pdf_path = "contract.pdf", "template.pdf"

    main(pdf_path=contract_pdf_path, template_path=template_pdf_path)

Classification Result: 0
Named Entities: [('This Business Contract', 'ORG'), ('May 30, 2024', 'DATE'), ('ABC Marketing Solutions\nAddress', 'ORG'), ('123', 'CARDINAL'), ('Springfield', 'GPE'), ('IL 62701\nContact', 'ORG'), ('555', 'CARDINAL'), ('123-4567', 'CARDINAL'), ('XYZ Retailers Inc.', 'ORG'), ('456', 'CARDINAL'), ('Springfield', 'GPE'), ('IL 62702', 'ORG'), ('555', 'CARDINAL'), ('987-6543', 'CARDINAL'), ('1', 'CARDINAL'), ('ABC Marketing Solutions', 'ORG'), ('XYZ Retailers Inc.', 'ORG'), ('Digital', 'ORG'), ('2', 'CARDINAL'), ('XYZ Retailers Inc.', 'ORG'), ('ABC Marketing Solutions', 'ORG'), ('10,000', 'MONEY'), ('50%', 'PERCENT'), ('50%', 'PERCENT'), ('June 1, 2024', 'DATE'), ('December 31, 2024', 'DATE'), ('first', 'ORDINAL'), ('4', 'CARDINAL'), ('third', 'ORDINAL'), ('5', 'CARDINAL'), ("30 days'", 'DATE'), ('ABC Marketing Solutions', 'ORG'), ('6', 'CARDINAL'), ('the State of Illinois', 'GPE'), ('7', 'CARDINAL'), ('John Smith', 'PERSON'), ('May 30, 2024', 'DATE'), ('Jane Doe\n