# In [None]:
import fitz  # PyMuPDF
import re
from transformers import pipeline

# Load the document
# NOTE(review): 'pdf' looks like a placeholder rather than a real file
# path — confirm the intended input location before running.
document_path = 'pdf'
# Open the file with PyMuPDF. The resulting handle is kept in the module
# global `doc`; no close() call appears in the visible code.
doc = fitz.open(document_path)

# Extract text from PDF
def extract_text(doc):
    """Return the text of every page in *doc*, concatenated in page order.

    Args:
        doc: An open PyMuPDF document (or any iterable yielding page
            objects that expose ``get_text()``).

    Returns:
        A single string made of each page's text placed back to back; no
        separator is inserted between pages. A document with no pages
        yields ``""``.
    """
    # Iterate the pages directly and join once, rather than indexing with
    # range(len(doc)) and growing a string with ``+=`` on every page.
    return "".join(page.get_text() for page in doc)

# Parse the text into clauses and sub-clauses
def parse_clauses(text):
    """Split *text* into numbered clauses and their lettered sub-clauses.

    Clauses are delimited by headings such as ``"1. "``; everything before
    the first heading is discarded. Each clause body is then split on
    lettered markers such as ``"a) "``.

    Args:
        text: The full document text.

    Returns:
        A dict keyed ``'Clause 1'``, ``'Clause 2'``, ... where the number is
        the clause's position in the text (not the digits of its heading).
        Each value is the list of fragments of that clause; the first
        fragment is whatever precedes the first lettered marker.
    """
    heading_pattern = re.compile(r'\b\d+\.\s')
    marker_pattern = re.compile(r'\b[a-z]\)\s')

    # split() always yields at least one piece; the first one is the text
    # ahead of the first numbered heading and is not part of any clause.
    _preamble, *clause_bodies = heading_pattern.split(text)

    return {
        f'Clause {position}': marker_pattern.split(body)
        for position, body in enumerate(clause_bodies, start=1)
    }

# Classify the content using a pre-trained text classification model
def classify_clauses(parsed_clauses, *, labels=None, classifier=None):
    """Assign the best-matching label to every sub-clause.

    Args:
        parsed_clauses: Mapping of clause name to a list of sub-clause
            strings, as produced by ``parse_clauses``.
        labels: Candidate labels to choose from. When omitted, the built-in
            list of contract-section labels is used.
        classifier: Callable invoked as
            ``classifier(text, candidate_labels=labels)`` that returns a
            mapping whose ``'labels'`` entry has the chosen label first.
            When omitted, a zero-shot classification pipeline backed by
            ``facebook/bart-large-mnli`` is created. Pass one in to reuse
            an already loaded model instead of loading it on every call.

    Returns:
        A dict mapping each clause name to a list of
        ``(sub_clause, top_label)`` tuples, in the original order. Empty or
        whitespace-only sub-clauses are skipped.
    """
    if labels is None:
        labels = ["Services Provided", "Payment", "Term", "Confidentiality", "Termination", "Governing Law", "Signatures"]
    if classifier is None:
        classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    classified_clauses = {}
    for clause, sub_clauses in parsed_clauses.items():
        classified_clauses[clause] = [
            (sub_clause, classifier(sub_clause, candidate_labels=labels)['labels'][0])
            for sub_clause in sub_clauses
            # Splitting can leave blank fragments (e.g. a clause that opens
            # directly with "a) "); there is nothing in them to classify.
            if sub_clause.strip()
        ]
    return classified_clauses

# Run the full pipeline: extract the PDF text, split it into clauses,
# then label every sub-clause.
text = extract_text(doc)
parsed_clauses = parse_clauses(text)
classified_clauses = classify_clauses(parsed_clauses)

# Report the results: one heading line per clause, followed by an indented
# "<label>: <fragment>" line for each of its classified fragments.
for clause_name, labelled_fragments in classified_clauses.items():
    print(f"{clause_name}:")
    for fragment, label in labelled_fragments:
        print(f"  {label}: {fragment}")

