In [None]:
# AI Legal Clause Analyzer - Google Colab Version
# Run this in Google Colab

# ------------------------------
# Installation (run this cell first)
# ------------------------------
!pip install PyPDF2 python-docx pytesseract transformers fpdf2 ipywidgets
!apt-get install tesseract-ocr

import PyPDF2
import docx
import pytesseract
from PIL import Image
from io import BytesIO
from transformers import pipeline
from fpdf import FPDF
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import base64

# ------------------------------
# Initialize AI Models
# ------------------------------
# Build both Hugging Face pipelines once, at cell execution time, so that
# generate_summary() and classify_clause() can reuse them on every call.
# The first run downloads the model weights, which can take a while.
print("Loading AI models...")
try:
    # Abstractive summarizer used by generate_summary().
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # Zero-shot classifier used by classify_clause() together with clause_labels.
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    print("✅ Models loaded successfully!")
except Exception as e:
    # Best effort: the failure is only reported. NOTE(review): in that case
    # `summarizer` and/or `classifier` stay unbound, and the helpers below
    # surface the resulting NameError as an "Error ..." string.
    print(f"❌ Error loading models: {e}")

# Candidate labels offered to the zero-shot classifier in classify_clause();
# the highest-scoring label is reported for each clause.
clause_labels = ["NDA", "Lease", "Arbitration", "Indemnity", "Termination", "Confidentiality", "Liability"]

# ------------------------------
# Helper Functions
# ------------------------------
def extract_text_from_pdf(file_content):
    """Return the text of a PDF supplied as raw bytes.

    Every page's extracted text is followed by a newline, in page order.
    """
    reader = PyPDF2.PdfReader(BytesIO(file_content))
    # Join once instead of growing a string page by page.
    return "".join(page.extract_text() + "\n" for page in reader.pages)

def extract_text_from_docx(file_content):
    """Return the text of a .docx file supplied as raw bytes.

    Paragraph texts are joined with newlines, in document order.
    """
    stream = BytesIO(file_content)
    paragraphs = docx.Document(stream).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_from_image(file_content):
    """Run Tesseract OCR on an image supplied as raw bytes and return the text."""
    buffer = BytesIO(file_content)
    picture = Image.open(buffer)
    recognized = pytesseract.image_to_string(picture)
    return recognized

def simplify_text(text):
    """Replace a few legalese words with plain-English equivalents.

    Only whole words are replaced. A plain substring replace would corrupt
    longer words that merely contain a target (for example "therein" or
    "hereinafter" both contain "herein").

    Args:
        text: The clause text to simplify.

    Returns:
        The rewritten text prefixed with "Simplified: ".
    """
    import re  # local import keeps this block self-contained

    # Simplified version - you can enhance this
    replacements = {
        "herein": "in this document",
        "thereof": "of that",
        "heretofore": "before this",
        "whereas": "since",
    }
    # One pass over the text; \b anchors restrict matches to whole words.
    pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, replacements)) + r")\b")
    simplified = pattern.sub(lambda match: replacements[match.group(0)], text)
    return f"Simplified: {simplified}"

def generate_summary(text):
    """Summarize contract text with the module-level `summarizer` pipeline.

    Text shorter than 50 characters is not summarized. Longer text is cut
    to its first 1500 characters before being sent to the model. Any
    failure is returned as an "Error generating summary: ..." string
    instead of being raised.
    """
    min_chars, max_chars = 50, 1500
    try:
        if len(text) < min_chars:
            return "Text too short to summarize effectively."
        # Slicing is a no-op when the text is already within the limit.
        excerpt = text[:max_chars]
        outputs = summarizer(excerpt, max_length=100, min_length=30, do_sample=False)
        first = outputs[0]
        return first['summary_text']
    except Exception as err:
        return f"Error generating summary: {err}"

def classify_clause(text):
    """Return the best-matching label from `clause_labels` for a clause.

    Uses the module-level zero-shot `classifier`. Any failure is returned
    as an "Error classifying: ..." string instead of being raised.
    """
    try:
        prediction = classifier(text, candidate_labels=clause_labels)
        ranked_labels = prediction['labels']
        # The first entry of 'labels' is the one reported as the match.
        return ranked_labels[0]
    except Exception as err:
        return f"Error classifying: {err}"

def detect_risks(text):
    """Scan contract text for a fixed set of risky phrases.

    Matching is case-insensitive. The late-penalty rule requires the
    standalone word "late" so that words which merely contain it
    ("related", "collateral", "translate") no longer cause false
    positives. It also accepts the plural "penalties".

    Args:
        text: Full contract text.

    Returns:
        A list of warning strings in rule order, or a single
        "no major risks" entry when nothing matched.
    """
    import re  # local import keeps this block self-contained

    text_lower = text.lower()

    # (trigger phrases, warning) pairs; any one phrase triggers the warning.
    phrase_rules = (
        (("unlimited liability",), "⚠️ Unlimited Liability Detected"),
        (("auto-renewal", "automatic renewal"), "⚠️ Auto-renewal Clause Found"),
        (("termination without notice",), "⚠️ Termination without notice detected"),
        (("non-compete", "non compete"), "⚠️ Non-compete clause found"),
    )
    risks = [
        warning
        for phrases, warning in phrase_rules
        if any(phrase in text_lower for phrase in phrases)
    ]

    mentions_penalty = "penalty" in text_lower or "penalties" in text_lower
    if mentions_penalty and re.search(r"\blate\b", text_lower):
        risks.append("⚠️ Late penalty clause detected")

    return risks if risks else ["✅ No major risks detected"]

def export_pdf(clauses, summary, classifications, risks):
    """Render the analysis results as a one-column PDF report.

    Written against fpdf2, the package this notebook installs.

    Args:
        clauses: Clause texts. Currently unused; kept so existing callers
            keep working.
        summary: Summary text for the "Summary" section.
        classifications: One entry per clause. Entries that already start
            with "Clause " are printed as-is; others get a "Clause N: "
            prefix.
        risks: Risk messages for the "Risk Analysis" section.

    Returns:
        The PDF document as bytes.
    """
    # Typographic punctuation that is common in contracts but missing
    # from latin-1; map it to ASCII before encoding.
    punctuation = str.maketrans({
        "\u2018": "'", "\u2019": "'",
        "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-",
        "\u2022": "-", "\u2026": "...",
    })

    def to_latin1(value):
        # The built-in PDF fonts only cover latin-1. Anything else (such as
        # the emoji in the risk messages) is dropped rather than letting
        # fpdf2 raise an encoding error.
        cleaned = str(value).translate(punctuation)
        return cleaned.encode("latin-1", "ignore").decode("latin-1").strip()

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)

    def heading(text, align="L"):
        pdf.cell(0, 10, text, new_x="LMARGIN", new_y="NEXT", align=align)

    def paragraph(text, line_height):
        safe = to_latin1(text)
        if safe:
            # fpdf2 leaves the cursor at the right edge after multi_cell by
            # default, so a second full-width call would have no room.
            # Return to the left margin explicitly.
            pdf.multi_cell(0, line_height, safe, new_x="LMARGIN", new_y="NEXT")

    # Title
    heading("Contract Analysis Report", align="C")
    pdf.ln(10)

    # Summary
    heading("Summary:")
    paragraph(summary, 10)
    pdf.ln(5)

    # Classifications
    heading("Clause Classifications:")
    for number, entry in enumerate(classifications, start=1):
        label = str(entry)
        # Callers may pass pre-numbered entries; avoid "Clause 1: Clause 1: ...".
        if not label.startswith("Clause "):
            label = f"Clause {number}: {label}"
        paragraph(label, 8)
    pdf.ln(5)

    # Risks
    heading("Risk Analysis:")
    for risk in risks:
        paragraph(risk, 8)

    # fpdf2 returns a bytearray when no file name is given.
    return bytes(pdf.output())

# ------------------------------
# Main Interface
# ------------------------------
class LegalAnalyzer:
    """Notebook UI: upload a contract, analyze it, and offer a PDF report.

    Attributes:
        current_text: Text extracted from the most recent upload.
        clauses: Clause strings obtained by splitting `current_text`.
        results: Dict with 'summary', 'classifications' and 'risks' from
            the latest analysis; empty until an analysis has run.
    """

    def __init__(self):
        self.current_text = ""
        self.clauses = []
        self.results = {}

    @staticmethod
    def _read_upload(value):
        """Return ``(file_name, content_bytes)`` for the first uploaded file.

        Accepts both shapes of ``FileUpload.value``: the ipywidgets 7 dict
        keyed by file name (entries carry 'metadata' and 'content') and
        the ipywidgets 8 sequence of entries with 'name' and 'content'.
        """
        entry = next(iter(value.values())) if isinstance(value, dict) else value[0]
        if "metadata" in entry:
            name = entry["metadata"]["name"]
        else:
            name = entry["name"]
        # Content may be a memoryview; normalize to bytes.
        return name, bytes(entry["content"])

    @staticmethod
    def _split_clauses(text):
        """Split text on '.' and keep the pieces longer than 30 characters."""
        pieces = (piece.strip() for piece in text.split("."))
        return [piece for piece in pieces if len(piece) > 30]

    def create_interface(self):
        """Build and display the upload widget, analyze button and output area."""
        print("🔍 AI Legal Clause Analyzer")
        print("=" * 50)

        # File uploader (.jpeg is listed because process_file handles it)
        self.file_upload = widgets.FileUpload(
            accept='.pdf,.docx,.jpg,.jpeg,.png',
            multiple=False,
            description="Upload Contract"
        )

        # Process button
        self.process_btn = widgets.Button(
            description="🔍 Analyze Contract",
            button_style='info',
            layout=widgets.Layout(width='200px')
        )

        # Output area
        self.output = widgets.Output()

        # Set up callbacks
        self.process_btn.on_click(self.process_file)

        # Display interface
        display(widgets.VBox([
            widgets.HTML("<h3>📄 Upload your contract document:</h3>"),
            self.file_upload,
            widgets.HTML("<br>"),
            self.process_btn,
            widgets.HTML("<br>"),
            self.output
        ]))

    def process_file(self, btn):
        """Button callback: extract text from the upload and analyze it.

        All output goes to the Output widget. Errors are reported there
        instead of being raised, including a malformed upload value.
        """
        with self.output:
            clear_output()

            if not self.file_upload.value:
                print("❌ Please upload a file first!")
                return

            print("🔄 Processing file...")

            try:
                file_name, file_content = self._read_upload(self.file_upload.value)
                file_type = file_name.split(".")[-1].lower()

                # Extractor per supported extension.
                extractors = {
                    "pdf": extract_text_from_pdf,
                    "docx": extract_text_from_docx,
                    "jpg": extract_text_from_image,
                    "jpeg": extract_text_from_image,
                    "png": extract_text_from_image,
                }
                extractor = extractors.get(file_type)
                if extractor is None:
                    print("❌ Unsupported file format!")
                    return

                self.current_text = extractor(file_content)
                print(f"✅ Extracted {len(self.current_text)} characters from {file_name}")

                # With no text at all, the analysis would report "no major
                # risks", which is misleading; stop here instead.
                if not self.current_text.strip():
                    print("❌ No readable text found in the file!")
                    return

                # Process the text
                self.analyze_contract()

            except Exception as e:
                print(f"❌ Error processing file: {e}")

    def analyze_contract(self):
        """Summarize, classify and risk-check `current_text`, then show results."""
        print("\n🧠 Analyzing contract...")

        # Split into clauses (simple approach)
        self.clauses = self._split_clauses(self.current_text)

        print(f"📋 Found {len(self.clauses)} clauses")

        # Generate analysis
        print("\n📝 Generating summary...")
        summary = generate_summary(self.current_text)

        print("🏷️ Classifying clauses...")
        classifications = [
            f"Clause {number}: {classify_clause(clause)}"
            for number, clause in enumerate(self.clauses[:5], start=1)  # Limit to first 5 for demo
        ]

        print("⚠️ Detecting risks...")
        risks = detect_risks(self.current_text)

        # Store results before displaying: display_results only offers
        # the export button once results exist.
        self.results = {
            'summary': summary,
            'classifications': classifications,
            'risks': risks
        }

        # Display results
        self.display_results(summary, classifications, risks)

    def display_results(self, summary, classifications, risks):
        """Print the analysis and, if results are stored, show an export button."""
        print("\n" + "="*50)
        print("📊 ANALYSIS RESULTS")
        print("="*50)

        print(f"\n📝 SUMMARY:")
        print(f"{summary}")

        print(f"\n🏷️ CLAUSE CLASSIFICATIONS:")
        for classification in classifications:
            print(f"  • {classification}")

        print(f"\n⚠️ RISK ANALYSIS:")
        for risk in risks:
            print(f"  • {risk}")

        print(f"\n📄 SIMPLIFIED CLAUSES (First 3):")
        for i, clause in enumerate(self.clauses[:3]):
            print(f"\nClause {i+1}:")
            print(f"Original: {clause[:150]}...")
            print(f"{simplify_text(clause[:200])}")

        # Export option
        if self.results:
            print(f"\n💾 Export options:")
            export_btn = widgets.Button(description="📥 Download PDF Report", button_style='success')
            export_btn.on_click(self.export_report)
            display(export_btn)

    def export_report(self, btn):
        """Button callback: build the PDF report and show a download link."""
        # Route callback output into the Output widget when the interface
        # has been built, so the link appears below the results.
        sink = getattr(self, "output", None)
        if sink is None:
            self._show_download_link()
        else:
            with sink:
                self._show_download_link()

    def _show_download_link(self):
        """Render the stored results to PDF and display a data-URI download link."""
        try:
            pdf_content = export_pdf(
                self.clauses[:5],
                self.results['summary'],
                self.results['classifications'],
                self.results['risks']
            )

            # Create download link (the PDF is embedded as a base64 data URI)
            b64 = base64.b64encode(pdf_content).decode()
            href = f'data:application/pdf;base64,{b64}'

            display(HTML(f'''
                <a href="{href}" download="contract_analysis_report.pdf">
                    <button style="background-color: #4CAF50; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer;">
                        📥 Download PDF Report
                    </button>
                </a>
            '''))
            print("✅ PDF report ready for download!")

        except Exception as e:
            print(f"❌ Error generating PDF: {e}")

# ------------------------------
# Launch the App
# ------------------------------
# In a notebook cell __name__ is "__main__", so running the cell shows the
# upload interface immediately. `analyzer` stays bound at module level so its
# state (current_text, clauses, results) can be inspected from later cells.
if __name__ == "__main__":
    analyzer = LegalAnalyzer()
    analyzer.create_interface()

# ------------------------------
# Alternative: Text Input Version
# ------------------------------
def create_text_input_version():
    """Show a paste-in alternative to the file upload interface.

    Displays a text area, an analyze button and an output area. Clicking
    the button summarizes the pasted text, classifies its first three
    clauses and lists detected risks.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("📝 ALTERNATIVE: Direct Text Input Version")
    print(divider)

    text_area = widgets.Textarea(
        value="",
        placeholder="Paste your contract text here...",
        description="Contract Text:",
        layout=widgets.Layout(width='100%', height='200px'),
    )
    analyze_btn = widgets.Button(description="🔍 Analyze Text", button_style='info')
    output_area = widgets.Output()

    def analyze_text(btn):
        """Button callback: analyze the pasted text and print the results."""
        with output_area:
            clear_output()

            contract_text = text_area.value
            if not contract_text.strip():
                print("❌ Please enter some text first!")
                return

            print("🔄 Analyzing text...")

            # Run the whole analysis before printing any results.
            pieces = (piece.strip() for piece in contract_text.split("."))
            clauses = [piece for piece in pieces if len(piece) > 30]
            summary = generate_summary(contract_text)
            classifications = [classify_clause(clause) for clause in clauses[:3]]
            risks = detect_risks(contract_text)

            print("📊 RESULTS:")
            print(f"\n📝 Summary: {summary}")
            print(f"\n🏷️ Classifications:")
            for number, label in enumerate(classifications, start=1):
                print(f"  Clause {number}: {label}")
            print(f"\n⚠️ Risks:")
            for risk in risks:
                print(f"  {risk}")

    analyze_btn.on_click(analyze_text)

    layout_children = [
        widgets.HTML("<h3>💡 Alternative: Direct Text Input</h3>"),
        text_area,
        analyze_btn,
        output_area,
    ]
    display(widgets.VBox(layout_children))

# Uncomment the line below to also show the text input version
# create_text_input_version()

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting fpdf2
  Downloading fpdf2-2.8.4-py2.py3-none-any.whl.metadata (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.7/72.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Do

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✅ Models loaded successfully!
🔍 AI Legal Clause Analyzer


VBox(children=(HTML(value='<h3>📄 Upload your contract document:</h3>'), FileUpload(value={}, accept='.pdf,.doc…