In [None]:
%%writefile README.md

#  ClauseLens: Clause-Level Legal Contract Analyzer

**ClauseLens** is an AI-powered tool that detects and analyzes key legal clauses in contracts using NLP and local LLMs. It helps legal professionals quickly identify risks, ambiguities, and imbalances — all with **100% offline, private inference**.

> No data leaves your machine — runs entirely locally.



##  Key Features

- **Detects 8+ Legal Clauses**
  `CONFIDENTIALITY`, `TERMINATION`, `NON_COMPETE`, `LIABILITY`, `FORCE_MAJEURE`, `PAYMENT_TERMS`, and more.
-  **Local LLM Analysis**
  Powered by **TinyLlama-1.1B** via `llama.cpp` — no cloud API, no data leakage.
-  **Privacy-First Design**
  Runs 100% offline after setup — ideal for sensitive legal documents.
-  **PDF & DOCX Support**
  Extracts text from both formats using `PyMuPDF` and `python-docx`.
-  **Exportable Reports**
  Download results as **JSON** or **PDF** for sharing and review.
-  **Gradio Web Interface**
  Clean, interactive UI for easy use.

---

##  How It Works

1. **Upload**: User uploads a PDF or DOCX contract.
2. **Parse**: Text is extracted using `PyMuPDF` / `python-docx`.
3. **Detect**: spaCy's `Matcher` identifies clause keywords.
4. **Analyze**: Local LLM (TinyLlama) reviews each clause for risk, fairness, and suggestions.
5. **Report**: Results are displayed and exported as JSON/PDF.


For each detected clause, the LLM is asked to respond in this format:
- Risk Level: Low/Medium/High
- Feedback: Brief analysis
- Suggestions: Bullet points for improvement

Writing README.md


In [None]:
%%writefile requirements.txt

gradio
pymupdf
python-docx
spacy
llama-cpp-python==0.2.70
huggingface-hub
fpdf2
tqdm
numpy

https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl

Writing requirements.txt


In [None]:
pip install -r requirements.txt

In [None]:
import os
import shutil

from google.colab import files

# Start from an empty data/ folder so contracts from an earlier session are
# not mixed into this run. shutil avoids spawning a shell for `rm -rf` and
# works on any OS; the folder is recreated so later cells can write into it.
shutil.rmtree("data", ignore_errors=True)
os.makedirs("data", exist_ok=True)

print("📎 Upload your real legal contracts (PDF/DOCX):")
# files.upload() opens the Colab upload widget; the result is iterated below
# as a mapping of file name -> file content.
uploaded = files.upload()

📎 Upload your real legal contracts (PDF/DOCX):


Saving LEGAL-CONFIDENTIAL-AGREEMENTS (1).pdf to LEGAL-CONFIDENTIAL-AGREEMENTS (1).pdf
Saving LEGAL-CONFIDENTIAL-AGREEMENTS (2).pdf to LEGAL-CONFIDENTIAL-AGREEMENTS (2).pdf
Saving LEGAL-CONFIDENTIAL-AGREEMENTS (3).pdf to LEGAL-CONFIDENTIAL-AGREEMENTS (3).pdf
Saving LEGAL-CONFIDENTIAL-AGREEMENTS (4).pdf to LEGAL-CONFIDENTIAL-AGREEMENTS (4).pdf
Saving LEGAL-CONFIDENTIAL-AGREEMENTS (5).pdf to LEGAL-CONFIDENTIAL-AGREEMENTS (5).pdf


In [None]:
# Persist every uploaded file (name -> raw bytes) into the local data/ folder.
os.makedirs("data", exist_ok=True)
for filename, payload in uploaded.items():
    target_path = os.path.join("data", filename)
    with open(target_path, "wb") as out_file:
        out_file.write(payload)
    print(f"✅ Saved: {filename}")

✅ Saved: LEGAL-CONFIDENTIAL-AGREEMENTS (1).pdf
✅ Saved: LEGAL-CONFIDENTIAL-AGREEMENTS (2).pdf
✅ Saved: LEGAL-CONFIDENTIAL-AGREEMENTS (3).pdf
✅ Saved: LEGAL-CONFIDENTIAL-AGREEMENTS (4).pdf
✅ Saved: LEGAL-CONFIDENTIAL-AGREEMENTS (5).pdf


In [None]:
import shutil

from google.colab import drive

# Mount Google Drive, then mirror the local data/ folder into it.
# dirs_exist_ok=True lets the copy run again over an existing backup folder.
drive.mount('/content/drive')
shutil.copytree(
    src="data",
    dst="/content/drive/MyDrive/legal_contracts",
    dirs_exist_ok=True,
)

print("✅ All documents saved to Google Drive under '/legal_contracts'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ All documents saved to Google Drive under '/legal_contracts'


In [None]:
%%writefile parser.py

import fitz
from docx import Document
import os

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the file could not be opened or read (the error is
        printed rather than raised).
    """
    try:
        # The context manager closes the document (and its file handle)
        # even if text extraction fails part-way through.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""

def extract_text_from_docx(docx_path):
    """Read a DOCX file with python-docx and return its paragraph text.

    Paragraph texts are joined with newlines. Any failure is printed and an
    empty string is returned instead of raising.
    """
    try:
        lines = []
        for paragraph in Document(docx_path).paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as err:
        print(f"Error reading DOCX {docx_path}: {err}")
        return ""

def load_documents(folder="data"):
    """Load the text of every supported document in a folder.

    Args:
        folder: Directory to scan (not recursive). Defaults to "data".

    Returns:
        A list with one extracted text per PDF/DOCX file, ordered by file
        name. A file that fails to parse contributes an empty string,
        because the extractors return "" on error.
    """
    texts = []
    # os.listdir() returns entries in arbitrary order; sort so that indexes
    # such as texts[0] refer to the same document on every run.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # Compare extensions case-insensitively so "CONTRACT.PDF" is loaded.
        lowered = file.lower()
        if lowered.endswith(".pdf"):
            texts.append(extract_text_from_pdf(path))
        elif lowered.endswith(".docx"):
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {file}")
    return texts

Writing parser.py


In [None]:
# parser.py was only written to disk by %%writefile; import the loader so this
# cell also works on a fresh kernel.
from parser import load_documents

texts = load_documents('data')
print(f"Loaded {len(texts)} documents")

# Guard the preview: texts[0] would raise IndexError if data/ holds no
# supported files.
if texts:
    print("\nFirst 500 chars of text:")
    print(texts[0][:500])

Loaded 5 documents

First 500 chars of text:
  
EX-10.2 3 mye-ex10_2.htm EX-10.2
Exhibit 10.2
NON-COMPETITION, NON-SOLICITATION AND CONFIDENTIALITY AGREEMENT
 
THIS NON-COMPETITION, NON-SOLICITATION and CONFIDENTIALITY AGREEMENT (this “Agreement”) is entered into effective as of 
November 21, 2024 (the “Effective Date”), between Myers Industries, Inc., an Ohio Corporation (the “Company”) and Aaron M. Schapper (the “Employee”).
RECITALS:
1.
The Company is a diversified international manufacturer of polymer products for the industrial, agric


In [None]:
%%writefile spacy_matcher.py

import spacy
from spacy.matcher import Matcher


# Load the small English pipeline once at import time; the matcher shares its
# vocabulary so match ids can be resolved back to label strings.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Optional hyphen token. spaCy's English tokenizer splits intra-word hyphens
# ("non-compete" -> "non", "-", "compete"), so a single-token pattern such as
# {"LOWER": "non-compete"} can never match. Hyphenated terms are therefore
# written as word / optional "-" / word, which also covers the spaced form.
_HYPHEN = {"ORTH": "-", "OP": "?"}

# Clause label -> list of token patterns. Each inner list is one pattern;
# a clause is reported when any of its patterns matches.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non"}, _HYPHEN, {"LOWER": "disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        # Covers "non-compete", "non compete", "non-competition" and
        # "non competition" in any letter case.
        [{"LOWER": "non"}, _HYPHEN, {"LOWER": {"IN": ["compete", "competition"]}}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        # {"OP": "?"} is an optional wildcard: at most one arbitrary token
        # may sit between "invalidity" and "provision".
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [{"LOWER": "due"}, {"LOWER": "within"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["days", "weeks"]}}],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ]
}

# Register all patterns of a label in a single call.
for label, patterns in clause_patterns.items():
    matcher.add(label, patterns)

print("Clause matcher loaded with extended patterns.")

def find_clauses(text, window_size=30):
    """Run the clause matcher over ``text`` and collect each hit with context.

    Returns a list of ``(label, context, start, end)`` tuples, where
    ``start``/``end`` are the token offsets of the match and ``context`` is
    the text of the match widened by up to ``window_size`` tokens per side.
    """
    doc = nlp(text)
    token_count = len(doc)

    def _with_context(match_id, start, end):
        # Clamp the window to the document bounds.
        lo = max(0, start - window_size)
        hi = min(token_count, end + window_size)
        return (nlp.vocab.strings[match_id], doc[lo:hi].text, start, end)

    return [_with_context(*hit) for hit in matcher(doc)]

Writing spacy_matcher.py


In [None]:
# The helper modules were only written to disk by %%writefile; import what
# this cell uses so it also runs on a fresh kernel.
from parser import load_documents
from spacy_matcher import matcher, nlp

texts_1 = load_documents('data')
raw_texts = texts_1[0]

doc = nlp(raw_texts)
matches = matcher(doc)
matched_clauses = []

# Each match is (match_id, start, end) with token offsets into `doc`.
for match_id, start, end in matches:
  span = doc[start:end]
  label = nlp.vocab.strings[match_id]
  matched_clauses.append((label, span.text, start, end))

print("Found clauses:")
for clause in matched_clauses:
  print(clause)


Found clauses:
('CONFIDENTIALITY', 'CONFIDENTIALITY', 17, 18)
('CONFIDENTIALITY', 'CONFIDENTIALITY', 29, 30)
('CONFIDENTIALITY', 'confidentiality', 622, 623)
('CONFIDENTIALITY', 'Confidentiality', 673, 674)
('TERMINATION', 'termination', 939, 940)
('TERMINATION', 'termination', 950, 951)
('TERMINATION', 'termination', 1131, 1132)
('TERMINATION', 'termination', 1144, 1145)
('TERMINATION', 'termination', 2497, 2498)
('GOVERNING_LAW', 'jurisdiction', 2930, 2931)
('GOVERNING_LAW', 'jurisdiction', 3250, 3251)
('GOVERNING_LAW', 'jurisdiction', 3263, 3264)


In [None]:
def get_clause_section(doc, start_token , end_token, window_size=20):
  '''
  Return the text of a matched clause together with surrounding context.

  The slice [start_token, end_token) is widened by up to `window_size`
  tokens on each side and clamped to the bounds of `doc`.
  '''
  left_edge = start_token - window_size
  if left_edge < 0:
    left_edge = 0
  right_edge = end_token + window_size
  if right_edge > len(doc):
    right_edge = len(doc)
  return doc[left_edge:right_edge].text

# Take the first match as an example: (label, matched text, start, end).
example_matched = matched_clauses[0]
label, text, start, end = example_matched

clause_section = get_clause_section(doc, start, end)
# Show the heading followed by the extracted section text itself.
print("clause_section:\n")
print(clause_section)



clause_section:



In [None]:
%%writefile llm_reviewer.py

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os


# Model configuration: which GGUF file to use and where it is cached locally.
MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
CACHE_DIR = "models"
MODEL_PATH = os.path.join(CACHE_DIR, MODEL_FILE)

os.makedirs(CACHE_DIR, exist_ok=True)

# Reuse the cached weights when present; otherwise fetch them from the Hub and
# use the path reported by the download.
if os.path.exists(MODEL_PATH):
    print(f" Loaded cached model from {MODEL_PATH}")
else:
    print(" Downloading TinyLlama-1.1B-Chat (Q4_K_M) from Hugging Face...")
    MODEL_PATH = hf_hub_download(repo_id=MODEL_NAME, filename=MODEL_FILE, local_dir=CACHE_DIR)

# Single shared model instance, created at import time.
# n_gpu_layers=0 requests no GPU offload; n_ctx is the context size in tokens.
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=4, n_gpu_layers=0, verbose=False)


# Clause Context Extraction

def get_clause_section(doc, start_token, end_token, window_size=30):
    """
    Return the matched clause plus up to ``window_size`` tokens of context
    on each side, clamped to the bounds of ``doc``.
    """
    left_edge = start_token - window_size
    if left_edge < 0:
        left_edge = 0
    right_edge = end_token + window_size
    if right_edge > len(doc):
        right_edge = len(doc)
    return doc[left_edge:right_edge].text


# LLM Clause Review

def review_clause_with_llm(clause_type, clause_text):
    """
    Ask the local TinyLlama model to review one contract clause.

    Args:
        clause_type: Clause label such as "NON_COMPETE"; underscores become
            spaces and the result is title-cased for the prompt.
        clause_text: Text of the clause. Only the first 800 characters are
            sent to the model.

    Returns:
        The model's completion with surrounding whitespace stripped, or a
        string starting with " LLM Review Error:" if the model call failed.
    """
    # Truncate long clauses before building the prompt. This has to be a real
    # code comment: placed inside the f-string it becomes part of the text
    # sent to the model.
    excerpt = clause_text[:800]
    prompt = f"""
You are a senior legal expert reviewing a contract clause.

Evaluate:
- Is this clause fair, balanced, and standard?
- Does it overly favor one party?
- Are critical terms missing or ambiguous?

Respond in this format:
- Risk Level: [Low/Medium/High]
- Feedback: Brief professional analysis
- Suggestions: Bullet points for improvement

CLAUSE TYPE: {clause_type.replace('_', ' ').title()}
CLAUSE TEXT: {excerpt}
""".strip()

    try:
        # NOTE(review): the "\n\n" stop sequence ends generation at the first
        # blank line, which may cut off multi-paragraph answers — confirm
        # this is intended.
        output = llm(
            prompt,
            max_tokens=512,
            temperature=0.3,
            stop=["\n\n", "User:", "###"]
        )
        return output['choices'][0]['text'].strip()
    except Exception as e:
        # Best effort: report the failure as the review text instead of raising.
        return f" LLM Review Error: {str(e)}"


# Exporting Results

def export_to_json(matched_clauses, filename="clause_reviews.json"):
    """
    Write the clause reviews to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.
    Returns the path that was written.
    """
    import json

    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(filename, "w", encoding="utf-8") as report_file:
        json.dump(matched_clauses, report_file, **dump_options)

    print(f" JSON report saved to {filename}")
    return filename

def _to_latin1(text):
    """Make ``text`` printable with fpdf's built-in Latin-1-only fonts.

    Common typographic characters are mapped to ASCII look-alikes; anything
    else outside Latin-1 is replaced by "?".
    """
    typographic = str.maketrans({
        "\u2018": "'", "\u2019": "'",
        "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-",
        "\u2026": "...", "\u2022": "-",
    })
    cleaned = str(text).translate(typographic)
    return cleaned.encode("latin-1", "replace").decode("latin-1")


def export_to_pdf(matched_clauses, filename="clause_reviews.pdf"):
    """
    Export clause reviews to a PDF report.

    Args:
        matched_clauses: Iterable of dicts; the keys 'label', 'section' and
            'review' are read, each with a fallback when missing.
        filename: Path of the PDF to write.

    Returns:
        The path that was written.
    """
    from fpdf import FPDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 10, "ClauseLens - Legal Contract Review Report", align='C', ln=True)
    pdf.ln(10)

    for clause in matched_clauses:
        # Clause label
        pdf.set_font("Arial", 'B', 12)
        label = clause.get('label', 'Unknown').replace('_', ' ').title()
        pdf.cell(0, 8, txt=_to_latin1(f"Clause: {label}"), ln=True)
        pdf.ln(2)

        # Section excerpt. Contract text routinely contains curly quotes and
        # dashes that the built-in font cannot encode, so it is sanitized
        # before being written.
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="Excerpt:", ln=True)
        pdf.set_font("Arial", size=10)
        section = clause.get('section', 'N/A')
        pdf.multi_cell(0, 6, txt=_to_latin1(section))
        pdf.ln(4)

        # LLM Review (model output may also contain non-Latin-1 characters)
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="LLM Review:", ln=True)
        pdf.set_font("Arial", size=10)
        review = clause.get('review', 'No review available')
        pdf.multi_cell(0, 6, txt=_to_latin1(review))
        pdf.ln(8)

    pdf.output(filename)
    print(f" PDF report saved to {filename}")
    return filename

Writing llm_reviewer.py


In [None]:
%%writefile app.py

import gradio as gr
import os
import traceback
import time


OUTPUT_DIR = "output"

# Make sure every working directory the app uses exists before first use.
for _required_dir in ("data", OUTPUT_DIR, "models"):
    os.makedirs(_required_dir, exist_ok=True)

def process_contract(file):
    """Analyze an uploaded contract and stream progress to the UI.

    This is a generator. Every yield is a 4-tuple
    ``(status, reviews_markdown, json_path, pdf_path)``; intermediate yields
    carry only a status message, the final one carries the full results.
    Any exception is caught and reported as a status message with traceback.

    Args:
        file: Value of the upload component: a path string, an object that
            exposes the path as ``.name``, or None when nothing was uploaded.
    """
    try:
        # Clearing previous outputs
        for f in os.listdir(OUTPUT_DIR):
            os.remove(os.path.join(OUTPUT_DIR, f))

        if file is None:
            yield " No file uploaded.", None, None, None
            return

        # Depending on the Gradio version/configuration the upload arrives
        # either as a plain path string or as a temp-file object with .name.
        file_path = file if isinstance(file, str) else file.name
        ext = os.path.splitext(file_path)[1].lower()

        if ext not in [".pdf", ".docx"]:
            yield f" Unsupported format: {ext}", None, None, None
            return

        yield " Extracting text...", None, None, None
        time.sleep(0.1)

        # Extract text
        from parser import extract_text_from_pdf, extract_text_from_docx
        text = extract_text_from_pdf(file_path) if ext == ".pdf" else extract_text_from_docx(file_path)

        if not text or len(text.strip()) < 10:
            yield "⚠ Failed to extract meaningful text.", None, None, None
            return

        yield " Finding clauses...", None, None, None
        time.sleep(0.1)

        # find_clauses parses the text itself and returns, for every match,
        # the surrounding context (30-token window by default), so there is
        # no need to load the spaCy model and parse the document again here.
        from spacy_matcher import find_clauses
        matches = find_clauses(text)
        if not matches:
            yield " No clauses detected.", None, None, None
            return

        yield f" Analyzing {len(matches)} clauses with LLM...", None, None, None
        time.sleep(0.1)

        # Analyzing with LLM (importing llm_reviewer loads the model)
        from llm_reviewer import review_clause_with_llm, export_to_json, export_to_pdf
        results = []
        for label, section, _start, _end in matches:
            review = review_clause_with_llm(label, section)
            results.append({
                "label": label,
                "section": section,
                "review": review
            })

        json_path = export_to_json(results, os.path.join(OUTPUT_DIR, "clause_reviews.json"))
        pdf_path = export_to_pdf(results, os.path.join(OUTPUT_DIR, "clause_reviews.pdf"))

        # Build the Markdown shown in the UI; excerpts are capped at 300 chars.
        output_text = "##  Clause Reviews\n\n"
        for r in results:
            output_text += f" **{r['label'].replace('_', ' ').title()}**\n\n"
            output_text += f" *Excerpt:* {r['section'][:300]}...\n\n"
            output_text += f" *Review:* {r['review']}\n\n---\n\n"

        found_types = sorted(set(r['label'].replace('_', ' ').title() for r in results))
        clause_list = ", ".join(found_types)

        yield (
            f"Found {len(results)} clauses across {len(found_types)} types:\n\n{clause_list}",
            output_text,
            json_path,
            pdf_path
        )

    except Exception as e:
        tb = traceback.format_exc()
        error_msg = f" Error: {str(e)}\n\n```\n{tb}\n```"
        yield error_msg, None, None, None


# Gradio Interface
# Components are created inside nested context managers, one gr.Row() per
# visual row, in the order they appear below.
with gr.Blocks(title="ClauseLens - Legal Contract Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("#  ClauseLens: Legal Contract Analyzer")
    gr.Markdown("Upload a legal contract (PDF or DOCX) for clause detection and LLM-powered review.")

    # Upload widget, restricted to the two formats process_contract accepts.
    with gr.Row():
        file_input = gr.File(label="Upload Contract", file_types=[".pdf", ".docx"])

    with gr.Row():
        btn = gr.Button(" Analyze Contract", variant="primary")

    # Progress / summary line (first element of each yielded tuple).
    with gr.Row():
        status = gr.Textbox(label="Status")

    # Rendered Markdown with the per-clause reviews (second element).
    with gr.Row():
        output = gr.Markdown(label="Clause Reviews")

    with gr.Row():
        gr.Markdown("### 📎 Download Reports")

    # Report files produced by the run (third and fourth elements).
    with gr.Row():
        json_download = gr.File(label="Download JSON Report")
        pdf_download = gr.File(label="Download PDF Report")


    # The order of `outputs` must match the 4-tuples yielded by
    # process_contract: (status, reviews, json path, pdf path).
    btn.click(
        fn=process_contract,
        inputs=file_input,
        outputs=[status, output, json_download, pdf_download]
    )

# Enable queuing for streaming
demo.queue()

if __name__ == "__main__":
    # share=True asks Gradio for a public share link, so the app (and any
    # contract uploaded through that link) is reachable from outside this
    # machine. It stays on by default for hosted notebooks where localhost is
    # not reachable; set CLAUSELENS_SHARE=0 to keep the app local-only.
    share_publicly = os.environ.get("CLAUSELENS_SHARE", "1").strip().lower() not in (
        "0", "false", "no", "off"
    )
    try:
        demo.launch(share=share_publicly)
    except Exception as e:
        # Report launch problems as a message rather than a traceback.
        print(f"Launch failed: {e}")

Writing app.py
