<a href="https://colab.research.google.com/github/siddugoud6966/Automated-Legal-Document-Summarizer-NLP-AI-/blob/main/Automated_Legal_Document_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *Main Code*

In [17]:
import os
import csv
import warnings
import fitz  # PyMuPDF
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
from sklearn.exceptions import ConvergenceWarning

# --- Colab specific: upload PDFs ---
# google.colab is only importable inside a Colab runtime.
from google.colab import files

print("Please upload your PDF files now (multiple allowed).")
# Opens the Colab file picker. The returned value is kept in `uploaded`, but
# nothing in the visible code reads it afterwards.
uploaded = files.upload()  # This opens file picker to upload PDFs

# Directory that process_directory() scans for PDFs and writes its CSV into.
# NOTE(review): the scan covers every .pdf present in this directory, not only
# the files uploaded in this run, so PDFs left over from earlier uploads are
# processed again. Presumably uploads land here because /content is the Colab
# working directory -- confirm.
directory_path = "/content"

# Silence FutureWarning / ConvergenceWarning noise for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Optional: only show transformers log messages at error level.
# Note that this binds the name `logging` to transformers' logging helper,
# not to the standard-library `logging` module.
from transformers import logging
logging.set_verbosity_error()

# Summarization pipeline setup: load the seq2seq checkpoint once and wrap it
# in a "summarization" pipeline that summarize_text() calls.
model_name = "shresthasingh/my_awesome_billsum_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# NER pipeline setup: token-classification checkpoint used by
# extract_named_entities().
ner_model_name = "dslim/bert-base-NER"
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True
# flag: word pieces are merged into whole entities and each result carries an
# 'entity_group' key.
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",
)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenation, in page order, of ``page.get_text()`` for each page.
        An empty string if no page yields any text.
    """
    # The context manager closes the document (and its underlying file handle)
    # even when text extraction of a page raises.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in doc)

def summarize_text(text, min_length=30):
    """Summarize a single piece of text with the module-level ``summarizer``.

    Args:
        text: The text to summarize.
        min_length: Forwarded to the summarizer as its ``min_length`` argument.

    Returns:
        The ``'summary_text'`` of the first result returned by the summarizer,
        or an empty string when ``text`` contains no non-whitespace characters.
    """
    # Nothing to summarize (e.g. a PDF without a text layer): skip the model
    # call instead of feeding it an empty prompt.
    if not text or not text.strip():
        return ""

    # do_sample=False keeps generation deterministic for a given input.
    results = summarizer(text, min_length=min_length, do_sample=False)
    return results[0]['summary_text']

def chunk_text(text, chunk_size=512):
    """Split text into pieces of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per piece.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by a
        single space. Empty list when ``text`` has no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def recursive_summarize(text, chunk_size=300, min_length=30):
    """Summarize text of any length by repeatedly summarizing word chunks.

    Text that fits in one chunk is summarized directly. Longer text is split
    with ``chunk_text``, every chunk is summarized with ``summarize_text``, the
    partial summaries are joined with spaces, and the process repeats on the
    joined text until it fits in a single chunk.

    Args:
        text: The text to summarize.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        min_length: Forwarded to ``summarize_text`` for every call.

    Returns:
        The final summary string, or an empty string when ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Implemented as a loop rather than self-recursion so that long documents
    # cannot hit Python's recursion limit.
    current = text
    while True:
        word_count = len(current.split())
        if word_count == 0:
            # Nothing (left) to summarize, e.g. a PDF without a text layer.
            return ""
        if word_count <= chunk_size:
            return summarize_text(current, min_length)

        partials = [
            summarize_text(piece, min_length)
            for piece in chunk_text(current, chunk_size)
        ]
        combined = ' '.join(partials)
        combined_words = combined.split()

        if len(combined_words) >= word_count:
            # The pass did not shrink the text (min_length can make short
            # chunks expand). Repeating it would never terminate, so finish
            # with one summary over the leading chunk_size words.
            return summarize_text(' '.join(combined_words[:chunk_size]), min_length)

        current = combined

def extract_named_entities(text, chunk_size=256):
    """Collect the distinct person, organization and location names in text.

    The text is split into word chunks with ``chunk_text`` and each chunk is
    run through the module-level ``ner_pipeline``.

    Args:
        text: The text to scan.
        chunk_size: Maximum number of words per chunk handed to the pipeline.

    Returns:
        A dict with the keys ``'PER'``, ``'ORG'`` and ``'LOC'``, each mapping to
        the set of ``'word'`` values reported for that entity group. Results
        with any other entity group are ignored.
    """
    wanted_labels = ('PER', 'ORG', 'LOC')
    found = {label: set() for label in wanted_labels}

    for piece in chunk_text(text, chunk_size):
        for span in ner_pipeline(piece):
            # Each pipeline result is expected to expose its label under
            # 'entity_group' and its surface text under 'word'.
            bucket = found.get(span['entity_group'])
            if bucket is not None:
                bucket.add(span['word'])

    return found

def process_legal_document(pdf_path):
    """Produce a summary and the named entities for one PDF.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        A ``(summary, entities)`` tuple: the result of ``recursive_summarize``
        and the result of ``extract_named_entities``, both computed from the
        text returned by ``extract_text_from_pdf``.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # Tuple elements are evaluated left to right: summary first, then entities.
    return recursive_summarize(document_text), extract_named_entities(document_text)

def process_directory(directory_path):
    """Analyze every PDF in a directory and write the results to a CSV file.

    Each file whose name ends in ``.pdf`` (case-insensitive) is passed to
    ``process_legal_document``. One row per file is written to
    ``legal_document_analysis.csv`` inside ``directory_path``; the file is
    overwritten if it already exists. Progress and the output location are
    printed.

    Args:
        directory_path: Directory to scan for PDFs and to write the CSV into.

    Returns:
        None.
    """
    header = [
        "File name",
        "File Summary",
        "Persons of Interest",
        "Organizations of Interest",
        "Locations of Interest",
    ]
    rows = []

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the report stable between runs.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive match so that "SCAN.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory_path, filename)
        print(f"Processing {filename}...")
        summary, entities = process_legal_document(pdf_path)

        # The entity collections are sets; sort them so the joined cell text
        # does not depend on set iteration order.
        persons = ', '.join(sorted(entities['PER']))
        organizations = ', '.join(sorted(entities['ORG']))
        locations = ', '.join(sorted(entities['LOC']))
        rows.append([filename, summary, persons, organizations, locations])

    output_csv = os.path.join(directory_path, "legal_document_analysis.csv")
    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

    print(f"\nAnalysis results have been saved to: {os.path.abspath(output_csv)}")

# Run the analysis over every PDF found in directory_path; the CSV report is
# written into that same directory.
process_directory(directory_path)


Please upload your PDF files now (multiple allowed).


Saving SIDDU_CV.pdf to SIDDU_CV.pdf




Processing Cer.pdf...
Processing SIDDU_CV.pdf...

Analysis results have been saved to: /content/legal_document_analysis.csv


# *Download the CSV file*

In [18]:
# Hand the CSV report to Colab's file download helper.
# The path is hard-coded: it must stay in sync with directory_path and the
# "legal_document_analysis.csv" file name used by process_directory().
from google.colab import files
files.download('/content/legal_document_analysis.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>