In [39]:
import pytesseract
from PIL import Image
import pdfplumber
import fitz  # PyMuPDF
import docx
from odf import text, teletype, opendocument
import os, json

# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset, the default Windows install location below is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)

print("Libraries and OCR ready")


Libraries and OCR ready


In [40]:
def _ocr_pdf_pages(file_path, dpi=300, lang="eng"):
    """Render every page of a PDF to an image and OCR it with Tesseract.

    Args:
        file_path: Path of the PDF to rasterize.
        dpi: Resolution passed to ``page.get_pixmap``.
        lang: Tesseract language code.

    Returns:
        The OCR text of all pages, each preceded by a ``--- Page N ---`` header.
    """
    page_blocks = []
    pdf_doc = fitz.open(file_path)
    try:
        for page_number, page in enumerate(pdf_doc, start=1):
            pix = page.get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            ocr_text = pytesseract.image_to_string(img, lang=lang)
            page_blocks.append(f"\n\n--- Page {page_number} ---\n{ocr_text}")
    finally:
        # Always release the document handle, even when OCR fails part-way.
        pdf_doc.close()
    return "".join(page_blocks)


def extract_text_from_file(file_path):
    """Extract plain text from a PDF, image, DOCX or ODT file.

    PDFs are read through their embedded text layer first. When that layer is
    empty, or reading it raises, every page is rasterized and OCR'd instead.
    Images (.png/.jpg/.jpeg) are OCR'd directly.

    Args:
        file_path: Path to the input file; the type is chosen from the
            (case-insensitive) file extension.

    Returns:
        The extracted text with leading/trailing whitespace stripped.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()
    text_data = ""

    if ext == ".pdf":
        try:
            # Step 1: normal text-layer extraction.
            with pdfplumber.open(file_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            text_data = "".join(t + "\n" for t in page_texts if t)
            needs_ocr = not text_data.strip()
            if needs_ocr:
                print("⚠️ No text found in PDF — using OCR fallback.")
        except Exception as e:
            print("⚠️ PDF read error:", e)
            # Drop any partial text: the OCR pass below covers every page, so
            # keeping it would duplicate the pages that were already read.
            text_data = ""
            needs_ocr = True

        # Step 2: OCR fallback. It runs outside the try block so that an OCR
        # failure propagates once instead of triggering a second OCR attempt.
        if needs_ocr:
            text_data = _ocr_pdf_pages(file_path)

    elif ext in [".png", ".jpg", ".jpeg"]:
        # The context manager closes the underlying image file after OCR.
        with Image.open(file_path) as img:
            text_data = pytesseract.image_to_string(img, lang="eng")

    elif ext == ".docx":
        doc = docx.Document(file_path)
        text_data = "".join(para.text + "\n" for para in doc.paragraphs)

    elif ext == ".odt":
        doc = opendocument.load(file_path)
        # `text` is the odf.text module here; text.P is the paragraph element.
        text_data = "".join(
            teletype.extractText(p) + "\n" for p in doc.getElementsByType(text.P)
        )

    else:
        raise ValueError("Unsupported file type.")

    return text_data.strip()


In [43]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for abstractive summarization
model_name = "facebook/bart-large-cnn"

# Pull the seq2seq weights and the tokenizer that belongs to the same checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a ready-to-call summarization pipeline
summarizer = pipeline(task="summarization", tokenizer=tokenizer, model=model)

print("Summarization model loaded")


Device set to use cpu


Summarization model loaded


In [44]:
def generate_summary(text, max_chunk_tokens=900):
    """
    Summarizes any length of text — short or long.

    Short inputs (fewer than 100 whitespace-separated words) are summarized in
    one call. Longer inputs are tokenized, split into chunks of at most
    ``max_chunk_tokens`` tokens, summarized chunk by chunk, and the partial
    summaries are then compressed into one final summary.

    The joined partial summaries are reduced again, chunk by chunk, while they
    are still longer than ``max_chunk_tokens``, so the final compression call
    stays within the chunk size instead of receiving an arbitrarily long input.

    Args:
        text: The document text to summarize.
        max_chunk_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        The summary string, or a warning message when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_tokens`` is smaller than 1.
    """
    if max_chunk_tokens < 1:
        raise ValueError("max_chunk_tokens must be at least 1.")

    if not text or not text.strip():
        return "⚠️ No text found to summarize."

    # short documents
    if len(text.split()) < 100:
        result = summarizer(text, max_length=150, min_length=30, do_sample=False)
        return result[0]['summary_text']

    # Tokenize the full text into IDs (no truncation)
    token_ids = tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0]

    while True:
        summaries = []
        for start in range(0, len(token_ids), max_chunk_tokens):
            chunk = token_ids[start:start + max_chunk_tokens]
            chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
            try:
                summary = summarizer(chunk_text, max_length=250, min_length=80, do_sample=False)
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                # Best effort: one failing chunk should not lose the rest.
                print("⚠️ Skipped one chunk due to:", e)

        combined_summary = " ".join(summaries)

        # Zero or one partial summary: nothing left to compress.
        if len(summaries) <= 1:
            return combined_summary

        combined_ids = tokenizer(
            combined_summary, return_tensors="pt", truncation=False
        )["input_ids"][0]

        # Stop once the combined summary fits in a single chunk. Also stop if
        # a round failed to shrink the text, which guarantees termination.
        if len(combined_ids) <= max_chunk_tokens or len(combined_ids) >= len(token_ids):
            break
        token_ids = combined_ids

    # Compress the combined partial summaries into the final summary
    final_summary = summarizer(
        combined_summary,
        max_length=300,
        min_length=100,
        do_sample=False
    )[0]['summary_text']
    return final_summary


In [45]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Define model
model_name = "facebook/bart-large-cnn"

# An earlier cell in this notebook loads this same checkpoint. Reload only when
# no pipeline for `model_name` is already present in the kernel; if the check
# cannot confirm a match, fall back to loading as before.
_summarizer_ready = (
    "summarizer" in globals()
    and "tokenizer" in globals()
    and getattr(globals().get("model"), "name_or_path", None) == model_name
)

if not _summarizer_ready:
    # load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # create summarizer pipeline
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

print("Summarization model loaded")

Device set to use cpu


Summarization model loaded


In [46]:
def generate_summary(text, max_chunk_tokens=900):
    """
    Summarizes any length of text — short or long.
    - Returns a warning message for empty or missing text
    - Automatically splits into chunks (if too long)
    - Safely combines partial summaries, reducing them again chunk by chunk
      while they are still longer than ``max_chunk_tokens``
    - Produces one clean final summary

    Args:
        text: The document text to summarize.
        max_chunk_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        The summary string, or a warning message when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_tokens`` is smaller than 1.
    """
    if max_chunk_tokens < 1:
        raise ValueError("max_chunk_tokens must be at least 1.")

    # Nothing to summarize: avoid calling .split() on None or sending empty
    # text to the model.
    if not text or not text.strip():
        return "⚠️ No text found to summarize."

    # short text
    if len(text.split()) < 100:
        result = summarizer(text, max_length=150, min_length=30, do_sample=False)
        return result[0]['summary_text']

    # Tokenize full text into IDs (no truncation)
    pending_ids = tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0]

    while True:
        # Split tokens into fixed-size windows and summarize each one
        summaries = []
        for offset in range(0, len(pending_ids), max_chunk_tokens):
            chunk_text = tokenizer.decode(
                pending_ids[offset:offset + max_chunk_tokens],
                skip_special_tokens=True
            )
            try:
                summary = summarizer(
                    chunk_text,
                    max_length=250,
                    min_length=80,
                    do_sample=False
                )[0]['summary_text']
                summaries.append(summary)
            except Exception as e:
                # Best effort: one failing chunk should not lose the rest.
                print("⚠️ Skipped one chunk due to:", e)

        # Combine all small summaries
        combined_summary = " ".join(summaries)

        # Zero or one partial summary: nothing left to compress.
        if len(summaries) <= 1:
            return combined_summary

        combined_ids = tokenizer(
            combined_summary, return_tensors="pt", truncation=False
        )["input_ids"][0]

        # Done reducing once the combined text fits in one chunk. The second
        # condition stops a round that made no progress, so the loop ends.
        if len(combined_ids) <= max_chunk_tokens or len(combined_ids) >= len(pending_ids):
            break
        pending_ids = combined_ids

    # Multiple chunks: compress the combined summaries one last time
    final_summary = summarizer(
        combined_summary,
        max_length=300,
        min_length=100,
        do_sample=False
    )[0]['summary_text']
    return final_summary


In [47]:
# End-to-end run on one sample document: extract its text, then summarize it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
sample_path = r"E:\single-doc-summarizer\data\raw\notice1.pdf"

# Text extraction (falls back to OCR for PDFs without a text layer).
extracted = extract_text_from_file(sample_path)
print(f"Extracted {len(extracted)} characters from the file.")

# `summary` stays a notebook-level name; the save cell below writes it to disk.
summary = generate_summary(extracted)

print("\nSummary:\n")
print(summary)


⚠️ No text found in PDF — using OCR fallback.
Extracted 1287 characters from the file.

Summary:

The Bareilly City Police will be organizing an awareness programme titled “Nave Apradhik Kanoon ke Prati Jagrukta Abhiyan’ atSRMS_ College of Engineering and Technology, Bareilly. The programme will take place on November 25, 2025 (Saturday) at 2:00 PM in the NewSeminar Hall. All B.Tech First Year students are required to attend the programme.


In [48]:
# Folder that receives generated summaries; created if it does not exist yet.
output_folder = r"E:\single-doc-summarizer\data\processed"
os.makedirs(output_folder, exist_ok=True)

# Name the output after the document that was summarized, so changing
# `sample_path` cannot leave the summary saved under a stale file name.
source_stem = os.path.splitext(os.path.basename(sample_path))[0]
output_file = os.path.join(output_folder, f"{source_stem}_summary.txt")
with open(output_file, "w", encoding="utf-8") as f:
    f.write(summary)

print(f"\n Summary saved to: {output_file}")



 Summary saved to: E:\single-doc-summarizer\data\processed\notice1_summary.txt
