In [1]:
from PyPDF2 import PdfReader
from transformers import pipeline
import ipywidgets as widgets
from IPython.display import display, clear_output
import nltk
from nltk.tokenize import sent_tokenize
import torch
import os

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt_tab: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [2]:
uploader = widgets.FileUpload(
    accept='.pdf',
    multiple=False,
    description='Upload PDF',
    layout=widgets.Layout(width='50%', margin='20px')
)

upload_status = widgets.Output()
display(widgets.VBox([uploader, upload_status]))

VBox(children=(FileUpload(value=(), accept='.pdf', description='Upload PDF', layout=Layout(margin='20px', widt…

In [3]:
def extract_text_from_pdf(uploaded_file):
    try:
        pdf_reader = PdfReader(uploaded_file)
        text = ""
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            text += page_text if page_text else ''
        return text
    except Exception as e:
        print(f"Text extraction error: {str(e)}")
        return ""

In [4]:
def chunk_text(text, max_chunk_size=512, overlap=100):
    if not text.strip():
        return []
    
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        words = sentence.split()
        if current_length + len(words) > max_chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = current_chunk[-overlap:] if overlap else []
            current_length = sum(len(s.split()) for s in current_chunk)
        current_chunk.append(sentence)
        current_length += len(words)
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return [chunk for chunk in chunks if len(chunk.split()) > 20]

In [5]:
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)



In [6]:
def generate_summary(contents):
    try:
        text = extract_text_from_pdf(contents)
        if not text or len(text.split()) < 50:
            return "Error: Document too short (needs 50+ words)"
        
        chunks = chunk_text(text)
        if not chunks:
            return "Error: No valid text chunks created"
        
        chunk_summaries = []
        for chunk in chunks:
            word_count = len(chunk.split())
            try:
                # Dynamic length calculation
                max_len = min(150, max(30, word_count//2))  # Ensures max >= 30
                min_len = min(30, max_len-1)  # Ensures min < max
                
                result = summarizer(
                    chunk,
                    max_length=max_len,
                    min_length=min_len,
                    do_sample=False,
                    truncation=True
                )
                if result:
                    chunk_summaries.append(result[0]['summary_text'])
            except Exception as e:
                print(f"Chunk skipped: {str(e)}")
                continue
        
        if not chunk_summaries:
            return "Error: No valid summaries generated"
        
        combined_summary = ' '.join(chunk_summaries)
        combined_word_count = len(combined_summary.split())
        
        # Final summary length calculation
        final_max = min(300, max(50, combined_word_count//2))
        final_min = min(50, final_max-1)
        
        final_result = summarizer(
            combined_summary,
            max_length=final_max,
            min_length=final_min,
            do_sample=False,
            truncation=True
        )
        
        return final_result[0]['summary_text'] if final_result else "Summary generation failed"
    
    except Exception as e:
        return f"Processing error: {str(e)}"

In [7]:
def handle_upload(change):
    with upload_status:
        clear_output()
        
        if uploader.value:
            try:
                file_info = uploader.value[0]
                file_name = file_info['name']
                content = file_info['content']
                
                with open(file_name, 'wb') as f:
                    f.write(content)
                
                print("🔄 Processing document...")
                summary = generate_summary(file_name)
                
                print("\n✅ SUMMARY:")
                print("-" * 50)
                print(summary)
                print("-" * 50)
                
                if os.path.exists(file_name):
                    os.remove(file_name)
                    
            except Exception as e:
                print("\n❌ PROCESSING FAILED:")
                print("-" * 50)
                print(f"Error: {str(e)}")
                print("-" * 50)
                print("Please check:")
                print("1. PDF has selectable text (not scanned)")
                print("2. Document has sufficient content (2+ pages)")
                print("3. File isn't password protected")

uploader.observe(handle_upload, names='value')