# Introduction

This notebook demonstrates how to build a pipeline for extracting, processing, and summarizing English text from PDF documents using various natural language processing (NLP) libraries. We will use several key tools, including pdfplumber for extracting text from PDFs, sentence-transformers and regular expressions for chunking and cleaning the text, and transformer-based models for generating summaries.

The key objectives of this notebook are:

1. **Extract Text**: Extract text from PDF documents using pdfplumber.
2. **Preprocess Text**: Split the extracted text into semantically coherent chunks using sentence-transformers, then clean each chunk with regular expressions.
3. **Summarization**: Apply a transformer-based model to summarize large text bodies using transformers.
4. **Output**: Display the combined summary and export it to a PDF file using fpdf.


This notebook is designed to work with a variety of text-heavy documents, allowing users to quickly extract key insights and summaries from large volumes of text.

# English Book Summarization Pipeline

### 1. Library Installation

In [None]:
# Install the third-party packages used by this notebook (notebook shell commands).
# NOTE(review): PyPDF2 is installed here, but no cell in this notebook imports it;
# PDF text extraction below is done with pdfplumber.
!pip install PyPDF2
!pip install nltk
!pip install sentence-transformers
!pip install fpdf
!pip install transformers
!pip install pdfplumber

### 2. Importing Libraries


In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from fpdf import FPDF

### 3. PDF to Text Conversion


In [None]:
# Function to convert PDF to text using pdfplumber
def pdf_to_text_plumber(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with a newline. Pages for which
        ``extract_text()`` returns nothing (``None`` or ``''``) are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join with a newline: plain concatenation fused the last word of a
        # page with the first word of the following page.
        return '\n'.join(page_text for page_text in page_texts if page_text)

# Provide the path to your PDF file
# NOTE(review): this is a hard-coded Kaggle input path; point it at your own PDF.
pdf_path = "/kaggle/input/hhhhhhh/The man who mistook his wife for a hat and other clinical tales.pdf"
# Extract the whole book into one string for the chunking step below.
book_text = pdf_to_text_plumber(pdf_path)

### 4. Semantic Chunking of the Text

In [None]:
# Load a pre-trained Sentence-BERT model for semantic embeddings.
# No explicit device is passed: sentence-transformers then uses a GPU when one
# is available and falls back to CPU otherwise, instead of failing on machines
# without CUDA as a hard-coded device='cuda' does.
model = SentenceTransformer('all-MiniLM-L6-v2')

def divide_by_semantics_with_length(text, threshold=0.6, max_words=1000, min_words=400):
    """Split ``text`` into chunks at semantic boundaries, bounded by word counts.

    The text is split into sentences on ``'. '`` and each sentence is embedded
    with the module-level ``model``. A new chunk is started when the cosine
    similarity between a sentence and the previous one drops below
    ``threshold``, or when adding the sentence would push the chunk past
    ``max_words`` -- but only once the current chunk already holds at least
    ``min_words`` words; otherwise the sentence is appended to it.

    Args:
        text: The full text to split.
        threshold: Similarity below which two adjacent sentences are treated
            as a topic change.
        max_words: Word count above which a chunk is closed.
        min_words: Minimum word count a chunk must reach before it may be
            closed inside the loop.

    Returns:
        A list of stripped chunk strings. The final chunk is always kept,
        even when it is shorter than ``min_words``, so no text is lost.
        Empty or whitespace-only input yields ``[]``.
    """
    # Nothing to embed: skip the model call entirely.
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    chunks = []
    current_chunk = sentences[0]

    for i in range(1, len(sentences)):
        # .item() turns the 1x1 similarity tensor into a plain float.
        similarity = util.pytorch_cos_sim(embeddings[i], embeddings[i - 1]).item()
        current_word_count = len(current_chunk.split())
        would_overflow = current_word_count + len(sentences[i].split()) > max_words

        if (similarity < threshold or would_overflow) and current_word_count >= min_words:
            chunks.append(current_chunk.strip())
            current_chunk = sentences[i]
        else:
            current_chunk += '. ' + sentences[i]

    # Keep the tail even if it is short; dropping it silently discarded the
    # end of the document (and everything, for documents under min_words).
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)

    return chunks

# Chunk the whole book with the default threshold and word limits.
semantic_chunks = divide_by_semantics_with_length(book_text)

### 5. Clean the Text Chunks


In [None]:
# Clean text by removing unwanted characters
def clean_text(text):
    """Remove punctuation from ``text`` and normalize its whitespace.

    Every character that is neither a word character (``\\w``) nor whitespace
    is deleted, runs of whitespace are collapsed to a single space, and the
    result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Strip punctuation first: removing it after collapsing whitespace leaves
    # double spaces behind (e.g. "a - b" became "a  b").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_chunks(chunks):
    """Apply ``clean_text`` to each chunk and return the results as a list."""
    cleaned = []
    for raw_chunk in chunks:
        cleaned.append(clean_text(raw_chunk))
    return cleaned

# Clean every semantic chunk before it is passed to the summarizer.
cleaned_semantic_chunks = clean_chunks(semantic_chunks)

### 6. Summarize the Cleaned Chunks

In [None]:
import torch  # imported in this cell: only needed to choose the device below

# device=0 selects the first GPU; -1 runs on CPU, so the notebook still works
# on machines without CUDA instead of failing at pipeline creation.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1,
)

def summarize_chunks(chunks):
    """Summarize each chunk with the module-level ``summarizer`` pipeline.

    Chunks of 50 words or fewer are passed through unchanged. Longer chunks
    are summarized with ``max_length`` set to the chunk's word count and
    ``min_length=20``. If summarization raises, the error is printed and the
    original chunk is kept, so the output always has one entry per input.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list of summaries (or original chunks), in input order.
    """
    summaries = []
    for chunk in chunks:
        chunk_length = len(chunk.split())
        if chunk_length <= 50:
            # Too short to be worth summarizing.
            summaries.append(chunk)
            continue
        try:
            # truncation=True clips inputs that exceed the model's maximum
            # input length; without it such chunks raise and end up in the
            # output unsummarized.
            result = summarizer(
                chunk,
                max_length=chunk_length,
                min_length=20,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Deliberate best-effort: keep the raw chunk rather than abort.
            print(f"Error summarizing chunk: {e}")
            summaries.append(chunk)
    return summaries

# Summarize every cleaned chunk (one output entry per chunk).
summarized_chunks = summarize_chunks(cleaned_semantic_chunks)

### 7. Generate Overall Summary

In [None]:
def overall_summary(summaries):
    """Concatenate the summaries into one string.

    Args:
        summaries: Iterable of summary strings.

    Returns:
        Each summary followed by a blank line (``"\\n\\n"``), in order;
        ``""`` when there are no summaries.
    """
    return "".join(summary + "\n\n" for summary in summaries)

# Join the per-chunk summaries into a single string.
final_summary = overall_summary(summarized_chunks)

# Print the final structured summary
print(f"Final Summary:\n{final_summary}")

### 8. Export the Summary to a PDF


In [None]:
def strip_unicode(text):
    """Return ``text`` with every character that is not encodable as latin-1 removed."""
    latin1_bytes = text.encode('latin-1', 'ignore')
    return latin1_bytes.decode('latin-1')

class PDF(FPDF):
    """FPDF subclass that writes a block of text under a 'Book Summary' title."""

    def header(self):
        """Draw the centered, bold title; does nothing on pages other than the first."""
        if self.page_no() != 1:
            return
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Book Summary', ln=True, align='C')
        self.ln(10)

    def chapter_body(self, body):
        """Write ``body`` in regular 12pt Arial, followed by a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_text(self, text):
        """Start a new page and write ``text`` on it."""
        self.add_page()
        self.chapter_body(text)

# Create PDF instance
pdf = PDF()

# Clean the final summary to remove any non-latin-1 characters
# (strip_unicode silently drops every character latin-1 cannot encode).
cleaned_summary = strip_unicode(final_summary)

# Add the cleaned summary text to the PDF
pdf.add_text(cleaned_summary)

# Save the PDF to a specified file path
# NOTE(review): this is a Windows-style path, while the input PDF above is read
# from a /kaggle/ path -- confirm the output location for the environment in use.
file_path = "D:\\summary.pdf"
pdf.output(file_path)

# Display the file path
file_path