In [24]:
# # Required packages (uncomment the lines below to install them)
# !pip install pymupdf
# !pip install openai
# !pip install langchain
# !pip install tiktoken
# !pip install reportlab

## Imports

In [25]:
import fitz
import re
import tiktoken
import os
import warnings
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate


# Silence UserWarning messages for the rest of the session.
warnings.simplefilter("ignore", UserWarning)

# OpenAI API key setting.
# A key that is already exported in the environment is kept as-is instead of
# being overwritten with the empty placeholder. Only when no key is set does
# the placeholder below apply; fill it in locally and never commit a real key.
os.environ.setdefault("OPENAI_API_KEY", "")

## Read and Parse the PDF

In [26]:
def create_pdf_structure(pdf_content, default_part="Unnumbered Part"):
    """
    Parses the text of a PDF into a nested dictionary of parts and chapters.

    Every line that starts with "Part " opens a new part and every line that
    starts with "Chapter " opens a new chapter inside the current part. All
    other lines are collected as the text of the chapter that is currently
    open; lines seen before the first chapter heading are discarded.

    Args:
        pdf_content (list of str): Page texts of the PDF, one string per page.
            ``None`` or an empty list yields an empty structure.
        default_part (str, optional): Key under which chapters are stored when
            a chapter heading appears before any part heading (for example a
            book that has chapters but no parts). Defaults to "Unnumbered Part".

    Returns:
        dict: A dictionary representing the PDF structure with parts, chapters, and content.
              The structure is organized as follows:
              {
                  "Part 1": {
                      "Chapter 1": "Chapter 1 content...",
                      "Chapter 2": "Chapter 2 content...",
                      ...
                  },
                  "Part 2": {
                      "Chapter 1": "Chapter 1 content...",
                      ...
                  },
                  ...
              }
              A part without chapters maps to an empty dict. If the same
              chapter heading occurs twice inside one part, the later
              occurrence replaces the earlier text.
    """
    pdf_structure = {}
    current_part = None
    current_chapter = None
    current_text = []

    def _flush_chapter():
        # Store the lines gathered so far, but only when a chapter is really
        # open; this avoids writing a spurious ``None`` chapter key.
        if current_part is not None and current_chapter is not None:
            pdf_structure[current_part][current_chapter] = '\n'.join(current_text)

    for page_text in pdf_content or []:
        for line in page_text.strip().split('\n'):
            # Check if the line contains a part number (e.g., "Part 1")
            if line.startswith("Part "):
                _flush_chapter()
                current_part = line.strip()
                current_chapter = None
                current_text = []
                pdf_structure.setdefault(current_part, {})

            # Check if the line contains a chapter number (e.g., "Chapter 1")
            elif line.startswith("Chapter "):
                _flush_chapter()
                if current_part is None:
                    # Chapter heading before any part heading: file it under
                    # the fallback part instead of failing with a KeyError.
                    current_part = default_part
                    pdf_structure.setdefault(current_part, {})
                current_chapter = line.strip()
                current_text = []
                # Placeholder has the same type (str) as the final content.
                pdf_structure[current_part].setdefault(current_chapter, "")

            # If the line is not a part or chapter, assume it's part of the chapter text
            elif current_chapter is not None:
                current_text.append(line)

    # Add the last chapter, which has no following heading to trigger a flush
    _flush_chapter()

    return pdf_structure


def extract_pdf_data(pdf_path):
    """
    Extracts text content from a PDF file.

    Opens the PDF located at 'pdf_path' with PyMuPDF (``fitz``) and reads the
    text of every page in order. The document is always closed again, even
    when reading one of the pages fails.

    Args:
        pdf_path (str): The file path to the PDF to be extracted.

    Returns:
        list: A list of strings, where each string represents the text content of a single page in the PDF.

    Raises:
        Exception: If there is an error while opening or reading the PDF file.
            The original error is attached as ``__cause__``.
    """
    pdf_content = []
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                pdf_content.append(page.get_text())
        finally:
            # Release the file handle whether or not page extraction succeeded.
            pdf_document.close()
    except Exception as e:
        raise Exception(f"Error while extracting PDF data: {str(e)}") from e
    return pdf_content

In [27]:
# load pdf content
# pdf_path = 'crime-and-punishment.pdf'
# pdf_data = extract_pdf_data(pdf_path)
# pdf_structure = create_pdf_structure(pdf_data)

In [36]:
# save pdf as json
# import json
# def save_json(data):
#     json_string = json.dumps(data)

#     # Serialize to a JSON file
#     with open("book.json", "w") as json_file:
#         json.dump(data, json_file)
# # save_json(pdf_structure)


# def read_json(file_name):
#     with open(file_name, "r") as json_file:
#         data = json.load(json_file)
#     return data

# # load pdf from json
# # pdf_data = read_json()

## Utils

In [29]:
def generate_prompt():
    """
    Build the summarization prompt used for every chapter.

    The template has a single placeholder, ``{text}``, which receives the
    material to be summarized, followed by an instruction line and a
    "CONCISE SUMMARY:" cue for the model.

    Returns:
        PromptTemplate: The prompt template with ``text`` as its only input variable.
    """
    summary_template = """Write a breif, concise, coherent , accurate and relevance summary of the following:

    "{text}"
    INSTRUCTION: DO not use "in this passage"


    CONCISE SUMMARY:"""
    return PromptTemplate(input_variables=["text"], template=summary_template)


def preprocessing(text):
    """
    Clean a chapter's text before it is summarized.

    Two substitutions are applied, in this order:
      1. every newline character is deleted (nothing is inserted in its place);
      2. every occurrence of the watermark "Free eBooks at Planet eBook.com"
         is deleted.

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The cleaned text. Empty input is returned unchanged.
    """
    # Nothing to clean for empty input.
    if not len(text):
        return text

    watermark = "Free eBooks at Planet eBook.com"
    return text.replace("\n", '').replace(watermark, "")



def chunking_data(text):
    """
    Split a long text into smaller segments.

    The split is delegated to a ``CharacterTextSplitter`` created with its
    default settings.

    NOTE(review): the splitter runs with library defaults; confirm those
    defaults still produce several chunks for text whose newlines have
    already been removed by ``preprocessing``.

    Args:
        text (str): The long text to be segmented.

    Returns:
        List[str]: The text segments, or an empty list for empty input.
    """
    # Empty input never reaches the splitter.
    if not len(text):
        return []
    return CharacterTextSplitter().split_text(text)



def create_model_object(temperature, model_name, max_token):
    """
    Build the language model client used for summarization.

    Args:
        temperature (float): Sampling temperature forwarded to the model.
        model_name (str): Name or identifier of the model to use.
        max_token (int): Forwarded as the model's ``max_tokens`` setting.

    Returns:
        OpenAI: A LangChain ``OpenAI`` instance configured with the given values.
    """
    model_settings = {
        "temperature": temperature,
        "model_name": model_name,
        "max_tokens": max_token,  # the wrapper's keyword is plural
    }
    return OpenAI(**model_settings)

In [37]:
def generate_summaries(data):
    """
    Generate summaries for the provided data.

    Walks a ``{part: {chapter: text}}`` dictionary, cleans and chunks each
    chapter's text, and asks the language model for a summary. Summarization
    is best-effort per chapter: if a chapter fails, the error is reported and
    an empty summary is stored so the remaining chapters are still processed.

    The model, prompt, and chain do not depend on the chapter, so they are
    created once up front. A failure while creating them is raised to the
    caller rather than being recorded as an empty summary for every chapter.

    Args:
        data (dict): A dictionary containing text data organized by part and chapter numbers.

    Returns:
        dict: A dictionary containing summaries for each chapter, organized by part and
        chapter numbers. Empty (or ``None``) input yields an empty dictionary.
    """
    summary_dict = {}
    if not data:
        return summary_dict

    llm = create_model_object(0.9, "gpt-3.5-turbo-16k", 500)
    chain = LLMChain(llm=llm, prompt=generate_prompt())

    for part_number, chapters in data.items():
        summary_dict[part_number] = {}
        for chapter_number, chapter_text in chapters.items():
            print(f"Processing part: {part_number} and chapter number: {chapter_number}")
            try:
                clean_chapter_text = preprocessing(chapter_text)
                docs = chunking_data(clean_chapter_text)
                output_summary = chain.run(docs)
            except Exception as error:
                # Catch Exception (not a bare except) so Ctrl-C still stops
                # the run, and say why the chapter was skipped.
                print(f"Failed to summarize {part_number} / {chapter_number}: {error}")
                output_summary = ""
            summary_dict[part_number][chapter_number] = output_summary

    return summary_dict


# # pdf chapter wise summaries
# pdf_summaries = generate_summaries(pdf_data)

In [38]:
def transform_json(data):
    """
    Flatten per-chapter values into one list per part.

    Args:
        data (dict): Mapping of part name to a mapping of chapter name to value.

    Returns:
        tuple: ``(data, final_book)`` where ``data`` is the unchanged input and
        ``final_book`` maps each part name to the list of its chapter values,
        in the chapters' iteration order.
    """
    final_book = {}
    try:
        for part_number, chapters in data.items():
            chapter_values = []
            final_book[part_number] = chapter_values
            chapter_values.extend(chapters[name] for name in chapters.keys())
    except ValueError:
        print("Json Value Error")

    return data, final_book

In [32]:
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, PageTemplate, Frame
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import letter, landscape


class PDFUtils:
    """
    A utility class for generating PDF documents with various content.

    Args:
        pdf_file (str): The name of the PDF file to be generated.

    Attributes:
        pdf_template (SimpleDocTemplate): The PDF document template.
        content (list): A list to store the content elements of the PDF.

    Methods:
        page_break(): Create a PageBreak element.
        generate_page_title(title_name="Crime and Punishment Summary"): Generate a title page for the PDF.
        space_between_paragraph(width=1, height=12): Create a Spacer element to add space between paragraphs.
        paragraph_style(name, font_size, alignment, space_after, fontName="Times-Roman"): Define a custom ParagraphStyle.
        generate_summary_pdf(data): Generate a PDF document containing summary content.
    """

    def __init__(self, pdf_file="Book_summary"):
        """
        Initializes the PDF template and content list.

        Args:
            pdf_file (str): The name of the PDF file to be generated.
        """
        # Landscape letter page; margins are given in points (72 points = 1 inch).
        page_width, page_height = landscape(letter)
        left_margin = 1.5 * 72
        right_margin = 1.5 * 72
        top_margin = 1 * 72
        bottom_margin = 1.5 * 72
        self.pdf_template = SimpleDocTemplate(pdf_file, pagesize=(page_width, page_height),
                                              leftMargin=left_margin, rightMargin=right_margin,
                                              topMargin=top_margin, bottomMargin=bottom_margin)
        self.content = []

    def page_break(self):
        """
        Create a PageBreak element.

        Returns:
            PageBreak: A PageBreak element.
        """
        return PageBreak()

    def generate_page_title(self, title_name="Crime and Punishment Summary"):
        """
        Generate a title page for the PDF.

        Args:
            title_name (str): The title text for the page.

        Returns:
            list: A list of content elements for the title page
            (leading spacer, title paragraph, page break).
        """
        title_page_style = getSampleStyleSheet()['Title']
        title_page = Paragraph(title_name, title_page_style)
        content = [self.space_between_paragraph(10, 100), title_page, self.page_break()]
        return content

    def space_between_paragraph(self, width=1, height=12):
        """
        Create a Spacer element to add space between paragraphs.

        Args:
            width (int): The width of the spacer.
            height (int): The height of the spacer.

        Returns:
            Spacer: A Spacer element.
        """
        return Spacer(width, height)

    def paragraph_style(self, name, font_size, alignment, space_after, fontName="Times-Roman"):
        """
        Define a custom ParagraphStyle.

        Args:
            name (str): The name of the ParagraphStyle.
            font_size (int): The font size.
            alignment (int): The text alignment (0=left, 1=center, 2=right, 4=justify).
            space_after (int): The space after the paragraph.
            fontName (str, optional): The font name. Defaults to "Times-Roman".

        Returns:
            ParagraphStyle: A custom ParagraphStyle.
        """
        return ParagraphStyle(name=name, fontSize=font_size, alignment=alignment, spaceAfter=space_after,
                              fontName=fontName)

    def generate_summary_pdf(self, data):
        """
        Generate a PDF document containing summary content.

        A title page is emitted first. Each part then gets a centered heading
        followed by its summaries as justified paragraphs, and ends with a
        page break. Generation is best-effort: on failure the error is
        printed (including its reason) and the method returns normally.

        Args:
            data (dict): A dictionary mapping each part name to a list of
                summary strings for that part.

        Returns:
            None
        """
        # Local import keeps this method self-contained; escape() is stdlib.
        from xml.sax.saxutils import escape

        try:
            chapter_style = self.paragraph_style("ChapterTitle", 20, 1, 12)
            paragraph_style = self.paragraph_style("CustomStyle", 12, 4, 20)

            page_title = self.generate_page_title()
            self.content.extend(page_title)

            for part_number, paragraphs in data.items():
                # Paragraph parses XML-like markup, so literal '&', '<' and '>'
                # in titles or summaries must be escaped before rendering.
                chapter_title = Paragraph(escape(part_number), chapter_style)
                self.content.append(chapter_title)
                self.content.append(self.space_between_paragraph(1, 12))
                for paragraph in paragraphs:
                    chapter_content = Paragraph(escape(paragraph), paragraph_style)
                    self.content.extend([chapter_content, self.space_between_paragraph(1, 12)])
                    self.content.extend([self.space_between_paragraph(1, 12)])
                self.content.append(self.page_break())
            self.pdf_template.build(self.content)
            print("Pdf successfully generated")
        except Exception as error:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit propagate, and report the cause.
            print(f"Pdf generation error: {error}")


In [33]:
def driver(pdf_path='crime-and-punishment.pdf'):
    """
    Run the whole pipeline: parse the PDF, summarize it, write the summary PDF.

    Args:
        pdf_path (str): Path of the book PDF to summarize.

    Returns:
        None. The result is written to "final_book_summary.pdf"; nothing is
        written when no summaries were produced.
    """
    # Step 1: Parse pdf
    page_texts = extract_pdf_data(pdf_path)
    book_structure = create_pdf_structure(page_texts)

    # Step 2: Generate summaries
    chapter_summaries = generate_summaries(book_structure)

    # Step 3: Transform the dictionary into {part: [summary, ...]}
    summaries_by_part = []
    if len(chapter_summaries):
        _, summaries_by_part = transform_json(chapter_summaries)

    # Step 4: Generate resultant pdf
    pdf_writer = PDFUtils("final_book_summary.pdf")
    if len(summaries_by_part):
        pdf_writer.generate_summary_pdf(summaries_by_part)
    
    

In [34]:
# Run the full pipeline on the default input ('crime-and-punishment.pdf').
# This sends every chapter to the language model and, if any summaries come
# back, writes "final_book_summary.pdf".
driver()

Processing part: Part I and chapter number: Chapter I
Processing part: Part I and chapter number: Chapter II
Processing part: Part I and chapter number: Chapter III
Processing part: Part I and chapter number: Chapter IV
Processing part: Part I and chapter number: Chapter V
Processing part: Part I and chapter number: Chapter VI
Processing part: Part I and chapter number: Chapter VII
Processing part: Part II and chapter number: Chapter I
Processing part: Part II and chapter number: Chapter II
Processing part: Part II and chapter number: Chapter III
Processing part: Part II and chapter number: Chapter IV
Processing part: Part II and chapter number: Chapter V
Processing part: Part II and chapter number: Chapter VI
Processing part: Part II and chapter number: Chapter VII
Processing part: Part III and chapter number: Chapter I
Processing part: Part III and chapter number: Chapter II
Processing part: Part III and chapter number: Chapter III
Processing part: Part III and chapter number: Chapte