In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os
import pdfplumber
import spacy
import pandas as pd
from transformers import pipeline

# Load NLP model for text processing (spaCy)
nlp = spacy.load("en_core_web_sm")

# Initialize the summarization pipeline using Hugging Face Transformers
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to extract content from the PDF file
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()  # Extracts text from each page
    return text

# Function to summarize the content in manageable chunks
def summarize_large_content(content, chunk_size=1000):
    # Split content into chunks of chunk_size
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]

    summaries = []
    for chunk in chunks:
        try:
            # Summarize each chunk
            summary = summarizer(chunk, max_length=500, min_length=100, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {e}")
            summaries.append(chunk)  # If error, append the original chunk

    # Combine all summaries into one
    full_summary = " ".join(summaries)
    return full_summary

# Function to load data into a structured format (e.g., CSV)
def load_to_csv(data, filename="journal_insights.csv"):
    current_directory = os.getcwd()  # Get the current working directory
    full_path = os.path.join(current_directory, filename)

    df = pd.DataFrame(data, columns=["Title", "PDF_Path", "Summary"])
    df.to_csv(full_path, index=False)

    print(f"Data saved to {full_path}")  # Print the full path

# Main ETL Process
def etl_pipeline(pdf_path, title):
    print(f"\nExtracting content from: {title}")

    # Extract full content from PDF
    content = extract_text_from_pdf(pdf_path)

    if content:
        print("Summarizing the extracted content...")

        # Summarize the content (handling large text)
        summary = summarize_large_content(content)

        # Load data to CSV
        load_to_csv([[title, pdf_path, summary]])

# Path to the PDF file
pdf_path = "/content/coffee.pdf"  # Replace with your PDF file path
pdf_title = "Is Coffee Harmful? If Looking for Longevity, Say Yes to the Coffee, No to the Sugar"  # Set the appropriate title

# Run ETL pipeline
etl_pipeline(pdf_path, pdf_title)


Device set to use cuda:0



Extracting content from: Is Coffee Harmful? If Looking for Longevity, Say Yes to the Coffee, No to the Sugar


Your max_length is set to 500, but your input_length is only 334. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=167)


Summarizing the extracted content...


Your max_length is set to 500, but your input_length is only 338. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=169)
Your max_length is set to 500, but your input_length is only 352. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=176)
Your max_length is set to 500, but your input_length is only 291. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=145)
Your max_length is set to 500, but your input_length is only 320. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

Data saved to /content/journal_insights.csv
