# 0. Installing Transformers and Importing Dependencies

In [1]:
#!pip install transformers

In [None]:
from transformers import pipeline
summarizer = pipeline("summarization")

In [2]:
from transformers import pipeline

# 1. Load Summarization Pipeline

In [3]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


# 2. Summarize Text

In [4]:
ARTICLE = """
You don’t always have to give your boss the finger
Maybe it’s your first day on the job. Perhaps your manager just made an announcement. You’ve been asked to scan your fingerprint every time you clock in and out. Is that even allowed?
From Hooters to Hyatt Hotels, employers tantalized by the promise of a futuristic, streamlined way to track workers’ attendance are starting to use time clock machines that fingerprint employees.
Vendors like Kronos and Allied Time say that because the machines are tied to your biometric information — unique characteristics such as your face, fingerprints, how you talk, and even how you walk — they provide a higher level of workplace security and limit employees’ ability to commit “time theft” by punching in for one another.
But the benefits for your boss may come at a cost to you — both your privacy and possibly your health.
With the global outbreak of COVID-19, your personal health could be at risk when using frequently touched screens and fingerprint scanners. The Centers for Disease Control says that coronavirus can remain on surfaces for hours, so screens and scanners should be regularly disinfected with cleaning spray or wipes. And you should wash your hands for 20 seconds or use alcohol-based hand sanitizer immediately after using one.
In addition to these health concerns, critics argue that biometric devices pose massive personal security issues, exposing workers to potential identity theft and subjecting them to possible surveillance from corporations and law enforcement.
In an amicus brief in a case before a federal court of appeals, a group of privacy advocates, including the ACLU and the EFF, wrote that “the immutability of biometric information” puts people “at risk of irreparable harm in the form of identity theft and/or tracking.”
“You can get a new phone, you can change your password, you can even change your Social Security number; you can’t change your face,” said Kade Crockford, the Technology for Liberty program director at ACLU of Massachusetts.
Companies facing legal action over their use of the machines range from fast food joints like McDonald’s and Wendy’s, to hotel chains like Marriott and Hyatt, to airlines like United and Southwest.
In some cases, the companies have countered in the lawsuits that their employees’ union agreement allows the use of the machines: “Southwest and United contend that the plaintiffs’ unions have consented — either expressly or through the collective bargaining agreements’ management-rights clauses — and that any required notice has been provided to the unions,” the court’s opinion states.
Other companies have not responded to requests for comment or have said they cannot comment on active litigation.
Privacy and labor laws have lagged behind the shifts in the American workplace. But in some places, you have the right to refuse and even sue.

Biometric Privacy Laws
As the collection and use of biometrics has exploded, lawmakers in three states have responded by passing laws restricting its deployment.
"""

In [5]:
summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)

[{'summary_text': ' Employers are starting to use time clock machines that fingerprint employees . The machines are tied to your unique characteristics such as your face, fingerprints, how you talk, and even how you walk . The Centers for Disease Control says that coronavirus can remain on surfaces for hours .'}]

In [None]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Import GenerativeModel from google.generativeai
from google.generativeai import GenerativeModel

# Set the API key for generative ai
GenerativeModel.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Loading the Gemini Pro model
model = GenerativeModel('gemini-pro')

# Function to summarize the text/PDF
def summarize_pdf(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=100)
    chunks = text_splitter.create_documents([text])

    # Define prompts for summarization
    map_prompt_template = "Please summarize the below pdf file:\nfile:`{text}`\nSummary:"
    final_combine_prompt_template = "Provide a final summary of the entire pdf file with these important points.\nAdd a Generic meaningful Title,\nStart the precise summary with an introduction and provide the\nsummary in number points for the pdf file.\nfile: `{text}`"

    # Load summarization chain with prompts and model
    summary_chain = load_summarize_chain(
        llm=model,
        chain_type='map_reduce',
        map_prompt=map_prompt_template,
        combine_prompt=final_combine_prompt_template,
        verbose=False
    )

    # Run summarization chain on chunks of text
    output = summary_chain.run(chunks)
    return output

# Streamlit app structure
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    pdf_reader = PdfReader(uploaded_file)
    text = ""
    
    # Extract text from each page of PDF file
    for page in pdf_reader.pages:
        text += page.extract_text()

    if st.button("Show Summary"):
        summary = summarize_pdf(text)
        st.write("Summary:", summary)


In [None]:
import os
import PyPDF2
from functools import reduce
from google.generativeai import genai
from google.generativeai import GenerationConfig
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.
os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

model = genai.GenerativeModel('gemini-pro')

generation_config = GenerationConfig(
    temperature=0.7,
    top_p=1.0,
    top_k=32,
    candidate_count=1,
    max_output_tokens=1024,
)

# Define model_with_limit_and_backoff function here

def summarize_pdf(uploaded_file):
    CHUNK_SIZE = 2  # number of overlapping pages

    reader = PyPDF2.PdfReader(uploaded_file)
    pages = reader.pages

    initial_summary = []

    for i in range(0, len(pages), CHUNK_SIZE):
        pages_to_merge = [x for x in range(i, i + CHUNK_SIZE) if x < len(pages)]

        extracted_texts = [pages[x].extract_text() for x in pages_to_merge]

        text = "\n".join(extracted_texts)

        prompt = initial_prompt_template.format(text=text)

        summary = model_with_limit_and_backoff(prompt=prompt, generation_config=generation_config).text

        initial_summary.append(summary)

        if pages_to_merge[-1] == len(reader.pages):
            break

    summary = reduce(initial_summary, final_prompt_template)
    
    return summary


# Streamlit app structure (assuming you're using Streamlit)
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    if st.button("Show Summary"):
        summary = summarize_pdf(uploaded_file)
        st.write("Summary:", summary)


In [None]:
import backoff
import ratelimit
import google.generativeai as genai
from google.generativeai import GenerativeModel
from google.generativeai import GenerationConfig
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.
os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Loading the Gemini Pro model
model = genai.GenerativeModel('gemini-pro')

generation_config = GenerationConfig(
    temperature=0.7,
    top_p=1.0,
    top_k=32,
    candidate_count=1,
    max_output_tokens=1024,
)

CALL_LIMIT = 20  # Number of calls to allow within a period
ONE_MINUTE = 60  # One minute in seconds

# A function to print a message when the function is retrying
def backoff_hdlr(details):
    print(
        "Backing off {} seconds after {} tries".format(
            details["wait"], details["tries"]
        )
    )

@backoff.on_exception(  # Retry with exponential backoff strategy when exceptions occur
    backoff.expo,
    (
        Exception,  # Add exceptions you want to retry on here
    ),
    max_time=300,  # Maximum time for retries in seconds (5 minutes)
    on_backoff=backoff_hdlr,  # Function to call when retrying
)
@ratelimit.limits(  # Limit the number of calls to the model per minute
    calls=CALL_LIMIT, period=ONE_MINUTE
)
def model_with_limit_and_backoff(**kwargs):
    return model.generate_content(**kwargs)


In [None]:
import os
import PyPDF2
from functools import reduce
import backoff
import ratelimit
import google.generativeai as genai
from google.generativeai import GenerativeModel, GenerationConfig
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.
os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Loading the Gemini Pro model
model = genai.GenerativeModel('gemini-pro')

generation_config = GenerationConfig(
    temperature=0.7,
    top_p=1.0,
    top_k=32,
    candidate_count=1,
    max_output_tokens=1024,
)

CALL_LIMIT = 20  # Number of calls to allow within a period
ONE_MINUTE = 60  # One minute in seconds

# A function to print a message when the function is retrying
def backoff_hdlr(details):
    print(
        "Backing off {} seconds after {} tries".format(
            details["wait"], details["tries"]
        )
    )

@backoff.on_exception(  # Retry with exponential backoff strategy when exceptions occur
    backoff.expo,
    (
        Exception,  # Add exceptions you want to retry on here
    ),
    max_time=300,  # Maximum time for retries in seconds (5 minutes)
    on_backoff=backoff_hdlr,  # Function to call when retrying
)
@ratelimit.limits(  # Limit the number of calls to the model per minute
    calls=CALL_LIMIT, period=ONE_MINUTE
)
def model_with_limit_and_backoff(**kwargs):
    return model.generate_content(**kwargs)


def summarize_pdf(uploaded_file):
    CHUNK_SIZE = 2  # number of overlapping pages

    reader = PyPDF2.PdfReader(uploaded_file)
    pages = reader.pages

    initial_summary = []

    for i in range(0, len(pages), CHUNK_SIZE):
        pages_to_merge = [x for x in range(i, i + CHUNK_SIZE) if x < len(pages)]

        extracted_texts = [pages[x].extract_text() for x in pages_to_merge]

        text = "\n".join(extracted_texts)

        initial_prompt_template = "Initial prompt template here"
        final_prompt_template = "Final prompt template here"

        prompt = initial_prompt_template.format(text=text)

        summary = model_with_limit_and_backoff(prompt=prompt, generation_config=generation_config).text

        initial_summary.append(summary)

        if pages_to_merge[-1] == len(reader.pages):
            break

    summary = reduce(initial_summary, final_prompt_template)
    
    return summary


# Streamlit app structure (assuming you're using Streamlit)
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    if st.button("Show Summary"):
        summary = summarize_pdf(uploaded_file)
        st.write("Summary:", summary)


In [None]:
## Original lanchain and vertex code

In [None]:
import streamlit as st
import os
from PyPDF2 import PdfReader # For reading the PDF
import google.generativeai as genai
from google.generativeai import GenerativeModel
from google.generativeai import GenerationConfig
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv()  # take environment variables from .env.

os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Loading the Gemini Pro model
model = genai.GenerativeModel('gemini-pro')

chunks_prompt="""
Please summarize the below pdf file:
file:`{text}'
Summary:
"""

final_combine_prompt='''
Provide a final summary of the entire pdf file with these important points.
Add a Generic meaningful Title,
Start the precise summary with an introduction and provide the
summary in number points for the pdf file.
file: `{text}`
'''

# Function to summarize the text/PDF
# The model's output is treated as a condensed version of the original text, highlighting the main ideas.
def summarize_pdf(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=100)
    chunks = text_splitter.create_documents([text])
    
    map_prompt_template=PromptTemplate(input_variables=['text'],
                                    template=chunks_prompt)
    final_combine_prompt_template=PromptTemplate(input_variables=['text'],
                                             template=final_combine_prompt)
    summary_chain = load_summarize_chain(
    llm= model,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=final_combine_prompt_template,
    verbose=False
    )
    output = summary_chain.run(chunks)
    return output


# Streamlit app structure
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    pdf_reader = PdfReader(uploaded_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

    if st.button("Show Summary"):
        summary = summarize_pdf(text)
        st.write("Summary:", summary)


In [None]:
import re
import urllib
import warnings
from pathlib import Path
import backoff
import pandas as pd
import ratelimit
from tqdm import tqdm
import streamlit as st
import os
from PyPDF2 import PdfReader # For reading the PDF
import google.generativeai as genai
from google.generativeai import GenerativeModel
from google.generativeai import GenerationConfig
from dotenv import load_dotenv


initial_prompt_template = """
    Write a concise summary of the following text delimited by triple backquotes.

    ```{text}```

    CONCISE SUMMARY:
"""

final_prompt_template = """
    Write a concise summary of the following text delimited by triple backquotes.
    Return your response in bullet points which covers the key points of the text.

    ```{text}```

    BULLET POINT SUMMARY:
"""

def summarize_pdf(uploaded_file):
    CHUNK_SIZE = 2  # number of overlapping pages

    # Read the PDF file and create a list of pages
    reader = PyPDF2.PdfReader(pdf_file)
    pages = reader.pages

    # Create an empty list to store the summaries
    initial_summary = []

    # Iterate over the pages and generate a summary for a few pages as one chunk based on `CHUNK_SIZE`
    for i in tqdm(range(len(pages))):
        # Select a list of pages to merge as one chunk
        pages_to_merge = [x for x in range(i, i + CHUNK_SIZE) if x < len(pages)]

        extracted_texts = [text_from_pages[x] for x in pages_to_merge]

        # Concatenate the
        text = "\n".join(extracted_texts)

        # Create a prompt for the model using the concatenated text and a prompt template
        prompt = initial_prompt_template.format(text=text)

        # Generate a summary using the model and the prompt
        summary = model_with_limit_and_backoff(prompt=prompt, generation_config=generation_config,
        stream=True).text

        # Append the summary to the list of summaries
        initial_summary.append(summary)

        # If the last page is reached, break the loop
        if pages_to_merge[-1] == len(reader.pages):
            break
        
        
    # Use defined `reduce` function to summarize the summaries
    summary = reduce(initial_summary, final_prompt_template)

    print(summary)

# Streamlit app structure
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    if st.button("Show Summary"):
        summary = summarize_pdf(uploaded_file)
        st.write("Summary:", summary)

In [None]:
import streamlit as st
import os
from PyPDF2 import PdfReader  # For reading the PDF
from dotenv import load_dotenv
from google.generativeai import GenerativeModel
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain

load_dotenv()  # take environment variables from .env.

os.getenv("GOOGLE_API_KEY")  # Ensure API key is present, used by LangChain internally

# Set the API key for generative ai
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Loading the Gemini Pro model
model = genai.GenerativeModel('gemini-pro')

map_combine_prompt = """
**Summarize each chunk (max 150 tokens):**
file: `{chunk_text}`

**Combine and refine the summaries:**
Add a title based on the key points...
Start with an introduction summarizing the main points...
List the main points in bullet points...
file: `{entire_text}`
"""

# Streamlit app structure
st.set_page_config(page_title="PDF Context and Summary Extractor")
st.header("PDF Context and Summary Extractor")
uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

if uploaded_file:
    pdf_reader = PdfReader(uploaded_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

    if st.button("Show Summary"):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        chunks = text_splitter.create_documents([text])

        map_prompt_template = PromptTemplate(input_variables=["chunk_text"], template=map_combine_prompt)
        # Configure LangChain with Gemini Pro LLM
        summary_chain = load_summarize_chain(
            chain_type="map_reduce",
            map_prompt=map_prompt_template,
            llm= model,  # Explicitly specify Gemini Pro model
            verbose=False,
        )

        # Run the summarized chain with extracted text and prompts
        output = summary_chain.run(chunks)

        # Clean and print the final summary
        final_summary = "\n".join(output["combined_output"][0].splitlines())
        st.write("Summary:", final_summary)