In [1]:
!pip install -r ../requirements.txt

Collecting torch==2.0.1 (from -r ../requirements.txt (line 1))
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1->-r ../requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1->-r ../requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1->-r ../requirements.txt (line 1))
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1->-r ../requirements.txt (line 1))
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1->-r ../requirements.txt (line 1))
  D

In [2]:
# !pip install pypdf

In [23]:
import requests
import pypdf
import re
from langchain_community.retrievers import TFIDFRetriever
from langchain_core.documents import Document
from difflib import SequenceMatcher

def download_pdf(url, output_path):
    response = requests.get(url)
    with open(output_path, 'wb') as file:
        file.write(response.content)

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        pages_text = []
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            pages_text.append(page.extract_text())
    return pages_text

def split_into_paragraphs(text):
    paragraphs = text.split('\n\n')
    return [para.strip() for para in paragraphs if para.strip()]

def create_retriever_from_texts(texts):
    return TFIDFRetriever.from_texts(texts)

def extract_top_paragraphs_from_url(pdf_url, user_input, top_n=5):
    pdf_path = "downloaded.pdf"
    download_pdf(pdf_url, pdf_path)
    pages_text = extract_text_from_pdf(pdf_path)
    
    all_paragraphs = []
    for page_text in pages_text:
        paragraphs = split_into_paragraphs(page_text)
        all_paragraphs.extend(paragraphs)
    
    retriever = create_retriever_from_texts(all_paragraphs)
    result = retriever.invoke(user_input)
    
    # Use difflib to find the most similar paragraphs
    similarities = [(para, SequenceMatcher(None, user_input, para).ratio()) for para in [doc.page_content for doc in result]]
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    top_paragraphs = [para for para, _ in similarities[:top_n]]
    
    return top_paragraphs


In [24]:
# Example usage
pdf_url = "https://pirls2021.org/wp-content/uploads/2024/01/P21_Insights_StudentWellbeing.pdf"
user_input = "Give me an overview about bullying at schools and explore different results per country"
top_paragraphs = extract_top_paragraphs_from_url(pdf_url, user_input)
for i, paragraph in enumerate(top_paragraphs, 1):
    print(f"Paragraph {i}: {paragraph}\n")

Paragraph 1: PIRLS INSIGHTS Series
 
 STUDENT WELL-BEING AND READING ACHIEVEMENT IN PIRLS 2021     7
Student Bullying 
Being subjected to bullying behaviors is a negative experience for students, so is hypothesized to 
be a negative contributor to student well-being. The ten items in the PIRLS 2021 Student Bullying 
scale asked students to report how often they experienced different bullying behaviors. The items 
include bullying behaviors that can occur face-to-face or online. Although frequent experience 
of bullying likely leads to negative affect at school, it is important to acknowledge that the scale 
items do not ask students to report how experiencing the bullying incidents made them feel.
Examination of the results in Exhibit 2 reveals noteworthy patterns and associations 
between student bullying and reading achievement. This exhibit shows the prevalence of bullying 
experiences among fourth-grade students across the PIRLS 2021 participating countries and 
reveals a negative 

In [5]:
import requests
import pypdf
import re
from sentence_transformers import SentenceTransformer, util
import os


def extract_top_paragraphs_from_url(pdf_url, user_input, top_n=2):
    """
    Downloads a PDF from a given URL, extracts text from it, splits the text into paragraphs,
    and returns the top N paragraphs most similar to the user input using Sentence-BERT.

    Args:
        pdf_url (str): The URL of the PDF file to download.
        user_input (str): The input text to compare against the paragraphs in the PDF.
        top_n (int): The number of top similar paragraphs to return. Default is 3.

    Returns:
        list: A list of the top N paragraphs most similar to the user input.
    """
    
    def download_pdf(url, output_path):
        """
        Downloads a PDF from a given URL and saves it to the specified output path.

        Args:
            url (str): The URL of the PDF file to download.
            output_path (str): The local file path to save the downloaded PDF.
        """
        response = requests.get(url)
        with open(output_path, 'wb') as file:
            file.write(response.content)

    def extract_text_from_pdf(pdf_path):
        """
        Extracts text from a PDF file.

        Args:
            pdf_path (str): The path to the PDF file.

        Returns:
            list: A list of strings, each representing the text of a page in the PDF.
        """
        with open(pdf_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            pages_text = []
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                pages_text.append(page.extract_text())
        return pages_text

    def split_into_paragraphs(text):
        """
        Splits text into paragraphs.

        Args:
            text (str): The text to split into paragraphs.

        Returns:
            list: A list of paragraphs.
        """
        paragraphs = text.split('\n\n')
        return [para.strip() for para in paragraphs if para.strip()]
    
    # Determine the local file name based on the URL
    pdf_filename = os.path.basename(pdf_url)
    pdf_path = pdf_filename
    
    # Download the PDF
    download_pdf(pdf_url, pdf_path)
    
    # Extract text from the PDF
    pages_text = extract_text_from_pdf(pdf_path)
    
    # Split text into paragraphs
    all_paragraphs = []
    for page_text in pages_text:
        paragraphs = split_into_paragraphs(page_text)
        all_paragraphs.extend(paragraphs)
    
    # Load the Sentence-BERT model
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    
    # Encode the paragraphs and the user input
    paragraph_embeddings = model.encode(all_paragraphs)
    user_input_embedding = model.encode([user_input])[0]

    # Compute cosine similarities
    similarities = util.pytorch_cos_sim(user_input_embedding, paragraph_embeddings)[0]
    top_indices = similarities.argsort(descending=True)[:top_n]

    # Get the top N paragraphs
    top_paragraphs = [all_paragraphs[i] for i in top_indices]

    return top_paragraphs


In [6]:
# Example usage
pdf_url = "https://pirls2021.org/wp-content/uploads/2024/01/P21_Insights_StudentWellbeing.pdf"
user_input = "Give me an overview about bullying at schools and explore different results per country"
top_paragraphs = extract_top_paragraphs_from_url(pdf_url, user_input)
for i, paragraph in enumerate(top_paragraphs, 1):
    print(f"Paragraph {i}: {paragraph}\n")

Paragraph 1: PIRLS  INSIGHTS  Series 
 STUDENT WELL-BEING AND READING ACHIEVEMENT IN PIRLS 2021     9On average, 88 percent of students across countries reported experiencing bullying “Never 
or Almost Never” or “About Monthly.” Unfortunately, 12 percent of students on average across 
countries reported experiencing bullying “About Weekly,” with six countries having one-fifth of 
students or more in this category. 
The international average results show a strong association between the frequency of 
bullying incidents and average reading achievement. The data indicate that students who face 
more frequent bullying tend to exhibit lower reading achievement. Students subjected to weekly 
bullying encounters demonstrated the lowest average reading achievement compared to the other 
groups across countries (451 scale score points compared to 519 and 495). In some countries, 
a striking average advantage of 100 reading scale score points or more is observed for students 
who reported being 