In [1]:
import PyPDF2
import pandas as pd

# Open the pdf file
pdf1 = PyPDF2.PdfReader('Resources/pubmed.pdf')
pdf2 = PyPDF2.PdfReader('Resources/pubmed2.pdf')
pdfs = [pdf1, pdf2]
# Create a dataframe to store the data from each page
texts=[]
names=[]
count=1
for pdf in pdfs:
    name = f'pdf{count}'
    count+=1
    # Get the number of pages in the pdf file
    num_pages = len(pdf.pages)

    # Iterate over the pages in the pdf file
    for i in range(num_pages):

        # Get the text from the current page
        page = pdf.pages[i]
        text = page.extract_text()

        # Split the text into sections
        sections = text.split('\n\n')
        texts.append(sections[0])
        names.append(name)

# Add the section to the dataframe
pubmed_df = pd.DataFrame({'doc': names, 'page_text': texts})
pubmed_df

Unnamed: 0,doc,page_text
0,pdf1,\n \nSince January 2020 Elsevier has created ...
1,pdf1,RESEARCH\nCharacteristics of online pharmacies...
2,pdf1,illegitimate and in violation of U.S. pharmacy...
3,pdf1,"illegal, unsafe, or misleading activities like..."
4,pdf1,The safety characteristics of all online pharm...
5,pdf1,phone number listed and an offer to speak with...
6,pdf1,without a prescription.22Our results demonstra...
7,pdf1,"marketplace. In the meantime, organizations sh..."
8,pdf2,The new england journal of medicinen engl j me...
9,pdf2,"n engl j med 380;12 nejm.org March 21, 2019 11..."


## Summarizing with T5

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Set up the summarization pipeline
tokenizer = AutoTokenizer.from_pretrained("t5-base", max_input_length = 1024)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Define a function to generate summaries for each section
def generate_summary(text):
    # Generate a summary of the text using the summarization pipelineb
     
    summary = summarizer(text, max_length=80, min_length=25, do_sample=False)[0]['summary_text']
    return summary

2023-11-09 16:47:18.088039: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Add a new column to the dataframe for the summaries
pubmed_df['Summary'] = pubmed_df['page_text'].apply(generate_summary)

  "You have modified the pretrained model configuration to control generation. This is a"
Token indices sequence length is longer than the specified maximum sequence length for this model (1104 > 512). Running this sequence through the model will result in indexing errors


In [4]:
# Specify the file name for the document
output_file = 'Output/Pubmed_Summaries.txt'

# Open the file in write mode and write each 'Summary' to a new line
with open(output_file, 'w') as file:
    for summary in pubmed_df['Summary']:
        file.write(summary + '\n')