In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import itertools

import re
import string
import nltk

In [2]:
# Fetch NLTK's 'punkt' data package up front; the log below shows this leaves an
# already-installed copy as is. Presumably required by nltk.sent_tokenize, which is
# called later in this notebook -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def clean_text(line):
    """ Normalise one line of text for keyword matching

        The line is lower-cased, stripped of ASCII punctuation, ASCII digits and
        newline characters, and finally trimmed of leading/trailing whitespace.
        Whitespace inside the line is kept.

        Parameters
        ----------
        line :  string
                A single line of text

        Returns
        ----------
        clean_line :  string
                      The normalised line of text
    """

    # lower-case first so that later keyword searches are case-insensitive
    lowered = line.lower()

    # delete every ASCII punctuation character in one pass
    punct_table = str.maketrans("", "", string.punctuation)
    without_punct = lowered.translate(punct_table)

    # delete digits (e.g. left over from section enumeration) and newlines
    without_digits = re.sub(r"[0-9\n]", "", without_punct)

    # trim whitespace at both ends of the line
    return without_digits.strip()

In [4]:
def get_risk_benefit_section_name(filepath):
    """ Get the section names that contain keywords 'risk' or 'benefit' from EPAR table of contents

        Only the second page of the PDF is read; it is assumed (not verified)
        to hold the table of contents.

        Parameters
        ----------
        filepath :  string
                    The path to the EPAR pdf file.

        Returns
        ----------
        section_name :  pandas Series of strings
                        The cleaned (see clean_text) lines of the second page that
                        contain the keywords 'risk' or 'benefit', in extraction
                        order. Empty if no line matches.

        Raises
        ----------
        ValueError
            If the PDF has fewer than two pages.
    """

    output_string = StringIO()

    # read text from pdf
    with open(filepath, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # assumption: table of contents is located at 2nd page of document
        # (a default is passed to next() so a short document does not leak StopIteration)
        page = next(itertools.islice(PDFPage.create_pages(doc), 1, 2), None)
        if page is None:
            raise ValueError(
                f"{filepath} has fewer than two pages; cannot read a table of contents from page 2"
            )
        interpreter.process_page(page)

    # create dataframe for table of contents only containing lines that have text
    toc_lines = [ln for ln in output_string.getvalue().splitlines() if re.search("[a-zA-Z]", ln)]
    toc = pd.DataFrame(toc_lines, columns=['text_raw'])

    # clean text
    toc['text_clean'] = toc.text_raw.apply(clean_text)

    # search for sections containing keywords 'risk' or 'benefit'
    # (clean_text removes punctuation, so e.g. 'benefit-risk' becomes 'benefitrisk' and still matches)
    toc['relevant_section'] = (toc['text_clean'].str.contains('risk'))|(toc['text_clean'].str.contains('benefit'))

    return toc.loc[toc.relevant_section,'text_clean']
    


In [5]:
def get_relevant_sentences_from_section(section_name, filepath):
    """ Get sentences from the start of the specified section to the end of the document

        The first two pages (title and table of contents) are skipped. The section
        start is the first text line whose cleaned form (see clean_text) contains
        `section_name`; everything after that line, up to the end of the document,
        is split into sentences. No attempt is made to detect where the section ends.

        Parameters
        ----------
        section_name :  string
                        The name of the section, in cleaned form (lower case, no
                        punctuation or digits) since it is matched against cleaned
                        lines. It is passed to pandas `str.contains`, which treats
                        it as a regular expression by default.
        filepath :      string
                        The path to the EPAR pdf file.

        Returns
        ----------
        section_sentences :  pandas DataFrame
                             One row per sentence, in a single column 'sentence'

        Raises
        ----------
        IndexError
            If no line of the document matches `section_name`.
    """

    output_string = StringIO()

    # read text from pdf
    with open(filepath, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # skip title and table of contents page
        for page in itertools.islice(PDFPage.create_pages(doc), 2, None):
            interpreter.process_page(page)

    # create dataframe for document text, keeping only lines that contain letters
    doc_lines = [ln for ln in output_string.getvalue().splitlines() if re.search("[a-zA-Z]", ln)]
    df_text = pd.DataFrame(doc_lines, columns=['text_raw'])

    # clean text
    df_text['text_clean'] = df_text.text_raw.apply(clean_text)

    # search for relevant sections
    df_text['is_relevant_section_title'] = (df_text['text_clean'].str.contains(section_name))

    # search for start of relevant section - is equal to index of first appearance of relevant section plus 1
    title_rows = df_text.loc[df_text.is_relevant_section_title,'text_clean'].index
    if len(title_rows) == 0:
        raise IndexError(f"section {section_name!r} not found in {filepath}")
    sec_start = title_rows[0]+1

    # The lines no longer carry their line breaks, so join them with a single space;
    # joining with "" would fuse the last word of a line with the first word of the next.
    relevant_text = " ".join(line.strip() for line in df_text.loc[sec_start:,'text_raw'])

    relevant_sentences = pd.DataFrame(nltk.sent_tokenize(relevant_text), columns=['sentence'])

    return relevant_sentences

In [6]:
# PDF files to process; the loop that consumes this list reads each one from the data/ directory
filenames = ['WC500057122.pdf', 'WC500135744.pdf']

In [7]:
# read and save sentences for every file in `filenames`

for file in filenames:

    filepath = f'data/{file}'
    # only the first table-of-contents entry mentioning 'risk' or 'benefit' is used
    section_name = get_risk_benefit_section_name(filepath).iloc[0]
    sentences = get_relevant_sentences_from_section(section_name, filepath)
    # Remove the '.pdf' suffix explicitly: str.strip('.pdf') strips any of the characters
    # '.', 'p', 'd', 'f' from BOTH ends of the name, not the extension as such.
    file_stem = file[:-len('.pdf')] if file.endswith('.pdf') else file
    sentences.to_csv(f'data/sentences_{file_stem}.csv', index=False)