In [16]:
import pickle as pkl
import PyPDF2 as pdf
# from dotenv import load_dotenv # pip install python-dotenv
# load_dotenv()   # Set API KEY values from .env file


In [17]:

# The three IPCC AR6 full-report PDFs to process (WGI, WGII and WGIII, going by
# the file names), given as relative paths.
WGI_REPORT_PDF = 'data/raw/IPCC_AR6_WGI_FullReport.pdf'
WGII_REPORT_PDF = 'data/raw/IPCC_AR6_WGII_FullReport.pdf'
WGIII_REPORT_PDF = 'data/raw/IPCC_AR6_WGIII_FullReport.pdf'

# Processing order of the reports; later cells iterate over this list.
files = [WGI_REPORT_PDF, WGII_REPORT_PDF, WGIII_REPORT_PDF]

In [18]:
# Iterate through the files in the list and extract the raw text of every page.
# Takes ~1 min / 100 pages on my macbook.
# NUM_PAGES = 1000


# Build one record per PDF holding its path, its metadata and the extracted
# text of each page (one list entry per page, in page order).
documents = []
for file in files:
    read_pdf = pdf.PdfReader(file)
    # Echo the metadata so the cell output shows which report was opened.
    print(read_pdf.metadata)

    document = dict(filepath=file, metadata=read_pdf.metadata)

    page_texts = []
    for page in read_pdf.pages:
        page_texts.append(page.extract_text())
    document['text_by_page'] = page_texts

    documents.append(document)


{'/Author': 'IPCC AR6 Working Group I', '/CreationDate': "D:20220725145825+02'00'", '/Creator': 'Adobe Acrobat Pro DC (32-bit) 22.1.20169', '/ModDate': "D:20220726075121-04'00'", '/Producer': 'Adobe Acrobat Pro DC (32-bit) 22.1.20169', '/Title': 'Climate Change 2021: The Physical Science Basis'}


In [None]:
# Define preprocessing functions 
import re 

def normalize_whitespace(input):
    """Collapse every run of whitespace in ``input`` into a single space.

    Newlines, tabs and repeated spaces all count as whitespace. Leading and
    trailing whitespace is dropped entirely.

    Args:
        input: The string to normalize.

    Returns:
        The words of ``input`` joined by single spaces (an empty string when
        ``input`` is empty or contains only whitespace).
    """
    # split() without arguments breaks on any whitespace run and never yields
    # empty pieces, so re-joining leaves exactly one space between words.
    words = input.split()
    separator = " "
    return separator.join(words)

def strip_paren_whitespace(input):
    """Tidy whitespace around punctuation and inside brackets.

    Two kinds of span are matched and replaced by their stripped form:

    * a whitespace character immediately followed by one of ``? , . ! "``
      (the whitespace is removed, the punctuation mark is kept);
    * the text enclosed between an opening ``(`` or ``[`` and the next closing
      ``)`` or ``]`` (its leading and trailing whitespace is removed).

    Args:
        input: The string to clean.

    Returns:
        ``input`` with the whitespace described above removed.
    """
    # Raw string: the backslashes must reach the regex engine unchanged. In a
    # normal string literal, sequences such as "\s" and "\(" are invalid string
    # escapes that Python warns about.
    pattern = r'(\s([?,.!"]))|(?<=\[|\()(.*?)(?=\)|\])'
    return re.sub(pattern, lambda found: found.group().strip(), input)

# Clean every page of every document: collapse whitespace first, then tidy the
# whitespace around punctuation and inside brackets. The cleaned pages are
# stored next to the raw ones under 'text_by_page_processed'.
for document in documents:
    cleaned_pages = []
    for raw_page in document['text_by_page']:
        single_spaced = normalize_whitespace(raw_page)
        cleaned_pages.append(strip_paren_whitespace(single_spaced))
    document['text_by_page_processed'] = cleaned_pages


In [None]:
# Try splitting into sentences with nltk based on https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
import nltk
from nltk import sent_tokenize

# Split each cleaned page into sentences; the result is a list (pages) of
# lists (sentences) stored under 'text_by_page_sentences'.
for document in documents:
    sentences_per_page = []
    for cleaned_page in document['text_by_page_processed']:
        sentences_per_page.append(sent_tokenize(cleaned_page))
    document['text_by_page_sentences'] = sentences_per_page


In [None]:
# Extract sentences that carry a parenthesised confidence label, e.g.
# "... (high confidence).", and split each into the statement text and the
# confidence rating.
#
# What the pattern accepts:
#   * the sentence must start with an uppercase letter (re.match anchors at the
#     beginning of the string);
#   * the label is "(<rating> confidence)" where <rating> is "low", "medium" or
#     "high", with "low"/"high" optionally preceded by one or more "very ";
#   * anything may follow the label as long as a "." comes after it;
#   * the first ".*" is greedy, so when a sentence holds several labels the
#     LAST one is captured and the earlier ones stay inside the text.

# Raw string so that \( \) and \. reach the regex engine unchanged; in a normal
# string literal they are invalid string escapes that Python warns about.
regex = r'([A-Z]+.*)\((((very )*low|(very )*high|low|medium) confidence)\).*\.'
# Compile once instead of passing the pattern string for every sentence.
confidence_pattern = re.compile(regex)

# Parallel lists: entry i of each list describes the same matched sentence.
filenames = []
page_nums = []
sent_nums = []
texts = []
confidence_ratings = []

for filename, document in zip(files, documents):
    for page_num, page in enumerate(document['text_by_page_sentences']):
        for sent_num, sentence in enumerate(page):
            match = confidence_pattern.match(sentence)
            if match is None:
                continue
            # Group 1 is the text before the label; group 3 is the rating
            # without the trailing " confidence" (group 2 includes it).
            text, confidence_rating = match.group(1, 3)
            filenames.append(filename)
            page_nums.append(page_num)  # 0-based index into the page list
            sent_nums.append(sent_num)  # 0-based index within the page
            texts.append(text)
            confidence_ratings.append(confidence_rating)

# Sanity check that the parallel lists stayed in step.
assert len(texts) == len(confidence_ratings)
print(len(confidence_ratings))




1085


In [None]:
# Put in a dataframe and export to CSV.
import pandas as pd
# One row per labelled sentence: where it was found, its text and its rating.
rated_columns = ['filenames', 'page_num', 'sent_num', 'text', 'confidence_rating']
rated_rows = list(zip(filenames, page_nums, sent_nums, texts, confidence_ratings))
sentences_with_ratings_df = pd.DataFrame(rated_rows, columns=rated_columns)

# Written with pandas' default settings, so the row index is included as the
# first column of the CSV.
sentences_with_ratings_df.to_csv('data/text_processing/sentences_with_ratings_04_20.csv')


In [None]:
# Collect ALL sentences (for context retrieval) and put into a csv.

In [None]:
# Flatten every sentence of every page of every document into records of
# (file path, page index, sentence index, sentence text).
sentence_records = [
    (filename, page_num, sent_num, sentence)
    for filename, document in zip(files, documents)
    for page_num, page in enumerate(document['text_by_page_sentences'])
    for sent_num, sentence in enumerate(page)
]

# Unpack the records into the four parallel lists used by the export below.
filenames_all = [record[0] for record in sentence_records]
page_nums_all = [record[1] for record in sentence_records]
sent_nums_all = [record[2] for record in sentence_records]
texts_all = [record[3] for record in sentence_records]

print(len(texts_all))


5642


In [None]:
# One row per sentence, labelled or not, for later context retrieval.
all_columns = ['filenames', 'page_num', 'sent_num', 'text']
all_rows = list(zip(filenames_all, page_nums_all, sent_nums_all, texts_all))
all_sentences_df = pd.DataFrame(all_rows, columns=all_columns)

# Written with pandas' default settings, so the row index is included as the
# first column of the CSV.
all_sentences_df.to_csv('data/text_processing/all_sentences.csv')
