In [92]:
import PyPDF2 as pdf

# Input PDFs (the WGI, WGII and WGIII full-report files), in processing order.
# Paths are relative to the directory the notebook is run from.
files = [
    'data/raw/IPCC_AR6_WGI_FullReport.pdf',
    'data/raw/IPCC_AR6_WGII_FullReport.pdf',
    'data/raw/IPCC_AR6_WGIII_FullReport.pdf',
]

In [93]:
# Extract the raw text of every page of each PDF listed in `files`.
# Takes ~1 min / 100 pages on my macbook.
# NUM_PAGES = 1000


documents = []
for file in files:
    read_pdf = pdf.PdfReader(file)
    pdf_metadata = read_pdf.metadata
    print(pdf_metadata)
    # One record per PDF: source path, document metadata, and the
    # extract_text() result of each page, in page order.
    documents.append({
        'filepath': file,
        'metadata': pdf_metadata,
        'text_by_page': [pdf_page.extract_text() for pdf_page in read_pdf.pages],
    })


{'/Author': 'IPCC AR6 Working Group I', '/CreationDate': "D:20220725145825+02'00'", '/Creator': 'Adobe Acrobat Pro DC (32-bit) 22.1.20169', '/ModDate': "D:20220726075121-04'00'", '/Producer': 'Adobe Acrobat Pro DC (32-bit) 22.1.20169', '/Title': 'Climate Change 2021: The Physical Science Basis'}


In [74]:
# Save the extracted text as a checkpoint since extraction can take a while.
import os
import pickle

checkpoint_path = 'data/text_processing/raw_text_allp.pickle'
# Create the output directory on first run; otherwise open() raises FileNotFoundError.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
# Plain "wb" is enough: the handle is only written to, never read back.
with open(checkpoint_path, "wb") as output:
    pickle.dump(documents, output)

In [75]:
# View original text (uncomment to print every extracted page)

# for document in documents:
#     for page in document['text_by_page']:
#         print(page)

In [76]:
# Define preprocessing functions 
import re 

def normalize_whitespace(input):
    """Collapse every run of whitespace (newlines, tabs, repeated spaces) to one space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = input.split()
    single_spaced = " ".join(words)
    return single_spaced

def strip_paren_whitespace(input):
    """Tighten spacing inside brackets and before punctuation.

    Two kinds of spans are rewritten, each by stripping its surrounding whitespace:

    * a single whitespace character followed by one of ? , . ! " loses the
      whitespace, e.g. "warming , likely" -> "warming, likely";
    * the text between an opening ( or [ and the nearest following ) or ]
      loses its leading/trailing whitespace, e.g. "( high confidence )" ->
      "(high confidence)".

    Args:
        input: The text to clean. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)

    Returns:
        The cleaned string.
    """
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (\s, \[, \(, ...), which trigger DeprecationWarning/SyntaxWarning on modern Python.
    pattern = r'(\s([?,.!"]))|(?<=\[|\()(.*?)(?=\)|\])'
    return re.sub(pattern, lambda match: match.group().strip(), input)

def _clean_page(raw_page):
    """Collapse whitespace first, then tighten bracket/punctuation spacing."""
    return strip_paren_whitespace(normalize_whitespace(raw_page))


# Store a cleaned copy of every page next to the raw text of each document.
for document in documents:
    document['text_by_page_processed'] = list(map(_clean_page, document['text_by_page']))


In [77]:
# Try splitting into sentences with nltk based on https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
import nltk
from nltk import sent_tokenize

# Split each cleaned page into a list of sentences (one list per page).
for document in documents:
    processed_pages = document['text_by_page_processed']
    document['text_by_page_sentences'] = list(map(sent_tokenize, processed_pages))


In [91]:
# Spot-check the tokenizer: for every document, print each sentence found on the
# pages in the slice [-1000:-900] (a window of up to 100 pages counted from the end).
for document in documents:
    sample_pages = document['text_by_page_sentences'][-1000:-900]
    for sentence in (s for page_sentences in sample_pages for s in page_sentences):
        print(sentence)

WGIThe Physical Science BasisClimate Change 2021 Working Group I Contribution to the Sixth Assessment Report of the Intergovernmental Panel on Climate Change
Climate Change 2021 The Physical Science Basis Working Group I Contribution to the Sixth Assessment Report of the Intergovernmental Panel on Climate Change Edited by Valérie Masson-Delmotte Panmao Zhai Co-Chair Working Group I Co-Chair Working Group I Anna Pirani Sarah L. Connors Clotilde Péan Head of TSU Head of Science Team Head of Operations Yang Chen Leah Goldfarb Melissa I. Gomis Senior Science officer Senior Science officer Senior Science officer J.B. Robin Matthews Sophie Berger Mengtian Huang Senior Science officer Science Officer Science Officer Ozge Yelekçi Rong Yu Baiquan Zhou Science Officer Science Officer Science Officer Elisabeth Lonnoy Thomas K. Maycock Tim Waterfield Project Assistant Science Editor IT Officer Katherine Leitzell Nada Caud Communication Manager Outreach Manager Working Group I Technical Support Uni

In [84]:
# Extract sentences that carry a confidence label in parentheses, e.g.
# "... (high confidence).", and split each into the statement text and the label.
#
# Matching rules (the pattern is applied with .match(), so it is anchored at the
# start of the sentence only):
#   * the sentence must start with an uppercase ASCII letter;
#   * `text` is greedy, so it runs up to the last "(<label> confidence)" that is
#     still followed by a '.', and keeps any whitespace before the opening paren;
#   * arbitrary text may sit between the label and that later '.'.
#
# Raw string + named groups: the previous non-raw literal relied on invalid escape
# sequences (\(, \), \.), and positional unpacking of groups() was easy to break.
regex = r'(?P<text>[A-Z]+.*)\((?P<rating>(?:very )*low|(?:very )*high|low|medium) confidence\).*\.'
confidence_pattern = re.compile(regex)  # compile once instead of per sentence

# Parallel lists: entry i of each list describes the same matched sentence.
filenames = []
page_nums = []
sent_nums = []
texts = []
confidence_ratings = []

for filename, document in zip(files, documents):
    for page_num, page in enumerate(document['text_by_page_sentences']):
        for sent_num, sentence in enumerate(page):
            match = confidence_pattern.match(sentence)
            if match is None:
                continue
            filenames.append(filename)
            page_nums.append(page_num)    # 0-based index into the document's pages
            sent_nums.append(sent_num)    # 0-based index into the page's sentences
            texts.append(match.group('text'))
            confidence_ratings.append(match.group('rating'))
assert len(texts) == len(confidence_ratings)
print(len(confidence_ratings))




4137


In [85]:
# Put in a dataframe and export to CSV.
import pandas as pd

# NUM_PAGES is only defined when a page limit is configured (its assignment is
# commented out in the extraction cell), so referencing it directly raises NameError
# on a fresh kernel. Fall back to 'all', which yields the suffix '_allp' and matches
# the 'raw_text_allp.pickle' checkpoint naming.
num_pages_label = globals().get('NUM_PAGES') or 'all'

sentences_with_ratings_df = pd.DataFrame(
    {
        'filenames': filenames,
        'page_num': page_nums,
        'sent_num': sent_nums,
        'text': texts,
        'confidence_rating': confidence_ratings,
    }
)

sentences_with_ratings_df.to_csv('data/text_processing/sentences_with_ratings_{}p.csv'.format(num_pages_label))
sentences_with_ratings_df.head(30)


Unnamed: 0,filenames,page_num,sent_num,text,confidence_rating
0,data/raw/IPCC_AR6_WGI_FullReport.pdf,20,22,"Since 2011 (measurements reported in AR5), con...",high
1,data/raw/IPCC_AR6_WGI_FullReport.pdf,21,8,Mid-latitude storm tracks have likely shifted ...,medium
2,data/raw/IPCC_AR6_WGI_FullReport.pdf,21,18,The average rate of sea level rise was 1.3 [0....,high
3,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...",very high
4,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,4,Temperatures during the most recent decade (20...,medium
5,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,5,"Prior to that, the next most recent warm perio...",medium
6,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,7,Late summer Arctic sea ice area was smaller th...,medium
7,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,8,The global nature of glacier retreat since the...,medium
8,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,10,The global ocean has warmed faster over the pa...,medium
9,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,11,A long-term increase in surface open ocean pH ...,high


In [86]:
# TODO: handle other cases:
#   - Multiple confidence labels within one sentence, in parentheses (another common case):
#     split into the sentence (without confidence labels) and the labels.
#   - Confidence label included in the text itself (eg '... with low confidence, and').
#   - Confidence label located outside of a sentence, at the end of a paragraph
#     (seems difficult to handle).
# Number of extracted sentences per confidence rating.
sentences_with_ratings_df['confidence_rating'].value_counts()

high         2448
medium       1353
very high     214
low           122
Name: confidence_rating, dtype: int64

In [None]:
# Scrape specifically for very low cases