In [33]:
"""
Intake the ipcc_statements_dataset.tsv file.
Get the test split only.
Get the context for each sentence and create a new CSV column. 
"""

'\nIntake the ipcc_statements_dataset.tsv file.\nGet the test split only.\nGet the context for each sentence and create a new CSV column. \n'

In [74]:
import pandas as pd


In [75]:
# Read the labeled statements table (extracted sentences with their confidence labels)
full_dataset = pd.read_csv('data/ipcc_statements_dataset.tsv', sep="\t")
# Keep only the rows tagged as the test split and renumber them from zero
is_test_row = full_dataset["split"] == "test"
test_set = full_dataset[is_test_row].reset_index(drop=True)

In [76]:
# Preview the first rows of the test split.
test_set.head()

Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split
0,3,AR6_WGI,24,2,"Since 1750, increases in CO2 (47%) and CH4 (15...",very high,3,test
1,42,AR6_WGI,37,16,"Over the next 2000 years, global mean sea leve...",low,0,test
2,77,AR6_WGI,47,7,"By the end of the century, scenarios with very...",high,2,test
3,81,AR6_WGI,62,2,"Over the past millennium, and especially since...",medium,1,test
4,86,AR6_WGI,63,8,The paleo context supports the assessment that...,high,2,test


In [77]:
# Load every extracted sentence, including unlabeled ones.
# get_context() looks sentences up in this table to build the surrounding context.
all_sentences_raw = pd.read_csv('data/text_processing/all_sentences.csv')

In [78]:
# Preview the first rows of the full sentence table.
all_sentences_raw.head()

Unnamed: 0.1,Unnamed: 0,filenames,page_num,sent_num,text
0,0,data/raw/IPCC_AR6_WGI_FullReport.pdf,0,0,WGIThe Physical Science BasisClimate Change 20...
1,1,data/raw/IPCC_AR6_WGI_FullReport.pdf,1,0,Climate Change 2021 The Physical Science Basis...
2,2,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,0,© 2021 Intergovernmental Panel on Climate Change.
3,3,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,1,Electronic copies of this report are available...
4,4,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,2,Use the following reference to cite the entire...


In [79]:
# Get context for a labeled sentence.
def get_context(report, pg_num, sent_num,
                n_sentences_before=5, n_sentences_after=2,
                sentences=None):
    """Return the text surrounding one labeled sentence, joined by spaces.

    The target sentence is located by (report, page number, sentence number)
    among the sentences of that report, and a window of neighbouring
    sentences (in table order, within the same report file) is concatenated.
    The window always contains the target sentence itself.

    Parameters
    ----------
    report : str
        Report identifier, e.g. "AR6_WGI"; it is expanded to the file name
        "data/raw/IPCC_{report}_FullReport.pdf" used in the 'filenames' column.
    pg_num : int
        Value to match in the 'page_num' column.
    sent_num : int
        Value to match in the 'sent_num' column.
    n_sentences_before : int, default 5
        Number of sentences to include before the target. The default is
        larger than `n_sentences_after` because context is more likely to
        appear before a statement than after it.
    n_sentences_after : int, default 2
        Number of sentences to include after the target.
    sentences : pandas.DataFrame, optional
        Table with 'filenames', 'page_num', 'sent_num' and 'text' columns.
        When omitted, the notebook-level `all_sentences_raw` table is used.

    Returns
    -------
    str or None
        The concatenated context, or None when the sentence cannot be found
        or another error occurs (a message is printed in both cases).
    """
    filename = f"data/raw/IPCC_{report}_FullReport.pdf"

    try:
        if sentences is None:
            # Default to the notebook-level table of all extracted sentences.
            sentences = all_sentences_raw

        # Restrict to this report and renumber, so positions are report-local
        # and the window can never spill into another report's sentences.
        report_sentences = sentences[sentences['filenames'] == filename].reset_index(drop=True)

        is_target = ((report_sentences['page_num'] == pg_num) &
                     (report_sentences['sent_num'] == sent_num))
        # Raises IndexError when nothing matches; handled below.
        row_index = report_sentences.index[is_target][0]

        start = max(0, row_index - n_sentences_before)
        # Slice ends are exclusive, hence the +1: without it the window would
        # hold one sentence fewer after the target than requested (and would
        # drop the target itself when n_sentences_after == 0).
        stop = min(row_index + n_sentences_after + 1, len(report_sentences))

        window = report_sentences['text'].iloc[start:stop]
        # Skip missing text cells instead of letting str.join fail on NaN.
        return " ".join(window.dropna().astype(str))
    except IndexError:
        print("Entry not found.")
        return None
    except Exception as e:
        print("An error occurred:", e)
        return None


In [80]:
def _context_for_row(row):
    """Look up the surrounding context for one labeled test-set row."""
    return get_context(row["report"], row["page_num"], row["sent_num"])


# Add a context passage for every labeled statement in the test split
test_set["context"] = test_set.apply(_context_for_row, axis=1)

In [81]:
# Write the test split, including the new context column, to CSV without the row index.
test_set.to_csv("data/ipcc_test_set_with_context.csv", index=False)

In [82]:
# Inspect the last rows of the test split.
test_set.tail()

Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split,context
295,8024,AR6_WGIII,1827,13,REDD+ can significantly contribute to climate ...,high,2,test,2020; Toxopeus and Polzin 2021; UNEP et al. 20...
296,8056,AR6_WGIII,2011,5,Large-scale bioenergy projects with CCS may be...,medium,1,test,17SM-3 17SM-2 Chapter 17 Supplementary Materia...
297,8067,AR6_WGIII,2012,8,Reforestation and forest restoration can have ...,medium,1,test,Water management for reducing drought and adap...
298,8086,AR6_WGIII,2016,8,BEVs could create jobs associated with the EV ...,low,0,test,"Simultaneoulsy, smart charging of EVs can supp..."
299,8093,AR6_WGIII,2058,21,For pathways that limit warming to 2°C (>67%) ...,high,2,test,Parties to the Paris Agreement decided to repo...


In [83]:
# Spot-check the full context returned for one sentence (report AR6_WGIII, page 2012, sentence 8).
print(get_context("AR6_WGIII", 2012, 8))

Water management for reducing drought and adapting to climate change is important issue (high confidence)± Probably no direct impact (soil–human health nexus through nutritional transfer: may contribute to better nutrient security through quality and nutrient-rich products and better living if higher prof- its and diversified products) (medium confidence)+ Better landscape water balance by influencing the quality and availability of water supply (high confidence)+ Better soil management can lead to improved productivity and thus economic growth (medium confidence)± Low environment footprints, quality and healthy food production and economic and social viability (high confidence)+ Clear climate benefit (high confidence)+ Better sponge function to life in water, and less nutrients into the water (medium confidence)+ Proved beneficial for combating soil degradation and improving soil health and beneficial to biodiversity (high confidence)+ Securing local food production and higher and sta

In [84]:
# Show the last rows of the sentence table that belong to the AR6 WGIII report file.
all_sentences_raw[all_sentences_raw["filenames"]=="data/raw/IPCC_AR6_WGIII_FullReport.pdf"].tail()

Unnamed: 0.1,Unnamed: 0,filenames,page_num,sent_num,text
389307,389307,data/raw/IPCC_AR6_WGIII_FullReport.pdf,2252,0,2025 IndexIndexthermoelectric power generation...
389308,389308,data/raw/IPCC_AR6_WGIII_FullReport.pdf,2253,0,Index2026 Indexenergy sector CO 2 emissions 62...
389309,389309,data/raw/IPCC_AR6_WGIII_FullReport.pdf,2254,0,2027 IndexIndexvehicle emissions standards 138...
389310,389310,data/raw/IPCC_AR6_WGIII_FullReport.pdf,2255,0,Index2028 Indexcircular economy and recycling ...
389311,389311,data/raw/IPCC_AR6_WGIII_FullReport.pdf,2256,0,2029 IndexIndexcity and sub-national networks ...
