In [55]:
import pandas as pd

# Clean the Training dataset


In [56]:
full_dataset = pd.read_csv('data/ipcc_statements_dataset.tsv', sep="\t")
train_set = full_dataset[full_dataset["split"]=="train"]
original_sentences = pd.read_csv('data/text_processing/sentences_with_ratings_all.csv')

In [57]:
train_set.head()


Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",high,2,train
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,medium,1,train
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,high,2,train
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,medium,1,train
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...",medium,1,train


# Define acronyms

In [58]:
TEST_SET_ACRONYMS = {
    "GHG": "greenhouse gas",
    "GHGs": "greenhouse gases",
    "CIDs": "climactic impact drivers",
    "OHC": "ocean heat content",
    "GMSL": "global mean sea level",
    "MASL": "meters above sea level",
    "CMIP5": "Coupled Model Intercomparison Project 5",
    "CMIP6": "Coupled Model Intercomparison Project 6",
    "ECS": "equilibrium climate sensitivity",
    "TCR": "transient climate response",
    "SSP": "shared socioeconomic pathway",
    "AR5": "the 5th Assessment Report",
    "ERF": "effective radiative forcing",
    "MPWP": "mid-Pliocene warm period, 3.3 to 3.0 million years ago",
    "EECO": "early Eocene climatic optimum, 50 million years ago",
    "SH": "Southern Hemisphere",
    "NH": "Northern Hemisphere",
    "SROCC": "Special Report on the Ocean and Cryosphere in a Changing Climate",
    "GMST": "global-scale annual mean surface temperature",
    "SST": "sea surface temperatures",
    "NAO/NAM": "North Atlantic Oscillation and Northern Annular Mode",
    "CDR": "carbon dioxide removal",
    "TCRE": "transient climate response to cumulative emissions of carbon dioxide",
    "PM": "particulate matter",
    "SLCFs": "short-lived climate forcers",
    "ERFaci": "effective radiative forcing from cloud–aerosol interactions",
    "INPs": "ice nucleating particles",
    "SRM": "solar radiation management",
    "P–E": "precipitation minus evaporation",
    "HC": "Hadley Circulation",
    "RCP": "representative concentration pathway",
    "RCPs": "representative concentration pathways",
    "WAIS": "West Antarctic Ice Sheet",
    "SR1.5": "Special Report on the impacts of global warming of 1.5 °C above pre-industrial levels",
    "SLE": "sea level equivalent",
    "MICI": "Marine Ice Cliff Instability",
    "ENSO": "El Niño Southern Oscillation",
    "SAH": "Sahara",
    "NEAF": "North Eastern Africa",
    "SEAF": "South Eastern Africa",
    "ESAF": "East Southern Africa",
    "MDG": "Madagascar",
    "SAS": "South Asia",
    "NWS": "North-Western South America",
    "MED": "South Europe and the Mediterranean",
    "SLR": "sea level rise",
    "EBUS": "Eastern Boundary Upwelling Systems",
    "MHWs": "marine heatwaves",
    "HABs": "harmful algal blooms",
    "GWL": "global warming levels",
    "NbS": "nature-based solutions",
    "TCs": "tropical cyclones",
    "RKRs": "representative key risks",
    "CRD": "climate resilient development",
    "NDCs": "nationally determined contributions",
    "COP26": "26th Conference of Parties",
    "DAC": "direct air capture",
    "DACCS": "direct air capture with carbon storage",
    "BECCS": "biomass energy with carbon capture and storage",
    "IPRs": "intellectual property rights",
    "SDGs": "sustainable development goals",
    "AFOLU": "agriculture, forestry, and other land use",
    "GWP100": "global warming potential over the next 100 years",
    "PES": "payment for ecosystem services",
    "BEV": "battery electric vehicles",
    "CCS": "carbon capture and storage",
    "REDD+": "reducing emissions from deforestation and forest degradation in developing countries"
}

TEST_SET_ABBREV = {
    "RCP": "representative concentration pathway",
    "SSP": "shared socioeconomic pathway"
}

# Helper functions

In [59]:
import re

def remove_references_and_sections(text):
    # Remove parenthetical references with unspecified number of references like (Author and Author, YYYY; Author et al., YYYY)
    text = re.sub(r'\((?:[A-Za-z]+(?: and [A-Za-z]+)?(?:,? \d{4})+(?:; )?|[A-Za-z]+ et al\.(?:,? \d{4})+(?:; )?)+\)', '', text)
    # Remove single author parenthetical references
    text = re.sub(r'\([A-Za-z]+, \d{4}\)', '', text)
    # Remove section indicators like {16.2.3.7}
    text = re.sub(r'\{.*?\}', '', text)
    return text

def replace_acronyms(text):
    replaced_acronyms = set()
    for acronym, expansion in TEST_SET_ACRONYMS.items():
        pattern = re.compile(r'\b' + re.escape(acronym) + r'\b')
        if acronym not in replaced_acronyms and expansion not in text and pattern.search(text):
            text = pattern.sub(f"{expansion} ({acronym})", text, count=1)
            replaced_acronyms.add(acronym)
    for acronym, expansion in TEST_SET_ABBREV.items():
        pattern = re.compile(r'\b' + re.escape(acronym) + r'\d')
        if acronym not in replaced_acronyms and expansion not in text and pattern.search(text):
            text = pattern.sub(f"{expansion} {acronym}", text, count=1)
            replaced_acronyms.add(acronym)
    return text



# Apply cleaning to dataset


In [60]:
original_sentences.rename(columns={"Unnamed: 0": "statement_idx"}, inplace=True)
original_sentences.head()

Unnamed: 0,statement_idx,filenames,page_num,sent_num,text,confidence_rating
0,0,data/raw/IPCC_AR6_WGI_FullReport.pdf,20,22,"Since 2011 (measurements reported in AR5), con...",high
1,1,data/raw/IPCC_AR6_WGI_FullReport.pdf,21,8,Mid-latitude storm tracks have likely shifted ...,medium
2,2,data/raw/IPCC_AR6_WGI_FullReport.pdf,21,18,The average rate of sea level rise was 1.3 [0....,high
3,3,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...",very high
4,4,data/raw/IPCC_AR6_WGI_FullReport.pdf,24,4,Temperatures during the most recent decade (20...,medium


In [61]:
train_set = train_set.join(original_sentences["text"], on="statement_idx")

In [62]:
train_set.head()

Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split,text
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",high,2,train,"Since 2011 (measurements reported in AR5), con..."
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,medium,1,train,Mid-latitude storm tracks have likely shifted ...
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,high,2,train,The average rate of sea level rise was 1.3 [0....
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,medium,1,train,Temperatures during the most recent decade (20...
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...",medium,1,train,"Prior to that, the next most recent warm perio..."


In [63]:
train_set['final_statement'] = train_set["statement"].apply(lambda x: replace_acronyms(remove_references_and_sections(x)))

In [64]:
train_set.head()


Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split,text,final_statement
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",high,2,train,"Since 2011 (measurements reported in AR5), con...",Since 2011 (measurements reported in the 5th A...
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,medium,1,train,Mid-latitude storm tracks have likely shifted ...,Mid-latitude storm tracks have likely shifted ...
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,high,2,train,The average rate of sea level rise was 1.3 [0....,The average rate of sea level rise was 1.3 [0....
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,medium,1,train,Temperatures during the most recent decade (20...,Temperatures during the most recent decade (20...
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...",medium,1,train,"Prior to that, the next most recent warm perio...","Prior to that, the next most recent warm perio..."


# Get context

In [65]:
# Load all sentences, including unlabeled (to be used for context)
all_sentences_raw = pd.read_csv('data/text_processing/all_sentences.csv')

In [66]:
all_sentences_raw.head()

Unnamed: 0.1,Unnamed: 0,filenames,page_num,sent_num,text
0,0,data/raw/IPCC_AR6_WGI_FullReport.pdf,0,0,WGIThe Physical Science BasisClimate Change 20...
1,1,data/raw/IPCC_AR6_WGI_FullReport.pdf,1,0,Climate Change 2021 The Physical Science Basis...
2,2,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,0,© 2021 Intergovernmental Panel on Climate Change.
3,3,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,1,Electronic copies of this report are available...
4,4,data/raw/IPCC_AR6_WGI_FullReport.pdf,2,2,Use the following reference to cite the entire...


In [67]:
# Get context for a labeled sentence.
def get_context(report, pg_num, sent_num, 
                n_sentences_before=5, n_sentences_after=2): # Context is more likely to be before, than after.
    # report = row["report"]
    # pg_num = row["page_num"]
    # sent_num = row["sent_num"]
    filename = f"data/raw/IPCC_{report}_FullReport.pdf"

    # print(f"report: {report}, pg_num: {pg_num}, sent_num: {sent_num}, filename: {filename}")
    try:
        # Find the row index of the entry
        filtered_df = all_sentences_raw[all_sentences_raw['filenames'] == filename].reset_index(drop=True)
        # print(filtered_df.shape)
        row_index = filtered_df[(filtered_df['page_num'] == pg_num) &
                                (filtered_df['sent_num'] == sent_num)].index[0]
        # print(row_index)
        
        # Get the indices of the rows before and after the target row
        indices = list(range(max(0, row_index - n_sentences_before), min(row_index + n_sentences_after, len(filtered_df))))
        # print(indices)

        # Concatenate the sentences
        context = " ".join(filtered_df.loc[indices, 'text'])
        return context
    except IndexError:
        print("Entry not found.")
        return None
    except Exception as e:
        print("An error occurred:", e)
        return None


In [68]:
train_set["context"] = train_set.apply(lambda x: get_context(x["report"], x["page_num"], x["sent_num"]), axis=1)

In [None]:
train_set.head()


Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split,text,final_statement,context
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",high,2,train,"Since 2011 (measurements reported in AR5), con...",Since 2011 (measurements reported in the 5th A...,"New climate model simulations, new analyses, a..."
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,medium,1,train,Mid-latitude storm tracks have likely shifted ...,Mid-latitude storm tracks have likely shifted ...,"Additionally, methodological advances and new ..."
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,high,2,train,The average rate of sea level rise was 1.3 [0....,The average rate of sea level rise was 1.3 [0....,It is very likely that human influence has con...
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,medium,1,train,Temperatures during the most recent decade (20...,Temperatures during the most recent decade (20...,"{Cross-Chapter Box 2.3, 3.3.1, 6.4.2, 7.3}Obse..."
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...",medium,1,train,"Prior to that, the next most recent warm perio...","Prior to that, the next most recent warm perio...",8SPM Summary for PolicymakersA.2 The scale of ...


In [None]:
train_set.rename(columns={"text": "original_statement"}, inplace=True)

In [None]:
train_set.head()

Unnamed: 0,statement_idx,report,page_num,sent_num,statement,confidence,score,split,original_statement,final_statement,context
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",high,2,train,"Since 2011 (measurements reported in AR5), con...",Since 2011 (measurements reported in the 5th A...,"New climate model simulations, new analyses, a..."
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,medium,1,train,Mid-latitude storm tracks have likely shifted ...,Mid-latitude storm tracks have likely shifted ...,"Additionally, methodological advances and new ..."
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,high,2,train,The average rate of sea level rise was 1.3 [0....,The average rate of sea level rise was 1.3 [0....,It is very likely that human influence has con...
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,medium,1,train,Temperatures during the most recent decade (20...,Temperatures during the most recent decade (20...,"{Cross-Chapter Box 2.3, 3.3.1, 6.4.2, 7.3}Obse..."
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...",medium,1,train,"Prior to that, the next most recent warm perio...","Prior to that, the next most recent warm perio...",8SPM Summary for PolicymakersA.2 The scale of ...


In [None]:
train_set = train_set[["statement_idx", "report", "page_num", "sent_num", "original_statement", "final_statement", "confidence", "score", "split", "context"]]

In [None]:
train_set.head()

Unnamed: 0,statement_idx,report,page_num,sent_num,original_statement,final_statement,confidence,score,split,context
0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",Since 2011 (measurements reported in the 5th A...,high,2,train,"New climate model simulations, new analyses, a..."
1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,Mid-latitude storm tracks have likely shifted ...,medium,1,train,"Additionally, methodological advances and new ..."
2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,The average rate of sea level rise was 1.3 [0....,high,2,train,It is very likely that human influence has con...
4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,Temperatures during the most recent decade (20...,medium,1,train,"{Cross-Chapter Box 2.3, 3.3.1, 6.4.2, 7.3}Obse..."
5,5,AR6_WGI,24,5,"Prior to that, the next most recent warm perio...","Prior to that, the next most recent warm perio...",medium,1,train,8SPM Summary for PolicymakersA.2 The scale of ...


In [None]:
train_set.to_csv("data/ipcc_train_set_original_cleaned_context.csv")

# Merge Train set and Test set into one file

In [None]:
train_set = pd.read_csv("data/ipcc_train_set_original_cleaned_context.csv")
test_set = pd.read_csv("data/ipcc_test_set_original_cleaned_context.csv")

In [None]:
final_dataset = pd.concat([train_set, test_set])
final_dataset = final_dataset.sort_values(by="statement_idx")
final_dataset = final_dataset.reset_index(drop=True)

In [None]:
final_dataset["has_confidence_in_final_statement"] = final_dataset["final_statement"].apply(lambda x: "confidence" in x)

In [None]:
final_dataset.head()

Unnamed: 0.1,Unnamed: 0,statement_idx,report,page_num,sent_num,original_statement,final_statement,confidence,score,split,context,has_confidence_in_final_statement
0,0,0,AR6_WGI,20,22,"Since 2011 (measurements reported in AR5), con...",Since 2011 (measurements reported in the 5th A...,high,2,train,"New climate model simulations, new analyses, a...",False
1,1,1,AR6_WGI,21,8,Mid-latitude storm tracks have likely shifted ...,Mid-latitude storm tracks have likely shifted ...,medium,1,train,"Additionally, methodological advances and new ...",False
2,2,2,AR6_WGI,21,18,The average rate of sea level rise was 1.3 [0....,The average rate of sea level rise was 1.3 [0....,high,2,train,It is very likely that human influence has con...,False
3,0,3,AR6_WGI,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...","Since 1750, increases in CO2 (47%) and CH4 (15...",very high,3,test,Estimates account for both direct emissions in...,False
4,4,4,AR6_WGI,24,4,Temperatures during the most recent decade (20...,Temperatures during the most recent decade (20...,medium,1,train,"{Cross-Chapter Box 2.3, 3.3.1, 6.4.2, 7.3}Obse...",False


In [None]:
final_dataset.to_csv("data/ipcc_statements_dataset_original_cleaned_context.csv")