## Prepare Data for Decontextualization

In [1]:
"""
Get paragraphs (context) from the dataset. 
Format: SQuAD Format (https://github.com/rajpurkar/SQuAD-explorer/tree/master/dataset)
"""
import json

data_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/dev-v1.1.json"
save_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-v1.1.json"

# SQuAD text contains non-ASCII characters; read it explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open(data_path, encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)['data']

# Flatten article -> paragraphs into a single stream of context strings,
# keeping the order in which they appear in the file.
contexts = (
    paragraph['context']
    for passage in dataset
    for paragraph in passage['paragraphs']
)

# Each paragraph gets a 1-based sequential id, stored as a string.
new_dataset = [
    {'context': context, 'id': str(number)}
    for number, context in enumerate(contexts, start=1)
]

to_save = {'data': new_dataset}
with open(save_path, "w", encoding="utf-8") as outfile:
    json.dump(to_save, outfile, indent=4)

In [2]:
"""
Divide the big paragraph file into smaller files so that decontextualization can be checkpointed file by file.
"""
import json  # imported here as well so the cell does not rely on an earlier cell
import math
import os

paragraph_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-v1.1.json"
num_chunks = 20         # Divide the full SQuAD dev set into num_chunks for tracking the progress of decontextualization
save_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-segment"

with open(paragraph_path, encoding="utf-8") as dataset_file:
    # NOTE: despite the variable name, every entry is a paragraph record
    # ({'context', 'id'}) as written by the paragraph-extraction cell.
    questions = json.load(dataset_file)['data']

# Round up so that num_chunks chunks always cover every entry; trailing
# chunks may be shorter (or empty) when the total is not a multiple.
question_chunk = math.ceil(len(questions) / num_chunks)

# exist_ok avoids the separate existence check and makes the cell re-runnable.
os.makedirs(save_path, exist_ok=True)

for i in range(num_chunks):
    start_index = i * question_chunk
    end_index = min(len(questions), (i + 1) * question_chunk)
    new_dataset = {
        'version': str(i + 1),
        'data': questions[start_index:end_index],
    }

    # Chunk files are named 1.json ... <num_chunks>.json
    chunk_file = os.path.join(save_path, str(i + 1) + ".json")
    with open(chunk_file, "w", encoding="utf-8") as outfile:
        json.dump(new_dataset, outfile, indent=4)

## Decontextualizing SQuAD

For further information about Decontextualization, see [Decontextualization: Making Sentences Stand-Alone](https://arxiv.org/abs/2102.05169).

In [12]:
import tensorflow as tf
# Show which TensorFlow version is installed in this runtime.
print(tf.__version__)

2.13.0


In [13]:
from os import path

import tensorflow as tf
import tensorflow_text  # Required to run exported model.

# Which T5 model size to load; must be one of the keys of SAVED_MODELS.
MODEL_SIZE = "base" #@param["base", "3B", "11B"]

DATASET_BUCKET = 'gs://decontext_dataset'

# Location of the exported SavedModel for each model size.
SAVED_MODELS = {
  "base": f'{DATASET_BUCKET}/t5_base/1611267950',
  "3B": f'{DATASET_BUCKET}/t5_3B/1611333896',
  "11B": f'{DATASET_BUCKET}/t5_11B/1605298402'
}

# The model path is derived from MODEL_SIZE only. It must not be reassigned to
# a hard-coded path afterwards, otherwise MODEL_SIZE is silently ignored.
SAVED_MODEL_PATH = SAVED_MODELS[MODEL_SIZE]

# Bucket locations are URLs rather than filesystem paths, so they are joined
# with '/' (same style as SAVED_MODELS) instead of the OS path separator.
DEV = f'{DATASET_BUCKET}/decontext_dev.jsonl'

def load_predict_fn(model_path):
  """Load the exported SavedModel and wrap it in a plain Python callable.

  Args:
    model_path: Location of the SavedModel, passed to `tf.saved_model.load`.

  Returns:
    A function that takes `x`, feeds `tf.constant(x)` to the model's
    'serving_default' signature and returns the 'outputs' entry of the
    result converted with `.numpy()`.
  """
  print("Loading SavedModel in eager mode.")
  model = tf.saved_model.load(model_path, ["serve"])

  def predict(x):
    serving_fn = model.signatures['serving_default']
    result = serving_fn(tf.constant(x))
    return result['outputs'].numpy()

  return predict

# Load the model once when the cell runs; every call below reuses it.
predict_fn = load_predict_fn(SAVED_MODEL_PATH)

def decontextualize(input):
  """Run the model on one input string and return its decoded output.

  Args:
    input: A single model input string.

  Returns:
    The first element of the prediction for the one-element batch
    `[input]`, decoded from UTF-8 bytes to `str`.
  """
  batch_outputs = predict_fn([input])
  first_output = batch_outputs[0]
  return first_output.decode('utf-8')

Loading SavedModel in eager mode.


2023-07-31 18:00:52.868764: W tensorflow/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".
2023-07-31 18:00:54.858563: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-31 18:00:54.896953: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above a



In [None]:
import spacy
# spaCy pipeline used below for sentence splitting.
nlp = spacy.load("en_core_web_sm")

def segment_context(context):
    """Split a paragraph into sentences with spaCy.

    Args:
        context: Paragraph text.

    Returns:
        List with the string form of every sentence span in `nlp(context).sents`,
        in document order.
    """
    sentences = []
    for span in nlp(context).sents:
        sentences.append(str(span))
    return sentences
def create_input(paragraph, target_sentence_idx, page_title='', section_title=''):
    """Builds one Decontextualization input string for T5.

    Args:
      paragraph: List of strings. Each string is a single sentence.
      target_sentence_idx: Integer index into `paragraph` selecting the
        sentence that should be decontextualized.
      page_title: Optional title string. Usually Wikipedia page title.
      section_title: Optional title of section within page.

    Returns:
      Five fields joined by ' [SEP] ': page title, section title, the
      sentences before the target (space-joined), the target sentence, and
      the sentences after the target (space-joined).
    """
    before = paragraph[:target_sentence_idx]
    after = paragraph[target_sentence_idx + 1:]
    fields = [
        page_title,
        section_title,
        ' '.join(before),
        paragraph[target_sentence_idx],
        ' '.join(after),
    ]
    return ' [SEP] '.join(fields)
def segment_dataset(data_path, save_path):
    """Decontextualize every paragraph of one chunk file, sentence by sentence.

    Each paragraph's context is split into sentences. The first sentence is
    stored unchanged; every later sentence is replaced by the output of
    `decontextualize` for that sentence within its paragraph (no page or
    section title is supplied). One record per sentence is written, with id
    '<paragraph id>_<sentence index>'.

    Args:
        data_path: JSON file whose 'data' list holds records with 'id' and
            'context'.
        save_path: Output JSON file; receives {'data': [...]} with one
            {'id', 'context'} record per sentence.
    """
    with open(data_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)['data']

    new_dataset = []
    for para in dataset:
        sentences = segment_context(para['context'])
        # Iterating (instead of indexing sentences[0]) means a paragraph for
        # which sentence splitting returns nothing, e.g. an empty context,
        # simply contributes no records rather than aborting the whole chunk.
        for i, sentence in enumerate(sentences):
            if i == 0:
                # The first sentence has no preceding context and is kept as is.
                text = sentence
            else:
                text = decontextualize(create_input(sentences, i, "", ""))
            new_dataset.append({'id': para['id'] + "_" + str(i), 'context': text})

    to_save = {'data': new_dataset}
    with open(save_path, 'w', encoding="utf-8") as save_file:
        json.dump(to_save, save_file, indent=4)
    return

In [None]:
# Takes around 100 mins using A100 from Google Colab
original_dir = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-segment-original"        # Path to context to decontextualize
target_dir = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-segment-target"            # Save path

# NOTE(review): the chunking cell writes its files to ".../Data/paragraph-dev-segment",
# not to original_dir -- confirm the chunks were copied/renamed before running this.

# segment_dataset opens its output with open(..., 'w'), which fails when the
# directory is missing, so make sure it exists before the long-running loop.
os.makedirs(target_dir, exist_ok=True)

for i in range(1, 21):                  # chunk files are named 1.json ... 20.json
    file_name = str(i)+".json"          # Tracking progress
    print("Start {}".format(file_name))
    original_path = os.path.join(original_dir, file_name)
    target_path = os.path.join(target_dir, file_name)
    segment_dataset(original_path, target_path)
    print("Finish {}".format(file_name))

Start 1.json
Finish 1.json
Start 2.json
Finish 2.json
Start 3.json
Finish 3.json
Start 4.json
Finish 4.json
Start 5.json
Finish 5.json
Start 6.json
Finish 6.json
Start 7.json
Finish 7.json
Start 8.json
Finish 8.json
Start 9.json
Finish 9.json
Start 10.json
Finish 10.json
Start 11.json
Finish 11.json
Start 12.json
Finish 12.json
Start 13.json
Finish 13.json
Start 14.json
Finish 14.json
Start 15.json
Finish 15.json
Start 16.json
Finish 16.json
Start 17.json
Finish 17.json
Start 18.json
Finish 18.json
Start 19.json
Finish 19.json
Start 20.json
Finish 20.json


In [None]:
import json
import os
def preprocess_file(file_path):
    """Load one decontextualized chunk and keep only the text after '####'.

    Args:
        file_path: JSON file whose 'data' list holds records with 'id' and
            'context'.

    Returns:
        List of {'id', 'context'} dicts. Each 'context' is the last piece of
        the original context when split on "####" (the whole text when the
        marker is absent), with surrounding whitespace stripped.
    """
    with open(file_path) as dataset_file:
        records = json.load(dataset_file)['data']

    cleaned = []
    for record in records:
        text = record['context'].split("####")[-1].strip()
        cleaned.append({'id': record['id'], 'context': text})
    return cleaned

target_dir = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-segment-target"
save_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/full_decontextualization.json" # Path for saving full_decontextualization

# Concatenate the cleaned records of every chunk file, in chunk order.
full_dataset = []
for i in range(1, 21):
    print(i)            # Tracking progress
    file_path = os.path.join(target_dir, str(i)+".json")
    full_dataset += preprocess_file(file_path)

# Write all records back out as a single {'data': [...]} JSON document.
to_save = {'data': full_dataset}
with open(save_path, "w") as outfile:
    outfile.write(json.dumps(to_save, indent=4))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


## Match the Decontextualized Sentences with questions

In [None]:
"""
Merge Decontextualization (Downloaded from Google Colab)
"""
def merge_context_list(decontextualization_path):
    """Group decontextualized sentences by the paragraph they belong to.

    Args:
        decontextualization_path: JSON file whose 'data' list holds records
            with 'context' and an 'id' of the form
            '<paragraph id>_<sentence index>'.

    Returns:
        Dict mapping paragraph id (str) to a dict of
        {sentence index (str): sentence text}.

    Raises:
        ValueError: If an id contains no underscore.
    """
    with open(decontextualization_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)['data']

    all_context = {}
    for context in dataset:
        # Split on the last underscore only, so a paragraph id that itself
        # contains underscores is still handled.
        first, second = context['id'].rsplit("_", 1)
        # setdefault makes the grouping independent of record order: a
        # paragraph whose sentence '0' is missing or not listed first no
        # longer raises KeyError.
        all_context.setdefault(first, {})[second] = context['context']
    return all_context

def match_question_sentences(decontextualization_path, original_data_path, paragraph_path, save_path):
    """
    Pair every question with each decontextualized sentence of its paragraph.

    A question is matched to the paragraph whose 'context' is identical to the
    question's 'context'. For every sentence of that paragraph one new example
    is written with id '<question id>_<sentence index>', the sentence as
    'context' and empty 'answers'. Questions that cannot be matched are
    skipped and counted; the count is printed.

    INPUT:
        decontextualization_path: JSON file with the decontextualized
            sentences, read through merge_context_list.
        original_data_path: JSON file whose 'data' list holds question records
            with 'id', 'question' and 'context'.
        paragraph_path: JSON file whose 'data' list holds paragraph records
            with 'id' and 'context'.
        save_path: Output JSON file; receives {'data': [...]}.
    """
    context_dict = merge_context_list(decontextualization_path)
    with open(original_data_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)['data']
    with open(paragraph_path, encoding="utf-8") as paragraph_file:
        paragraphs = json.load(paragraph_file)['data']

    # Index paragraphs by their text once, instead of scanning the whole list
    # for every question. If several paragraphs share the same text, the dict
    # keeps the id of the last one.
    paragraph_id_by_context = {p['context']: p['id'] for p in paragraphs}

    new_dataset = []
    count_pass = 0                  # If there is any failure to match question with sentences
    for question in dataset:
        paragraph_id = paragraph_id_by_context.get(question['context'])
        # A paragraph without any decontextualized sentence is treated like an
        # unmatched paragraph rather than raising KeyError.
        p_dict = context_dict.get(paragraph_id) if paragraph_id is not None else None
        if p_dict is None:
            count_pass += 1
            continue
        for key, sentence in p_dict.items():
            new_dataset.append({
                'question': question['question'],
                'id': question['id'] + "_" + key,
                'context': sentence,
                # Answers are intentionally left empty in the generated examples.
                'answers': {'answer_start': [], 'text': []},
            })
    print("Skip {} questions".format(count_pass))

    # Save the result
    to_save = {'data': new_dataset}
    with open(save_path, "w", encoding="utf-8") as outfile:
        json.dump(to_save, outfile, indent=4)
    return

In [None]:
# Match the biased dev set
# Inputs shared with the anti-biased run: decontextualized sentences and paragraph index.
decontextualization_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/full_decontextualization.json"
paragraph_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-v1.1.json"
# Question file to expand and the file that receives the expanded examples.
original_data_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/Shortcut/dev/answer_position.json"
save_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/Shortcut/dev/decontextualized_position.json"
match_question_sentences(
    decontextualization_path=decontextualization_path,
    original_data_path=original_data_path,
    paragraph_path=paragraph_path,
    save_path=save_path,
)

In [None]:
# Match the anti-biased dev set
# Inputs shared with the biased run: decontextualized sentences and paragraph index.
decontextualization_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/full_decontextualization.json"
paragraph_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/paragraph-dev-v1.1.json"
# Question file to expand and the file that receives the expanded examples.
original_data_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/Shortcut/dev/anti_answer_position.json"
save_path = "/Volumes/Share/tran_s2/AgainstShortcut/debias_experiment/Data/Shortcut/dev/decontextualized_anti_position.json"
match_question_sentences(
    decontextualization_path=decontextualization_path,
    original_data_path=original_data_path,
    paragraph_path=paragraph_path,
    save_path=save_path,
)