In [1]:
import os
import json
import jsonlines
import numpy as np
from datasets import Dataset

In [2]:
# Directory containing the qrecc JSON split files loaded in this notebook.
root_path = '/home/ye/CQR/datasets/qrecc'

In [3]:
# Load the sampled training split. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(os.path.join(root_path, 'train-sampled10k_fused_ICL_editor_post.json'), "r", encoding="utf-8") as train_file:
    train_lines = json.load(train_file)

In [4]:
# Number of records loaded from the training file.
len(train_lines)

10000

In [5]:
# Load the sampled dev split. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(os.path.join(root_path, 'dev-sampled2k_fused_ICL_editor_post.json'), "r", encoding="utf-8") as dev_file:
    dev_lines = json.load(dev_file)

In [6]:
# Number of records loaded from the dev file.
len(dev_lines)

2000

In [7]:
# Load the test split. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(os.path.join(root_path, 'test_fused_ICL_editor_post.json'), "r", encoding="utf-8") as test_file:
    test_lines = json.load(test_file)

In [8]:
# Number of records loaded from the test file.
len(test_lines)

8209

In [9]:
# Inspect one raw test record to see its fields and values.
test_lines[1]

{'Answer_URL': 'https://www.thepalife.com/how-much-does-it-cost-to-go-to-physician-assistant-pa-school/',
 'Conversation_no': 1,
 'Conversation_source': 'trec',
 'Question': 'What does it cost?',
 'Truth_answer': 'Average Cost Across all PA Schools for the 2017 Application Cycle is as Follows: Average cost of resident tuition for a 27-month physician assistant program is: $71,369. Average cost of non-resident tuition for a 27-month physician assistant program is: $89,975.',
 'Truth_passages': ['http://web.archive.org/web/20200810001136id_/https://www.thepalife.com/how-much-does-it-cost-to-go-to-physician-assistant-pa-school/_p2'],
 'Truth_rewrite': "What does it cost to become a physician's assistant?",
 'Turn_no': 3,
 'NewContext': ["What is a physician's assistant?",
  'physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients',
  'What are the educational requirements required to become one?',
  "Co

In [10]:
# List the field names available on one training record.
train_lines[1].keys()

dict_keys(['Answer_URL', 'Conversation_no', 'Conversation_source', 'Question', 'Truth_answer', 'Truth_passages', 'Truth_rewrite', 'Turn_no', 'NewContext', 'GPT_rewrite', 'Editor_rewrite'])

In [11]:
def _build_source(line):
    """Flatten one record's context and current question into a single tagged string.

    Utterances in ``line['NewContext']`` at even positions are prefixed with
    ``<Que>`` and those at odd positions with ``<Ans>``; ``line['Question']``
    is appended last with a ``<Que>`` prefix. Segments are joined by one space.
    """
    # presumably NewContext alternates question/answer turns - verify against the data
    parts = [
        ("<Que> " if idx % 2 == 0 else "<Ans> ") + uttr
        for idx, uttr in enumerate(line['NewContext'])
    ]
    parts.append("<Que> " + line['Question'])
    return " ".join(parts)


def convert_to_source_target_pairs(lines, rewrite_type):
    """Convert raw records into parallel ``conv_id`` / ``source`` / ``target`` lists.

    Args:
        lines: Sequence of dict records. Each record must provide
            ``Conversation_source``, ``Conversation_no``, ``Turn_no``,
            ``NewContext``, ``Question`` and the rewrite field(s) selected by
            ``rewrite_type``.
        rewrite_type: Name of the record field used as the target, e.g.
            ``'Editor_rewrite'``. If it contains the substring ``"mix"``, every
            record is emitted twice: first with ``Truth_rewrite`` as the target,
            then with ``GPT_rewrite`` as the target.

    Returns:
        A dict with keys ``'conv_id'``, ``'source'`` and ``'target'``, each
        mapping to a list with one entry per emitted example. In "mix" mode all
        ``Truth_rewrite`` examples come first, followed by all ``GPT_rewrite``
        examples.

    Raises:
        KeyError: If a record lacks one of the required fields.
    """
    data = {'conv_id': [], 'source': [], 'target': []}

    # Each pass is (suffix appended to Conversation_source in the id, target field).
    if "mix" in rewrite_type:
        # The second pass appends "2" to the conversation source so its conv_id
        # differs from the id produced for the same record in the first pass.
        passes = [('', 'Truth_rewrite'), ('2', 'GPT_rewrite')]
    else:
        passes = [('', rewrite_type)]

    for source_suffix, target_key in passes:
        for line in lines:
            conv_id = f"{line['Conversation_source']}{source_suffix}_{line['Conversation_no']}_{line['Turn_no']}"
            data['conv_id'].append(conv_id)
            data['source'].append(_build_source(line))
            data['target'].append(line[target_key])
    return data

In [12]:
# Which rewrite field becomes the training target for this export.
rewrite_type = 'Editor_rewrite'
# Convert each split in train/dev/test order and wrap the result in a Dataset.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_dict(convert_to_source_target_pairs(split_lines, rewrite_type))
    for split_lines in (train_lines, dev_lines, test_lines)
)

In [13]:
# Each split is written to <save_folder>/<rewrite_type>/<split name>.
save_folder = '/home/ye/CQR/T5QR/datasets/qrecc_subset'
for split_name, split_dataset in (
    ("train", train_dataset),
    ("dev", dev_dataset),
    ("test", test_dataset),
):
    split_dataset.save_to_disk(f"{save_folder}/{rewrite_type}/{split_name}")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8209 [00:00<?, ? examples/s]

In [14]:
# Spot-check one converted dev example (conv_id, tagged source, target).
dev_dataset[2]

{'conv_id': 'quac_343_5',
 'source': "<Que> When was Fleet Foxes' second album released? <Ans> Helplessness Blues is the second studio album by American indie folk band Fleet Foxes, released on May 3, 2011. <Que> Was it a hit? <Ans> Helplessness Blues was nominated as Best Folk Album at the 2012 Grammy Awards, held February 12, 2012. <Que> Were there any singles from the album? <Ans> The title track, Helplessness Blues was released via free download on January 31, 2011, and the album's fourth track, Battery Kinzie premiered on Zane Lowe's show on March 22, 2011. <Que> Who replaced Tillman? <Ans>  <Que> Are there any other interesting aspects about this article?",
 'target': 'Who replaced Tillman in Fleet Foxes, as mentioned in this article?'}

In [15]:
# Spot-check one converted test example (conv_id, tagged source, target).
test_dataset[24]

{'conv_id': 'trec_4_3',
 'source': '<Que> What was the neolithic revolution? <Ans> The Neolithic Revolution, also called the Agricultural Revolution, marked the transition in human history from small, nomadic bands of hunter-gatherers to larger, agricultural settlements and early civilization. <Que> When did it start and end? <Ans> The Neolithic Revolution—also referred to as the Agricultural Revolution—is thought to have begun about 12,000 years ago. <Que> Why did it start?',
 'target': 'What was the reason for the start of the Neolithic Revolution, which marked the transition in human history from small, nomadic bands of hunter-gatherers to larger, agricultural settlements and early civilization?'}

In [16]:
# dev_dataset[1+11573]

In [17]:
# Row counts of the converted splits, in (train, test, dev) order.
len(train_dataset), len(test_dataset), len(dev_dataset)

(10000, 8209, 2000)

In [18]:
# Display the converted training Dataset's summary repr.
train_dataset

Dataset({
    features: ['conv_id', 'source', 'target'],
    num_rows: 10000
})