In [2]:
%cd ../

/home/qwj/code/HDInstruct


In [3]:
import os

# Configure Hugging Face through environment variables: HF_HOME names the local
# cache root and DATASETS_OFFLINE is set to '1' for the `datasets` library.
os.environ.update({
    "HF_HOME": "/home/qwj/hfcache",
    "DATASETS_OFFLINE": "1",
})

In [4]:
import json
import datasets

Six datasets are used in this notebook:

- CoQA (train split)
- SQuAD v2.0 (train split)
- TriviaQA (train split)
- NQ (train split)
- LAMA T-REx
- InternalStates TrueOrFalse

## CoQA

In [8]:
# Download location of the raw CoQA training file (only used in the error message below).
_URL = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
# Fail fast with download instructions when the raw file has not been placed locally.
assert os.path.exists("./data/raw/coqa-train-v1.0.json"), f"Download from {_URL} and save to ./data/raw/"

def generate_coqa_examples(filepath = "./data/raw/coqa-train-v1.0.json"):
    """Yield one QA example per conversational turn of a CoQA JSON file.

    Args:
        filepath: Path to a JSON file whose top-level ``data`` list holds stories,
            each with ``id``, ``story``, ``questions`` and ``answers`` entries.

    Yields:
        dict with the keys ``id`` (``coqa_<story id>_<turn index>``), ``context``
        (the story text), ``question`` (the question's ``input_text``) and
        ``ground_truth`` (a one-element list holding the answer's ``input_text``).
    """
    # Load inside a context manager so the handle is closed before the first yield,
    # and decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    for story in data['data']:
        story_id = story['id']
        context = story['story']
        for i, (q, a) in enumerate(zip(story['questions'], story['answers'])):
            # The train split has no additional answers, so each turn carries
            # exactly one reference answer.
            yield {
                "id": f"coqa_{story_id}_{i}",
                "context": context,
                "question": q['input_text'],
                "ground_truth": [a['input_text']],
            }

In [9]:
# Materialise every CoQA turn so the examples can be counted and saved.
processed_data = list(generate_coqa_examples())
# Display the example count together with the first example as a sanity check.
len(processed_data), processed_data[0]

(108647,
 {'id': 'coqa_3zotghdk5ibi9cex97fepx7jetpso7_0',
  'context': 'The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatica

In [11]:
# Bundle the CoQA examples with provenance metadata (source path, size, column names).
data_with_config = {
    "raw_path": "./data/raw/coqa-train-v1.0.json",
    "num_examples": len(processed_data),
    "columns": ["id", "context", "question", "ground_truth"],
    "data": processed_data,
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/coqa_train.json", "w") as f:
    json.dump(data_with_config, f, indent=4)

## SQuAD

In [15]:
# Download location of the raw SQuAD v2.0 training file (only used in the error message below).
_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
# Fail fast with download instructions when the raw file has not been placed locally.
assert os.path.exists("./data/raw/squad-train-v2.0.json"), f"Download from {_URL} and save to ./data/raw/"

def generate_squad_examples(filepath = "./data/raw/squad-train-v2.0.json"):
    """Yield one example per answerable question of a SQuAD-format JSON file.

    Args:
        filepath: Path to a JSON file whose top-level ``data`` list holds articles
            with ``paragraphs``; each paragraph has a ``context`` and a ``qas`` list.

    Yields:
        dict with the keys ``id`` (``squad_<question id>``), ``context``, ``question``
        and ``ground_truth`` (the distinct answer texts in first-seen order).
        Questions flagged ``is_impossible`` are skipped.
    """
    # Load inside a context manager so the handle is closed before the first yield,
    # and decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                # A missing flag is treated as "answerable" so that files without
                # the 'is_impossible' key do not raise KeyError.
                if qa.get('is_impossible', False):
                    continue
                # dict.fromkeys de-duplicates while keeping first-seen order, so the
                # output does not depend on per-process string-hash randomisation.
                answers = list(dict.fromkeys(a['text'] for a in qa['answers']))
                yield {
                    "id": f"squad_{qa['id']}",
                    "context": context,
                    "question": qa['question'],
                    "ground_truth": answers,
                }

In [16]:
# Materialise every answerable SQuAD question; this rebinds `processed_data`.
processed_data = list(generate_squad_examples())
# Report the size and show the first example as a sanity check.
print(f"Number of examples: {len(processed_data)}")
print(processed_data[0])

Number of examples: 86821
{'id': 'squad_56be85543aeaaa14008c9063', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'ground_truth': ['in the late 1990s']}


In [17]:
# Bundle the SQuAD examples with provenance metadata (source path, size, column names).
data_with_config = {
    "raw_path": "./data/raw/squad-train-v2.0.json",
    "num_examples": len(processed_data),
    "columns": ["id", "context", "question", "ground_truth"],
    "data": processed_data,
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/squad_train.json", "w") as f:
    json.dump(data_with_config, f, indent=4)

## TriviaQA

In [20]:
# Load the 'rc.nocontext' configuration of TriviaQA (train split).
# The cache directory is derived from HF_HOME, which the configuration cell at the
# top of the notebook sets, instead of repeating the absolute path here.
triviaqa_ds = datasets.load_dataset(
    "trivia_qa",
    "rc.nocontext",
    split='train',
    cache_dir=os.path.join(os.environ["HF_HOME"], "datasets"),
)
triviaqa_ds

Using the latest cached version of the dataset since trivia_qa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'rc.nocontext' at /home/qwj/hfcache/datasets/trivia_qa/rc.nocontext/0.0.0/0f7faf33a3908546c6fd5b73a660e0f8ff173c2f (last modified on Wed Mar  6 12:47:32 2024).


Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 138384
})

### Check for duplicate question IDs


In [21]:
from collections import defaultdict

def generate_triviaqa_examples(triviaqa_ds):
    """Merge TriviaQA rows that share a ``question_id`` into one example each.

    Args:
        triviaqa_ds: Iterable of rows; each row is a mapping with ``question_id``,
            ``question`` and ``answer`` (a mapping with ``value`` and ``aliases``).
            The rows are consumed in a single pass, so random access is not needed.

    Yields:
        dict with the keys ``id`` (``triviaqa_<question_id>``), ``question`` (a
        one-element list holding the question text) and ``ground_truth`` (the answer
        value and aliases of all merged rows, de-duplicated, in first-seen order).
        A question id whose rows disagree on the question text is reported with
        ``print`` and skipped.
    """
    # NOTE(review): 'question' is emitted as a list, unlike the plain string used by
    # the other datasets in this notebook; kept as-is to preserve the output schema.
    indices_by_qid = defaultdict(list)
    # Plain dicts act as insertion-ordered sets, so the output order does not depend
    # on per-process string-hash randomisation.
    questions_by_qid = defaultdict(dict)
    answers_by_qid = defaultdict(dict)
    # Everything is collected in one pass so that no row has to be fetched again
    # by index afterwards.
    for i, example in enumerate(triviaqa_ds):
        qid = example['question_id']
        indices_by_qid[qid].append(i)
        questions_by_qid[qid][example['question']] = None
        answer = example['answer']
        answers_by_qid[qid][answer['value']] = None
        for alias in answer['aliases']:
            answers_by_qid[qid][alias] = None
    for qid, indices in indices_by_qid.items():
        all_questions = list(questions_by_qid[qid])
        # Rows sharing an id must agree on the question text; otherwise report and skip.
        if len(all_questions) != 1:
            print(f"question {qid}, {indices} has multiple questions:\n",'\n'.join(all_questions))
            continue
        yield {
            "id": f"triviaqa_{qid}",
            "question": all_questions,
            "ground_truth": list(answers_by_qid[qid]),
        }

# Run the merge over the whole split; any question id with conflicting question
# texts is printed by the generator. The resulting list is saved in the next cell.
shit = list(generate_triviaqa_examples(triviaqa_ds))

Nothing was printed, so the dataset has no conflicting question IDs: every question ID maps to a single question text.

In [22]:
# Bundle the TriviaQA examples with provenance metadata and save them.
# "raw_path" now names the configuration that was actually loaded
# ("rc.nocontext"; it was previously misspelled as "rc.nocontex").
data_with_config = {
    "raw_path": "datasets.load_dataset(\"trivia_qa\", \"rc.nocontext\", split=\"train\")",
    "num_examples": len(shit),
    "columns": ["id", "question", "ground_truth"],
    "data": shit,
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/triviaqa_train.json", "w") as f:
    json.dump(data_with_config, f, indent=4)

## NQ

In [5]:
# Load the train split of NQ-Open.
# The cache directory is derived from HF_HOME, which the configuration cell at the
# top of the notebook sets, instead of repeating the absolute path here.
nq_ds = datasets.load_dataset(
    "nq_open",
    split='train',
    cache_dir=os.path.join(os.environ["HF_HOME"], "datasets"),
)
nq_ds

Using the latest cached version of the dataset since nq_open couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'nq_open' at /home/qwj/hfcache/datasets/nq_open/nq_open/0.0.0/3e24b5c209e8f578bd6f5ee795167a3577674383 (last modified on Fri Mar  1 09:15:18 2024).


Dataset({
    features: ['question', 'answer'],
    num_rows: 87925
})

In [6]:
def generate_nq_examples(nq_ds):
    """Yield one example per row of ``nq_ds``.

    Args:
        nq_ds: Iterable of rows, each a mapping with ``question`` and ``answer``.

    Yields:
        dict with the keys ``id`` (``nq_<row position>``), ``question`` and
        ``ground_truth`` (the row's ``answer`` value, passed through unchanged).
    """
    yield from (
        {
            "id": f"nq_{position}",
            "question": record['question'],
            "ground_truth": record['answer'],
        }
        for position, record in enumerate(nq_ds)
    )

# Materialise the NQ examples; this rebinds `processed_data` for the save cell below.
processed_data = list(generate_nq_examples(nq_ds))

In [7]:
# Bundle the NQ examples with provenance metadata and save them.
data_with_config = {
    "raw_path": "datasets.load_dataset(\"nq_open\", split=\"train\")",
    "num_examples": len(processed_data),
    "columns": ["id", "question", "ground_truth"],
    "data": processed_data,
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/nq_train.json", "w") as f:
    json.dump(data_with_config, f, indent=4)

## LAMA Trex

In [5]:
# Load the 'trex' configuration of LAMA; no split is requested, so the call returns
# a DatasetDict (the displayed output shows a single 'train' split).
lama_ds = datasets.load_dataset('lama', 'trex')
lama_ds

Using the latest cached version of the module from /home/qwj/hfcache/modules/datasets_modules/datasets/lama/430016dd70224564ad385a96e0e4a3f88aeb5beaf4e34a8cf65b390fbc83aed7 (last modified on Mon Mar 11 08:27:49 2024) since it couldn't be found locally at lama, or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['uuid', 'obj_uri', 'obj_label', 'sub_uri', 'sub_label', 'predicate_id', 'sub_surface', 'obj_surface', 'masked_sentence', 'template', 'template_negated', 'label', 'description', 'type'],
        num_rows: 1304391
    })
})

In [6]:
# Keep only the columns needed to rebuild statements (object, subject, predicate id,
# template) and continue in pandas.
df = lama_ds['train'].select_columns(['obj_label', 'sub_label', 'predicate_id', 'template']).to_pandas()
df.head()

Unnamed: 0,obj_label,sub_label,predicate_id,template
0,Northamptonshire,A605 road,P131,[X] is located in [Y] .
1,Northamptonshire,A605 road,P131,[X] is located in [Y] .
2,Northamptonshire,A605 road,P131,[X] is located in [Y] .
3,Northamptonshire,A605 road,P131,[X] is located in [Y] .
4,Northamptonshire,A605 road,P131,[X] is located in [Y] .


In [7]:
# Drop rows that are exact duplicates across the four selected columns; `df` is
# rebound to the de-duplicated frame (its index is not reset).
df = df.drop_duplicates()
# Number of unique rows left.
len(df)

34017

### Roughly 50% true statements; the other 50% are corrupted by substituting a different object from the same predicate

In [8]:
import pandas as pd
import random
# Seed Python's global `random` generator so runs start from a fixed state.
random.seed(0)

In [9]:
# Distinct predicate ids, in order of first appearance.
all_predict_ids = df['predicate_id'].unique()
# For each predicate, the pool of distinct object labels that replacements are
# drawn from. The pool is sorted because plain set iteration order depends on
# per-process string-hash randomisation, which would make random.choice over it
# irreproducible even with a fixed seed.
all_obj_labels = {
    key: sorted(set(df.loc[df['predicate_id'] == key, 'obj_label']))
    for key in all_predict_ids
}

In [10]:
# Build one true/false statement per row: each row is either kept as-is (label 1)
# or has its object swapped for another object of the same predicate (label 0).
#
# A private generator seeded with 0 makes this cell idempotent: re-running it on its
# own replays the same draws instead of continuing the global `random` stream.
rng = random.Random(0)

statements = []
labels = []
categories = []

for row in df.itertuples(index=False):
    true_obj = row.obj_label
    obj, label = true_obj, 1
    # Coin flip: 0 -> corrupt the object, 1 -> keep the true object.
    if rng.randint(0, 1) == 0:
        candidates = all_obj_labels[row.predicate_id]
        # Only resample when a different object exists; without this guard the
        # rejection loop below would spin forever for a single-object predicate.
        # In that case the row is kept as a true statement.
        if any(candidate != true_obj for candidate in candidates):
            label = 0
            while True:
                obj = rng.choice(candidates)
                if obj != true_obj:
                    break
    # Fill the subject and object slots of the predicate template.
    statement = row.template.replace('[X]', row.sub_label).replace('[Y]', obj)
    statements.append(statement)
    labels.append(label)
    categories.append(row.predicate_id)

new_df = pd.DataFrame({'statement': statements, 'label': labels, 'category': categories})
new_df.head()

Unnamed: 0,statement,label,category
0,A605 road is located in Northamptonshire .,1,P131
1,Kupreanof Island is located in Alaska .,1,P131
2,Pershing County is located in Pasadena .,0,P131
3,Porcupine Hills is located in Manitoba .,1,P131
4,Minnesota State Highway 36 is located in Minne...,1,P131


In [11]:
# Bundle the generated LAMA T-REx statements with provenance metadata and save them.
data_to_save = {
    "raw_path": "datasets.load_dataset('lama', 'trex')",
    "num_examples": len(new_df),
    "columns": ["statement", "label", "category"],
    "data": new_df.to_dict(orient='records'),
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/lama_trex_train.json", "w") as f:
    json.dump(data_to_save, f, indent=4)

## InternalStates TrueOrFalse

In [12]:
# CSV files of the True/False dataset, one per topic.
file_names = [
    "animals_true_false.csv",
    "companies_true_false.csv",
    "facts_true_false.csv",
    "inventions_true_false.csv",
    "cities_true_false.csv",
    "elements_true_false.csv",
    "generated_true_false.csv"
]

# Full download URL including the scheme, matching the "raw_path" recorded when the
# processed file is saved.
_URL = "https://azariaa.com/Content/Datasets/true-false-dataset.zip"
for file_name in file_names:
    # The message names the directory that is actually checked, so following it
    # makes the assertion pass.
    assert os.path.exists(f"./data/raw/True_or_False/{file_name}"), f"Download from {_URL} and extract to ./data/raw/True_or_False/"

In [14]:
def generate_true_false_examples():
    """Yield one labelled statement per row of every True/False CSV file.

    Reads each name in the module-level ``file_names`` list from
    ``./data/raw/True_or_False/``.

    Yields:
        dict with the keys ``category`` (the CSV file name), ``statement`` and
        ``label`` (both taken from the row's columns of the same name).
    """
    for csv_name in file_names:
        frame = pd.read_csv(f"./data/raw/True_or_False/{csv_name}")
        for _, record in frame.iterrows():
            yield dict(
                category=csv_name,
                statement=record['statement'],
                label=record['label'],
            )

# Sanity check: total number of statements across all CSV files.
len(list(generate_true_false_examples()))

6330

In [15]:
# Materialise the True/False statements, bundle them with provenance metadata and save.
generated_true_false = list(generate_true_false_examples())
data_to_save = {
    "raw_path": "https://azariaa.com/Content/Datasets/true-false-dataset.zip",
    "num_examples": len(generated_true_false),
    "columns": ["category", "statement", "label"],
    "data": generated_true_false,
}

# Create the output directory first so the dump does not fail on a fresh checkout.
os.makedirs("./data/processed_train", exist_ok=True)
with open("./data/processed_train/internal_states_train.json", "w") as f:
    json.dump(data_to_save, f, indent=4)