In [26]:
# load tydi json and compute stats on the questions and answers
# this works for any data in Kilt-ELI5/LongNQ format
import glob
import gzip
import json
import spacy

# Shared spaCy pipeline used by nlp_count for tokenization and sentence splitting.
nlp = spacy.load('en_core_web_sm')
# nlp_count disables the parser at call time, so a sentencizer is added here
# to keep `doc.sents` available.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fc710d60880>

In [8]:

def load_json_from_file(gt_file_patterns, count=-1):
    """Load a JSON-lines file (optionally gzip-compressed) into a list.

    Args:
        gt_file_patterns: Path to a single file (despite the name, no glob
            expansion happens here). Paths ending in ``gz`` are opened with
            gzip, everything else as plain UTF-8 text.
        count: Maximum number of records to return, counted from the start
            of the file. A negative value (default ``-1``) or ``None`` means
            "read everything".

    Returns:
        A list with one parsed JSON value per non-blank line read.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    opener = gzip.open if gt_file_patterns.endswith('gz') else open
    # A plain slice with -1 would silently drop the last record, so negative
    # values are mapped to "no limit" explicitly.
    limit = None if count is None or count < 0 else count

    data = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with opener(gt_file_patterns, 'rt', encoding='utf-8') as f:
        for line in f:
            if limit is not None and len(data) >= limit:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data


In [28]:
from random import sample
import tqdm

def nlp_count(text):
    """Count tokens and sentences in ``text`` with the module-level ``nlp`` pipeline.

    The parser, tagger and NER components are disabled for the call.

    Returns:
        A ``(token_count, sentence_count)`` tuple.
    """
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])
    # One entry per sentence, holding that sentence's token count.
    tokens_per_sentence = [sum(1 for _ in sent) for sent in doc.sents]
    return sum(tokens_per_sentence), len(tokens_per_sentence)


def compute_stats(data):
    """Print and return aggregate question/answer statistics for a dataset.

    Args:
        data: List of examples. Each example is a dict with an ``'input'``
            question string and an ``'output'`` entry that is either ``None``
            or a list of dicts carrying an ``'answer'`` string.

    Returns:
        A 5-element list:
        ``[number of examples, answers per example, question tokens per
        example, sentences per answer, unanswerable count]``.
        The words-per-answer average is printed but is not part of the list.

    Notes:
        Examples whose ``'output'`` is ``None`` are skipped, but they still
        count in ``len(data)``, which is the denominator of the per-example
        averages. Averages with a zero denominator are reported as ``0.0``.
    """

    def _mean(total, n):
        # Empty datasets, or datasets with no usable answers, must not crash.
        return total / n if n else 0.0

    stats = {
        'q_words': 0,
        'p_words': 0,
        'a_words': 0,
        'a_sentences': 0,
        'a_per_q': 0,
        's_per_p': 0,
        'passages': 0,
        'unanswerable': 0,
        'first_word': {},
    }

    for example in tqdm.tqdm(data):
        question = example["input"]
        annotations = example['output']

        if annotations is None:
            continue

        # Histogram of the first whitespace-delimited word of each question.
        # Empty or whitespace-only questions have no first word and are not counted.
        words = question.split()
        if words:
            q_word = words[0]
            stats['first_word'][q_word] = stats['first_word'].get(q_word, 0) + 1

        token_count, _ = nlp_count(question)
        stats['q_words'] += token_count

        # Passage statistics are currently disabled; 'p_words', 's_per_p' and
        # 'passages' therefore stay 0 and the passage lines below are not printed.
        # if 'passages' in example:
        #     for passage in example['passages']:
        #         token_count, sentence_count = nlp_count(passage['title'] + " " + passage['text'])
        #         stats['p_words'] += token_count
        #         stats['s_per_p'] += sentence_count
        #         stats['passages'] += 1

        for answer in annotations:
            # A missing or empty answer marks the entry as unanswerable.
            if answer['answer'] is None or answer['answer'] == "":
                stats['unanswerable'] += 1
                continue
            token_count, sentence_count = nlp_count(answer['answer'])
            stats['a_words'] += token_count
            stats['a_sentences'] += sentence_count
            stats['a_per_q'] += 1

    n_queries = len(data)
    answers_per_query = _mean(stats['a_per_q'], n_queries)
    words_per_question = _mean(stats['q_words'], n_queries)
    words_per_answer = _mean(stats['a_words'], stats['a_per_q'])
    sentences_per_answer = _mean(stats['a_sentences'], stats['a_per_q'])

    print(f"Queries\t{n_queries}")
    print(f"A per Q\t{answers_per_query}")
    print(f"WORDS in Q\t{words_per_question}")
    print(f"WORDS in A\t{words_per_answer}")
    print(f"SENTENCES in A\t{sentences_per_answer}")
    if stats['passages'] > 0:
        print(f"WORDS in P\t{stats['p_words']/stats['passages']}")
        print(f"S per P\t{stats['s_per_p']/stats['passages']}")
    print(f"Unanswerable\t{stats['unanswerable']}")

    aggregated_stats = [
        n_queries,
        answers_per_query,
        words_per_question,
        sentences_per_answer,
        stats['unanswerable'],
    ]

    # Only report first words that occur more than 10 times.
    for word, occurrences in stats['first_word'].items():
        if occurrences > 10:
            print(f"{word}\t{occurrences}")

    return aggregated_stats


In [9]:
def load_data(data_dir, count=-1):
    """Load every JSON-lines file matching a glob pattern into one list.

    Args:
        data_dir: Glob pattern (despite the name) selecting the files to read.
        count: Forwarded unchanged to ``load_json_from_file`` for each
            matching file, so it applies per file, not to the combined result.

    Returns:
        The concatenated records of all matching files; an empty list when
        nothing matches the pattern.
    """
    data = []
    # glob returns matches in filesystem-dependent order. Sorting keeps the
    # result stable so callers that slice it (e.g. `[:40]`) see the same
    # examples on every run.
    for file_n in sorted(glob.glob(data_dir)):
        data.extend(load_json_from_file(file_n, count))
    return data

In [18]:
# LongNQ: each value is a glob pattern that load_data expands.
train_data_dir = "/dccstor/srosent2/generative/appen/final/longNQ/train/*jsonl"
dev_data_dir = "/dccstor/srosent2/generative/appen/final/longNQ/dev/*jsonl"
test_data_dir = "/dccstor/srosent2/generative/appen/final/longNQ/test/*jsonl"

# print('train')
# compute_stats(load_data(train_data_dir))
print('dev')
# All matching dev files are loaded; only the first 40 examples are summarized.
compute_stats(load_data(dev_data_dir)[:40])
# print('test')
# compute_stats(load_data(test_data_dir))


dev
Queries	40
A per Q	1.375
WORDS in Q	9.35
WORDS in A	41.872727272727275
SENTENCES in A	1.6545454545454545
WORDS in P	155.875
S per P	6.125
Unanswerable	0
what	11


In [19]:
# ASQA: each split is a single file; load_data still passes the path through glob.
train_data_dir = "/dccstor/srosent2/primeqa-mengxia/data/asqa/formatted/ASQA_train.json"
dev_data_dir = "/dccstor/srosent2/primeqa-mengxia/data/asqa/formatted/ASQA_dev.json"
test_data_dir = "/dccstor/srosent2/primeqa-mengxia/data/asqa/formatted/ASQA_test.json"

# print('train')
# compute_stats(load_data(train_data_dir))
print('dev')
# The whole dev file is loaded; only the first 40 examples are summarized.
compute_stats(load_data(dev_data_dir)[:40])
# print('test')
# compute_stats(load_data(test_data_dir))

dev
Queries	40
A per Q	2.0
WORDS in Q	9.625
WORDS in A	70.575
SENTENCES in A	2.9625
Unanswerable	0
Who	20


In [20]:
# ELI5: glob patterns for the train and dev files.
# NOTE: this cell sets no test_data_dir, so uncommenting the 'test' lines below
# would use whatever value an earlier cell left in that name.
train_data_dir = "/dccstor/srosent2/primeqa-mengxia/data/dpr-100passages_withkg_best_all/eli5-train*"
dev_data_dir = "/dccstor/srosent2/primeqa-mengxia/data/dpr-100passages_withkg_best_all/eli5-dev*"

# print('train')
# compute_stats(load_data(train_data_dir))
print('dev')
# All matching dev files are loaded; only the first 40 examples are summarized.
compute_stats(load_data(dev_data_dir)[:40])
# print('test')
# compute_stats(load_data(test_data_dir))

dev
Queries	40
A per Q	13.0
WORDS in Q	16.675
WORDS in A	105.28076923076924
SENTENCES in A	5.476923076923077
WORDS in P	107.0445
S per P	4.443
Unanswerable	0
Why	13


In [3]:
# AquaMuse
from datasets import load_dataset
import pandas as pd
import tqdm 

# Convert every AquaMuse configuration/split into the
# {'input', 'id', 'output': [{'answer', 'meta'}]} layout that compute_stats
# reads, and write one .jsonl file per (configuration, split).
for data_type in ['abstractive', 'extractive']:
    dataset_splits = load_dataset("aquamuse", data_type)
    for split, split_dataset in dataset_splits.items():
        # Start from an empty list for every split. A single shared list
        # would leak the records of earlier splits/configurations into
        # every file written after them.
        json_data = []

        # Iterate rows directly: indexing a column (dataset['query'][i])
        # materializes the whole column on every access.
        for i, row in enumerate(tqdm.tqdm(split_dataset)):
            # dataset conversion
            # query, input_urls, target
            json_data.append({
                'input': row['query'],
                'id': i,
                'output': [{'answer': row['target'], 'meta': {'urls': row['input_urls']}}],
            })

        print(f"dump {split} {data_type}")
        pd.DataFrame.from_dict(json_data).to_json(f"/dccstor/srosent2/generative/external_datasets/aquamuse/{data_type}/{split}.jsonl", orient='records', lines=True)

2023-11-01 22:42:17.971421: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-01 22:42:20.979020: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-01 22:42:20.979098: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-01 22:42:21.029843: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-01 22:42:22.605370: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
100%|██████████| 6253/6253 [08:48<00:00, 11.83it/s]


dump train abstractive


100%|██████████| 811/811 [00:08<00:00, 95.69it/s]


dump test abstractive


100%|██████████| 661/661 [00:05<00:00, 117.82it/s]


dump validation abstractive


100%|██████████| 6253/6253 [08:50<00:00, 11.80it/s]


dump train extractive


100%|██████████| 811/811 [00:08<00:00, 96.86it/s]


dump test extractive


100%|██████████| 661/661 [00:05<00:00, 117.78it/s]


dump validation extractive


In [6]:
# AquaMuse (abstractive) files written by the conversion cell.
# The cells defining load_data and compute_stats must have been run first;
# the recorded NameError comes from running this cell before them.
# NOTE: this cell sets no test_data_dir, so uncommenting the 'test' lines below
# would use whatever value an earlier cell left in that name.
train_data_dir = "/dccstor/srosent2/generative/external_datasets/aquamuse/abstractive/train.jsonl"
dev_data_dir = "/dccstor/srosent2/generative/external_datasets/aquamuse/abstractive/validation.jsonl"

# print('train')
# compute_stats(load_data(train_data_dir))
print('dev')
# The whole validation file is loaded; only the first 40 examples are summarized.
compute_stats(load_data(dev_data_dir)[:40])
# print('test')
# compute_stats(load_data(test_data_dir))

dev


NameError: name 'load_data' is not defined

In [2]:
# Truthful QA
from datasets import load_dataset
import pandas as pd
import tqdm 

# Convert the TruthfulQA 'generation' configuration into the
# {'input', 'id', 'output': [{'answer', 'meta'}]} layout that compute_stats
# reads, and write one .jsonl file per split.
dataset_splits = load_dataset("truthful_qa",'generation')
for split, split_dataset in dataset_splits.items():
    # Start from an empty list for every split so a file never contains
    # records that belong to a previously processed split.
    json_data = []

    # Iterate rows directly: indexing a column (dataset['question'][i])
    # materializes the whole column on every access.
    for i, row in enumerate(tqdm.tqdm(split_dataset)):
        # dataset conversion
        # type, question, correct_answers, source
        json_data.append({
            'input': row['question'],
            'id': i,
            # One output entry per correct answer, each tagged with the row's source.
            'output': [
                {'answer': answer, 'meta': {'urls': [row['source']]}}
                for answer in row['correct_answers']
            ],
        })

    print(f"dump {split}")
    pd.DataFrame.from_dict(json_data).to_json(f"/dccstor/srosent2/generative/external_datasets/truthful_qa/generation/{split}.jsonl", orient='records', lines=True)

100%|██████████| 817/817 [00:07<00:00, 106.93it/s]

dump validation





In [None]:
# NOTE(review): this cell follows the TruthfulQA conversion but still points at
# the AquaMuse abstractive files, the same paths as the earlier AquaMuse stats
# cell. Presumably it was meant to read
# .../external_datasets/truthful_qa/generation/validation.jsonl - TODO confirm.
train_data_dir = "/dccstor/srosent2/generative/external_datasets/aquamuse/abstractive/train.jsonl"
dev_data_dir = "/dccstor/srosent2/generative/external_datasets/aquamuse/abstractive/validation.jsonl"

# print('train')
# compute_stats(load_data(train_data_dir))
print('dev')
# The whole validation file is loaded; only the first 40 examples are summarized.
compute_stats(load_data(dev_data_dir)[:40])
# print('test')
# compute_stats(load_data(test_data_dir))

In [7]:
# expertQA
import glob
import pandas as pd

# Every original-format ExpertQA file whose name starts with "domain_lfqa".
data_dir = glob.glob("/dccstor/srosent2/generative/external_datasets/expertqa/original_format/domain_lfqa*")

for data_split in data_dir:
    json_data = []

    # Split name = file name minus the "domain_lfqa_" prefix and its last
    # five characters (presumably a ".json" extension - verify against the files).
    split = data_split[len("/dccstor/srosent2/generative/external_datasets/expertqa/original_format/domain_lfqa_"):-5]

    data = pd.read_json(data_split, lines=True, orient='records')

    # Re-shape each row into the id / input / output layout used by compute_stats.
    for i, row in data.iterrows():
        json_example = {
            'id': row['example_id'],
            'input': row['question'],
            'output': [{'answer': row['answer'], 'meta': {}}],
        }
        json_data.append(json_example)

    print(f"dump {split}")
    pd.DataFrame.from_dict(json_data).to_json(
        f"/dccstor/srosent2/generative/external_datasets/expertqa/{split}.jsonl",
        orient='records',
        lines=True,
    )

dump train
dump test
dump val


In [1]:
# wikihowqa

import glob
import pandas as pd
import tqdm

corpus = pd.read_csv("/dccstor/srosent2/generative/external_datasets/wikihowQA/original_format/summary.txt", names=['id','summary','answer'],delimiter='\t')   

# Index the corpus once. A boolean scan of the whole corpus for every question
# row costs O(rows x corpus); a dictionary lookup is O(1) per row.
# For ids that occur more than once, the first corpus row wins.
corpus_first = corpus.drop_duplicates(subset='id', keep='first')
passage_by_id = {
    doc_id: (doc_id, summary, answer)
    for doc_id, summary, answer in zip(
        corpus_first['id'].tolist(),
        corpus_first['summary'].tolist(),
        corpus_first['answer'].tolist(),
    )
}
# Ids with more than one corpus row, kept so they can still be reported.
duplicated_ids = set(corpus.loc[corpus['id'].duplicated(keep=False), 'id'].tolist())

data_dir = glob.glob("/dccstor/srosent2/generative/external_datasets/wikihowQA/original_format/*.txt")

for data_split in data_dir:

    json_data = []

    # Split name = file name without the directory prefix and ".txt".
    split = data_split[len("/dccstor/srosent2/generative/external_datasets/wikihowQA/original_format/"):-4]

    # summary.txt is the passage corpus loaded above, not a question split.
    if split == "summary":
        continue

    data = pd.read_csv(data_split, delimiter='\t', names=['input','pid','label'])

    missing_passages = 0
    question_pid_pairs = zip(data['input'].tolist(), data['pid'].tolist())
    for i, (question, pid) in enumerate(tqdm.tqdm(question_pid_pairs, total=len(data))):
        passage = passage_by_id.get(pid)
        if passage is None:
            # No corpus row for this pid: skip the question instead of
            # crashing on an empty selection, and report the total below.
            missing_passages += 1
            continue

        if pid in duplicated_ids:
            print("multiple answers")

        doc_id, summary, answer = passage
        json_data.append({
            # 'id' is the row's position in the split file, even when earlier rows were skipped.
            'id': i,
            'input': question,
            'passages': [{'title': "", 'text': summary, 'sentences': ""}],
            'output': [{'answer': answer, 'meta': {'docid': doc_id}}],
        })

    if missing_passages:
        print(f"skipped {missing_passages} rows in {split} with no matching passage id")
    print(f"dump {split}")
    pd.DataFrame.from_dict(json_data).to_json(f"/dccstor/srosent2/generative/external_datasets/wikihowQA/{split}.jsonl", orient='records', lines=True)


100%|██████████| 904460/904460 [09:44<00:00, 1547.26it/s]


dump train


100%|██████████| 211255/211255 [02:35<00:00, 1360.93it/s]


dump test


100%|██████████| 72474/72474 [00:52<00:00, 1387.13it/s]


dump valid


In [29]:
import glob
import os

# Collect every converted dataset file and print one CSV row of statistics per file.
external_prefix = "/dccstor/srosent2/generative/external_datasets/"
data_dir = "/dccstor/srosent2/generative/external_datasets/**/*.jsonl"

all_data_files = glob.glob(data_dir, recursive=True)
all_data_files.extend(glob.glob("/dccstor/srosent2/generative/appen/final/longNQ/*/*.jsonl"))
all_data_files.extend(glob.glob("/dccstor/srosent2/primeqa-mengxia/data/asqa/formatted/ASQA_*.json"))
all_data_files.extend(glob.glob("/dccstor/srosent2/primeqa-mengxia/data/dpr-100passages_withkg_best_all/eli5-*"))
all_stats = {}

for data_file in all_data_files:
    # Skip the untouched source files kept under "original_format" directories.
    if "original" in data_file:
        continue
    print(data_file)

    # Row label: the path without its extension, relative to the
    # external_datasets root when the file lives there. Slicing by a fixed
    # prefix and suffix length is only valid for external_datasets/*.jsonl
    # files and yields truncated, possibly colliding keys for the other
    # locations.
    dataset_name = os.path.splitext(data_file)[0]
    if dataset_name.startswith(external_prefix):
        dataset_name = dataset_name[len(external_prefix):]

    all_stats[dataset_name] = compute_stats(load_data(data_file))

# The header mirrors what is printed per row: the dataset label followed by
# the five values compute_stats returns (words-per-answer is not among them).
print("Dataset,Queries,A per Q,WORDS in Q,SENTENCES in A,UNANSWERABLE")
for stat in all_stats:
    s=[str(i) for i in all_stats[stat]]
    stats_as_string = ','.join(s)
    print(f"{stat},{stats_as_string}")

/dccstor/srosent2/generative/external_datasets/wikihowQA/test.jsonl


  8%|▊         | 16067/211254 [02:26<29:37, 109.81it/s]


KeyboardInterrupt: 