In [1]:
import torch
import datasets
import json
from pathlib import Path


In [12]:
# Directory holding the pre-split WikiSection JSON files.
data_dir = Path('/workspace/data/language_domain_verbosity')

# NOTE: this rebinds the name `datasets`, hiding the `datasets` module imported in
# the first cell. Later cells index this dict by split name, so the name is kept.
datasets = {}

# split name -> file name inside `data_dir`
files_to_load = {
    'en_disease_train': 'wikisection_en_disease_train.json',
    'en_disease_test': 'wikisection_en_disease_test.json',
    'en_city_test': 'wikisection_en_city_test.json',
    'de_city_train': 'wikisection_de_city_train.json',
    'de_city_test': 'wikisection_de_city_test.json',
    'de_disease_test': 'wikisection_de_disease_test.json',
}

for name, filename in files_to_load.items():
    # Decode explicitly as UTF-8: without `encoding=`, open() falls back to the
    # platform locale, which can corrupt or reject non-ASCII characters.
    with open(data_dir / filename, 'r', encoding='utf-8') as f:
        datasets[name] = json.load(f)
    print(f"Loaded {name}: {len(datasets[name])} samples")


Loaded en_disease_train: 2513 samples
Loaded en_disease_test: 718 samples
Loaded en_city_test: 3907 samples
Loaded de_city_train: 8777 samples
Loaded de_city_test: 2507 samples
Loaded de_disease_test: 464 samples


In [8]:
# Peek at one record: first its field names, then the record itself.
first_record = datasets["en_disease_train"][0]
print(first_record.keys())
print(first_record)

dict_keys(['id', 'type', 'title', 'abstract', 'text', 'annotations'])
{'id': 'https://en.wikipedia.org/wiki/Pneumonic_plague', 'type': 'disease', 'title': 'Pneumonic plague', 'abstract': 'Pneumonic plague is a severe lung infection caused by the bacterium Yersinia pestis. Symptoms include fever, headache, shortness of breath, chest pain, and cough. They typically start about three to seven days after exposure. It is one of three forms of plague, the other two being septicemic plague and bubonic plague.\nThe pneumonic form may occur following an initial bubonic or septicemic plague infection. It may also result from breathing in airborne droplets from another person or cat infected with pneumonic plague. The difference between the forms of plague is the location of infection; in pneumonic plague the infection is in the lungs, in bubonic plague the lymph nodes, and in septicemic plague within the blood. Diagnosis is by testing the blood, sputum, or fluid from a lymph node.\nWhile vaccine

In [10]:
# Directory the generated .jsonl splits are written to (relative to the notebook's cwd).
data_out = Path("../data/language_domain_verbosity").expanduser()

# Character budgets used when truncating each sample's text.
# `context_length` is the base budget; the training loops below add a per-language
# extra amount on top of it (`en_length` for English, `de_length` for German).
context_length = 100
en_length, de_length = 100, 20

# The training file is a .jsonl with up to 2.5k samples from each training dataset.
# Each line carries the keys produced by format_sample: input, language, domain, full_context.
# With the values above, en_disease_train inputs are cut to context_length + en_length = 200
# characters and de_city_train inputs to context_length + de_length = 120 characters.
# NOTE(review): an earlier version of this comment gave 20 characters for en_disease_train
# and 100 for de_city_train, which does not match the code -- confirm which is intended.

def format_sample(sample: dict, tokens: int, language: str, domain: str,
                  full_context_chars: int = 300) -> dict:
    """Build one output record from a raw sample.

    Args:
        sample: Mapping with a "text" entry holding the document text.
        tokens: Number of leading *characters* of the text to keep as the
            model input. Despite the name, this is a plain string slice,
            not a tokenizer-based count.
        language: Language label stored verbatim in the record.
        domain: Domain label stored verbatim in the record.
        full_context_chars: Number of leading characters kept under
            "full_context". Defaults to 300, the previously hard-coded value.

    Returns:
        A dict with the keys "input", "language", "domain" and "full_context".

    Raises:
        KeyError: If `sample` has no "text" entry.
    """
    text = sample["text"]
    return {
        # Slicing past the end of a string is safe: short texts are kept whole.
        "input": text[:tokens],
        "language": language,
        "domain": domain,
        "full_context": text[:full_context_chars],
    }


# Build the training split: the first 2500 samples of each training dataset,
# truncated to context_length plus a per-language extra character budget.
train_data = []

# (source split, language label, domain label, extra characters beyond context_length)
train_sources = (
    ("en_disease_train", "en", "disease", en_length),
    ("de_city_train", "de", "city", de_length),
)

for split_name, language, domain, extra_chars in train_sources:
    for sample in datasets[split_name][:2500]:
        train_data.append(
            format_sample(sample, context_length + extra_chars, language, domain)
        )

print(data_out)

# `data_out` is a relative path; create it up front so the write below does not
# fail with FileNotFoundError when the directory has not been created yet.
data_out.mkdir(parents=True, exist_ok=True)

# One JSON object per line (.jsonl).
with open(data_out / "data-train.jsonl", "w", encoding="utf-8") as f:
    for sample in train_data:
        f.write(json.dumps(sample) + "\n")
        



../data/language_domain_verbosity


In [13]:
# Build the evaluation split: the first 464 samples from each of the four
# language/domain test sets, all truncated to `context_length` characters.
test_data = []

# (source split, language label, domain label) -- order determines file order.
test_sources = (
    ("en_disease_test", "en", "disease"),
    ("de_city_test", "de", "city"),
    ("en_city_test", "en", "city"),
    ("de_disease_test", "de", "disease"),
)

for split_name, language, domain in test_sources:
    for sample in datasets[split_name][:464]:
        test_data.append(format_sample(sample, context_length, language, domain))

# One JSON object per line (.jsonl).
with open(data_out / "data-test.jsonl", "w") as f:
    for sample in test_data:
        line = json.dumps(sample) + "\n"
        f.write(line)
