In [31]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
from transformers import AutoTokenizer
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [32]:
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [33]:
pretrained_dir = "pretrained/google/electra-small-discriminator"
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [34]:
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [35]:
gt_map: Dict[str, List[str]] = {}
dl_map: Dict[str, List[str]] = {}
for t in tqdm(train.itertuples()):
    pid = getattr(t, "Id")
    cl = getattr(t, "cleaned_label")
    dl = getattr(t, "dataset_label")
    if pid not in gt_map:
        gt_map[pid] = list()
    if pid not in dl_map:
        dl_map[pid] = list()
    gt_map[pid].append(cl)
    dl_map[pid].append(dl)

19661it [00:00, 330459.60it/s]


In [36]:
rows = []
sep = "|"
m = 650
negative_examples = 3
max_length = 500
for rid, gts in tqdm(gt_map.items()):
    is_multi = 0
    if len(gts) > 1:
        is_multi = 1
    ground_truth = sep.join(sorted(gts))
    dataset_labels = sep.join(sorted(dl_map[rid]))
    with open(f"input/train/{rid}.json") as in_file:
        sections = json.load(in_file)
    tmp = []
    for section in sections:
        tmp.append(section["text"])
    text = " ".join(tmp).strip()
    text = snlp.to_ascii_str(text)
    spans = []
    for dl in dl_map[rid]:
        i = 0
        j = len(text)
        while i < j - len(dl) + 1:
            a = text.find(dl, i, j)
            if a == -1:
                break
            b = a + len(dl)
            spans.append((a, b))
            i = b
            # Right context: answer is on the left edge of window
            b = 2 * m + a
            _len = len(tokenizer(text[a:b])["input_ids"])
            while _len > max_length and a < b:
                b -= 100
                _len = len(tokenizer(text[a:b])["input_ids"])
            row = {}
            row["Id"] = rid
            row["is_multi"] = is_multi
            row["ground_truth"] = ground_truth
            row["dataset_labels"] = dataset_labels
            row["is_impossible"] = 0
            row["answer_start"] = 0
            row["answer_end"] = len(dl)
            row["context"] = text[a:b]
            row["context_token_length"] = _len
            rows.append(row)

  2%|▏         | 317/14316 [00:00<00:38, 360.72it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 14316/14316 [00:51<00:00, 279.08it/s]


In [37]:
df = pd.DataFrame.from_records(rows)
#df = pd.DataFrame(tmp, columns=["Id", "is_multi", "ground_truth", "dataset_labels"])
cols = ["is_multi", "is_impossible"]
df[cols] = df[cols].astype(np.int8)
cols = ["answer_start", "answer_end", "context_token_length"]
df[cols] = df[cols].astype(np.int8) 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61002 entries, 0 to 61001
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    61002 non-null  object
 1   is_multi              61002 non-null  int8  
 2   ground_truth          61002 non-null  object
 3   dataset_labels        61002 non-null  object
 4   is_impossible         61002 non-null  int8  
 5   answer_start          61002 non-null  int8  
 6   answer_end            61002 non-null  int8  
 7   context               61002 non-null  object
 8   context_token_length  61002 non-null  int8  
dtypes: int8(5), object(4)
memory usage: 2.2+ MB


In [38]:
df.sample(20).head(20)

Unnamed: 0,Id,is_multi,ground_truth,dataset_labels,is_impossible,answer_start,answer_end,context,context_token_length
27897,50360796-e02b-49e1-8de8-4f9c01f715f2,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI subjects (mean ages 80 years 11 months vs. 75 years 5 months). Fourth, the ADNI MCI sample was restricted to the AD prodrome, a-MCI, and the AD sample was restricted to probable AD. The Wechsler logical memory test was used to distinguish a-MCI subjects from other MCI subjects during initial testing: Subjects were required to have scores in the 2-8 range on the delayed part of this test (the maximum score is 25) to be included in the MCI sample. As a result, although ADAMS MCIs were a mixture of a-MCI and n-MCI, ADNI MCIs were exclusively a-MCI. This is important when it comes to testing theoretical hypotheses about the retrieval processes that differentiate HC, MCI, and AD groups and that predict future HC MCI and MCI AD transitions. Those processes might be different when the MCI group is restricted to subjects with significant memory impairment and when the AD group is restricted to probable AD.\nAs mentioned, ADNI subjects were rediagnosed at regular intervals over the next 2 years. Nearly half the a-MCI subjects converted to AD, and the rest did not (designated as the a-MCI C and a-MCI NC subgroups below). A small proportion (8%) of the HC subjects converted to a-MCI, and the rest did not (designated as the HC C and HC NC subgroups below). Before detailed results are",55
39706,7bba3169-50ae-4ac7-9c5a-b8dba5918149,0,adni,ADNI,0,0,4,"ADNI study to ensure compatibility across scanners [28] . Full brain and skull coverage was required and detailed quality control was carried out on all MR images according to the AddNeuroMed quality control procedure [23, 29] . We applied the Freesurfer pipeline (version 4.5.0) to the MRI images to produce regional cortical thickness and subcortical volumetric measures. Cortical reconstruction and subcortical volumetric segmentation includes removal of non-brain tissue using a hybrid watershed/surface deformation procedure [30] , automated Talairach transformation, segmentation of the subcortical white matter and deep grey matter volumetric structures (including hippocampus, amygdala, caudate, putamen, ventricles) [30] [31] [32] , intensity normalization [33] , tessellation of the grey matter white matter boundary, automated topology correction [34, 35] , and surface deformation following intensity gradients to optimally place the grey/white and grey/cerebrospinal fluid borders at the location where the greatest shift in intensity defines the transition to the other tissue class [36] [37] [38] . Once the cortical models are complete, registration to a spherical atlas takes place which utilizes individual cortical folding patterns to match cortical geometry across subjects [39] .",28
11450,3f3033e9-4f55-4fd0-ac47-330be5a51791,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI, and also one of the most widely used test of dementia severity in NACC [30] .\nDepression scores (Geriatric Depression Scale, GDS): These are scores from a screening test designed to identify depression symptoms in the elderly. A 15-item GDS bigger or equal to 6 was an exclusion criterion in ADNI. Individuals with history of major depression or bipolar disorder within the past year were also excluded from the study. In NACC, GDS is not recorded for individuals with severe dementia, but it is not used as an exclusion criterion for study entry. It should be noted, however, that GDS is not a required form in NACC, so it may not be recorded for any participant at any visit. Due to the inclusion/exclusion criteria and the reduced number of assessments of the GDS score in the datasets considered, the results and conclusions about the potential effect of depression should be interpreted with extra caution. Individuals were classified in three clinical states: CN, MCI and D. We focused on MCICN, DMCI and DCN as the possible back-transitions during disease development. In ADNI, there were no direct transitions from D back to CN. In NACC, 43 such transitions were observed. The possible transitions between the different states are as shown in the schematic diagram of Fig. 1 .\nFor the a",25
33971,07fde929-348c-4408-bdd7-15627aba6ae7,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI.\nLessons from ADNI relevant to the PD initiative include the need to define the key scientific questions to be answered, standardize data collection protocols, build tools that are sophisticated and matched to the data collected, maintain open communication with the research community about their needs to predict future needs; and anticipate what informatics tools/resources will be needed to address new research questions. Several platforms and networks recently established or in development are positioned to address some of these issues:\n The MRC Dementias Platform UK (DPUK) plans to bring together 22 cohorts from across the UK, including the UK Biobank, integrated into a single informatics platform. DPUK also plans to develop a readiness cohort with baseline imaging data as well as amyloid, genetics discovery, and -omics discovery cohorts. (EMIF) is a five-year IMI with 56 partners in 14 European countries, established to make datasets visible to researchers, to integrate research cohorts for combined analysis, and to enable re-use of medical and other data for research. IMI-EMIF is establishing three broad approaches to data reutilizing and sharing. Firstly, to make data visible and potentially utilizable by researchers, it has established a browser for meta-data or descr",12
56198,165eaf2d-2a65-4b56-974b-6f2a9f42a390,0,early childhood longitudinal study,Early Childhood Longitudinal Study,0,0,34,"Early Childhood Longitudinal Study in the U.S. found that children who were overweight when they entered kindergarten were 4 times more likely to be obese at age 14 than children who entered kindergarten at a healthy weight. 16 Once present, obesity is difficult to treat due to metabolic changes that resist weight loss. 17 Effective primary prevention efforts are needed early in life before physiologic barriers to weight loss take hold and less healthful weight-related behaviours become entrenched. The current study examines the efficacy of a primary obesity prevention intervention that embeds weight-related messages within a parenting program that has been shown to improve parenting behaviors among racial/ethnic minority families. 18 A formative pre-post uncontrolled trial with 16 families showed that we could feasibly implement the Parents and Tots Together intervention within a community setting and the intervention was acceptable to families. 6 The purpose of the current study was to assess the extent to which the Parents and Tots Together intervention, compared with controls who received weekly mailings, resulted in a smaller increase in BMI (primary outcome) and improvements in children's weight-related behaviors and parental feeding and general parenting behaviors among ra",-14
38055,3ba42bd7-4106-4470-b606-385f12b7566d,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI data set to support currently hypothetical models, but further to highlight uncertainty in those orderings and variation among different subgroups. We also demonstrate that such a model can provide a practical and effective staging system for patient prognosis.",49
27323,50b0c82c-2a54-40be-990c-2a6cb033ba6c,0,adni,ADNI,0,0,4,"ADNI was not designed to investigate the relationship between vascular disease and risk factors and cognitive decline, the variables making up the Vascular Index score were recorded as dichotomous values (present/ absent) and severity, duration, and timing of vascular risk factors were not accounted for. Therefore, there may have been underreporting or nonspecific reporting of vascular risk factors, which may have led to an underestimation of the effects of vascular risk factors on cognitive decline. The sample predominantly consisted of subjects with MCI, and therefore it is possible that this diagnostic group drove the results. However, the various models included interaction terms of vascular disease and risk factors with diagnostic group and those were not significant suggesting that diagnostic group did not modify the relationship between vascular disease and risk factors and cognitive impairment. Finally, due to our large sample, certain statistically significant results were not necessarily clinically relevant. Therefore, various designations of effect size were provided to better interpret the results.\nIn conclusion, these results suggest that there is a significant association between increased vascular disease and risk factors and cognitive impairment in the AD spectrum",-31
38467,b2769b41-6264-4457-acde-90aa25fea067,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI2/GO dataset (using T1w and FLAIR scans). Not using the T2w/PD information did not change the performance of the Random Forest classifier (SI=0.66 0.17, ICC=0.99). However, not using FLAIR information in the ADC dataset significantly decreased the Dice Kappa, but the volumetric correlation did not drastically change (SI=0.47 0.21, ICC=0.95). White matter hyperintensities (WMHs), commonly identified as areas of increased signal in relation with the surrounding white matter regions on T2w, PD and FLAIR MRIs, are one of the non-specific yet typical and constant MRI expressions of cerebral small vessel disease (CSVD), along with lacunar infarcts and microhemorrhages (Conklin et al., 2014; Gouw et al., 2010) . They have been shown to be more extensive in patients with Alzheimer's disease compared to agematched healthy normal populations (Yoshita et al., 2005) . WMHs reflect ischemic injury in the elderly and AD populations and the existence and severity of WMHs can lead to or accelerate decline in cognitive as well as executive functions (Dubois et al., 2014) . As a result, the location and load of WMHs are important clinical measures, raising substantial need for their accurate quantifications. WMHs are generally detected using fluid attenuated inversion recovery (FLAIR) or T2w",65
31267,7e875ded-00ec-4570-8c5d-c39478bdf85d,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI1, ADNIGO, ADNI2, and AIBL datasets were downloaded from Alzheimer's disease Neuroimaging Initiative (ADNI) database (http://adni.loni.usc.edu/). The Australian Imaging, Biomarker & Lifestyle (AIBL) Flagship Study of Ageing is a collaborative study that shares many common goals with ADNI (http://adni.loni.usc.edu/study-design/collaborative-studies/ aibl/). Only ADNI-compliant subjects with at least three timepoints from AIBL were included in this study. For all ADNI cohorts, subjects with clinical assessments from at least three visits, in timespan longer than one year, were included in the analysis. Subjects were further excluded based on manual quality checks after MR preprocessing pipelines (described below). Age, Apolipoprotein E4 (APOE4) status, clinical scores from mini-mental state exam (MMSE) and Alzheimer's Disease Assessment Scale (ADAS-13), and T1-weighted MR images were used in the analysis. Subject demographics are shown in Table 1 , and the complete list of included subjects is provided in S1 File.\nThe ADNI sample comprising pooled ADNI1, ADNIGO, and ADNI2 subjects was used to perform primary analysis comprising trajectory modeling and prediction tasks, whereas AIBL subjects were used as the independent replication cohort for the prediction task. There are a few",64
32199,54a8145c-6503-4fa7-9e77-ead6321bdf7d,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI were found to meet the above criteria and are thus included in the P-MCI group. For the other three clinical groups, subjects are selected to match the demographic information in the P-MCI group and to have the same gender and age. Specifically, for each subject in P-MCI group, a subject with the same gender and age is sought in the other group. If this is not possible, the gender constraint is relaxed first, and then the age constraint. However, under no circumstances are subjects with an age difference greater than 3 years included. Due to the similar reason for the calculation of dynamic features, the subjects in the other clinical groups are required to have at least 4 time-points. The demographic and clinical information of all the selected subjects in the four groups are summarized in Table 1 . The preprocessing step performed in our method is a standard procedure as used in general brain image analysis, aiming to reduce image noise and remove non-brain tissues. For each image, intensity inhomogeneity is firstly corrected using the N3 algorithm (Sled et al., 1998) . Many algorithms have been developed to remove non-brain tissues, such as the skull and the extra-cranial tissues, including Brain Surface Extractor (BSE) (Shattuck and Leahy, 2001 ) and Brain Extraction Too",20


In [41]:
df["is_multi"].value_counts(normalize=True)

1    0.663519
0    0.336481
Name: is_multi, dtype: float64

In [40]:
df.to_parquet("output/train.parquet", index=False)