In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
from transformers import AutoTokenizer
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [2]:
# Quantile cut points handed to describe() when summarising token lengths below.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas options: treat +/-inf as missing and lift the display limits so frames
# and long text columns are shown without truncation.
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(_option, _value)

# Enable tqdm's pandas integration.
tqdm.pandas()
# NOTE(review): presumably seeds the random number generators with scml's
# default seed -- confirm against the scml package.
scml.seed_everything()

In [3]:
# Tokenizer is loaded from a local checkpoint directory; it is used later only
# to measure how many tokens a candidate context window produces.
pretrained_dir = "pretrained/google/electra-small-discriminator"
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_dir,
    model_max_length=model_max_length,
)
# Show the tokenizer configuration, then the input names it emits.
print(repr(tokenizer), tokenizer.model_input_names, sep="\n")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [4]:
# Label table: one row per (publication Id, dataset label) pair.
train = pd.read_csv(
    filepath_or_buffer="input/train.csv",
    engine="c",
    low_memory=False,
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [5]:
# Group labels by publication Id, keeping the row order of `train`:
#   gt_map: Id -> cleaned labels
#   dl_map: Id -> raw dataset labels as they are written in the table
gt_map: Dict[str, List[str]] = dict()
dl_map: Dict[str, List[str]] = dict()
for record in tqdm(train.itertuples()):
    paper_id = record.Id
    gt_map.setdefault(paper_id, []).append(record.cleaned_label)
    dl_map.setdefault(paper_id, []).append(record.dataset_label)

19661it [00:00, 149757.67it/s]


In [6]:
# Build question-answering style training rows. For every occurrence of a
# dataset label in a publication's text, three character windows are emitted:
# answer at the left edge, answer at the right edge, and answer in the middle.
# Each window is trimmed until it tokenizes to at most `max_length` tokens.
rows = []
sep = "|"
m = 650  # characters of context kept on each side of a centred answer
negative_examples = 3  # not used in this cell; kept because it is a cell-level name
max_length = 500  # token budget for a single context window
step = 50  # characters trimmed per side on each shrink iteration


def _token_count(fragment: str) -> int:
    """Return how many input ids `tokenizer` produces for `fragment`."""
    return len(tokenizer(fragment)["input_ids"])


def _fit_window(text: str, a: int, b: int, p: int, q: int, trim_left: int, trim_right: int):
    """Shrink text[a:b] until it fits `max_length` tokens, never cutting text[p:q].

    Args:
        text: full document text.
        a, b: initial window bounds (character offsets, b may exceed len(text)).
        p, q: bounds of the answer span that must stay inside the window.
        trim_left, trim_right: characters removed from each side per iteration.

    Returns:
        (a, b, n_tokens) for the fitted window, or None when the window cannot
        be trimmed any further and still exceeds `max_length` tokens.
    """
    n_tokens = _token_count(text[a:b])
    while n_tokens > max_length:
        # Clamp at the answer bounds so the answer offsets stay valid.
        new_a = min(a + trim_left, p)
        new_b = max(b - trim_right, q)
        if new_a == a and new_b == b:
            return None
        a, b = new_a, new_b
        n_tokens = _token_count(text[a:b])
    return a, b, n_tokens


def _make_row(rid, is_multi, ground_truth, dataset_labels, text, a, b, p, q, n_tokens) -> dict:
    """Assemble one output record for window text[a:b] containing answer text[p:q]."""
    return {
        "Id": rid,
        "is_multi": is_multi,
        "ground_truth": ground_truth,
        "dataset_labels": dataset_labels,
        "is_impossible": 0,
        # Offsets are relative to the window and always refer to the occurrence
        # at p, even when the same label also appears earlier in the window.
        "answer_start": p - a,
        "answer_end": q - a,
        "context": text[a:b],
        "context_token_length": n_tokens,
    }


for rid, gts in tqdm(gt_map.items()):
    is_multi = int(len(gts) > 1)
    ground_truth = sep.join(sorted(gts))
    dataset_labels = sep.join(sorted(dl_map[rid]))
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(f"input/train/{rid}.json", encoding="utf-8") as in_file:
        sections = json.load(in_file)
    text = " ".join(section["text"] for section in sections).strip()
    text = snlp.to_ascii_str(text)
    for dl in dl_map[rid]:
        if not dl:
            # An empty label matches at every position and would never advance.
            continue
        i = 0
        while True:
            p = text.find(dl, i)
            if p == -1:
                break
            q = p + len(dl)
            i = q  # resume the search after this occurrence
            windows = (
                # Right context: answer is on the left edge of window
                (p, p + 2 * m, 0, 2 * step),
                # Left context: answer is on the right edge of window
                (max(0, q - 2 * m), q, 2 * step, 0),
                # Left and right context: answer is in the middle of window
                (max(0, p - m), q + m, step, step),
            )
            for a, b, trim_left, trim_right in windows:
                fitted = _fit_window(text, a, b, p, q, trim_left, trim_right)
                if fitted is None:
                    # The answer alone exceeds the token budget; no valid window.
                    continue
                a, b, n_tokens = fitted
                rows.append(
                    _make_row(rid, is_multi, ground_truth, dataset_labels, text, a, b, p, q, n_tokens)
                )

  2%|▏         | 347/14316 [00:02<01:42, 136.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 14316/14316 [02:18<00:00, 103.69it/s]


In [7]:
# Materialise the collected rows and downcast the integer columns:
# flags fit in int8, character offsets and token counts in int16.
compact_dtypes = {
    "is_multi": np.int8,
    "is_impossible": np.int8,
    "answer_start": np.int16,
    "answer_end": np.int16,
    "context_token_length": np.int16,
}
df = pd.DataFrame.from_records(rows)
df = df.astype(compact_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183006 entries, 0 to 183005
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Id                    183006 non-null  object
 1   is_multi              183006 non-null  int8  
 2   ground_truth          183006 non-null  object
 3   dataset_labels        183006 non-null  object
 4   is_impossible         183006 non-null  int8  
 5   answer_start          183006 non-null  int16 
 6   answer_end            183006 non-null  int16 
 7   context               183006 non-null  object
 8   context_token_length  183006 non-null  int16 
dtypes: int16(3), int8(2), object(4)
memory usage: 7.0+ MB


In [8]:
# Share of rows whose publication has more than one label (is_multi == 1).
df["is_multi"].value_counts(normalize=True)

1    0.663519
0    0.336481
Name: is_multi, dtype: float64

In [9]:
# Every context must fit the token budget set when the windows were built;
# then summarise the distribution of token lengths.
token_lengths = df["context_token_length"]
assert token_lengths.le(max_length).all()
token_lengths.describe(percentiles=percentiles)

count    183006.000000
mean        281.328645
std          47.780126
min           4.000000
1%           73.000000
5%          227.000000
10%         244.000000
20%         258.000000
30%         267.000000
40%         274.000000
50%         282.000000
60%         290.000000
70%         299.000000
80%         311.000000
90%         328.000000
95%         345.000000
99%         393.000000
max         500.000000
Name: context_token_length, dtype: float64

In [10]:
# Eyeball 20 randomly drawn rows.
df.sample(n=20)

Unnamed: 0,Id,is_multi,ground_truth,dataset_labels,is_impossible,answer_start,answer_end,context,context_token_length
50237,dce3207b-8aac-4b80-8f9b-2657feb1a810,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,650,654,"her the height scores calculated using the 2910 SNPs were correlated with the self-reported heights (at age 18) for the 407 individuals for which we have both height and genetic data. We failed to detect significant correlation between the two (correlation coefficient = 0.06, p = 0 25; Figure 3 ). This is consistent with the findings of the GIANT consortium. With a population of 1914 individuals, Wood et al. found a predictive r 2 = 0 14 for 697 SNPs (20% variation explained) [10] . It is expected that this r 2 should be stronger than the correlation coefficient in our findings because of our smaller population size of 407 individuals of the ADNI and Cache County individuals as well as the fact that the GIANT consortium identified the 697 SNPs used for prediction directly from their population of 1914 individuals. While research has shown that height is a polygenic trait heavily influenced by common SNPs [7] [8] [9] [10] [11] [12] , a polygenic score that quantifies common SNP effect is generally insufficient for successful individual phenotype prediction. We demonstrate that in the case of Mr. Bradley, a rare combination of common SNPs corresponds to an extremely high polygenic score that predicts an extreme phenotype. Because Mr. Bradley is an outlier, studying his genetic makeup p",285
9838,a5d51418-49db-49b5-ba2d-60b1b73419d9,1,baltimore longitudinal study of aging|baltimore longitudinal study of aging blsa,Baltimore Longitudinal Study of Aging|Baltimore Longitudinal Study of Aging (BLSA),0,1263,1300,"rt of this prototyping paradigm is the ""instrumentation"" (4.) window. When we consider a prototype from the engineering perspective, it seems natural that the capability exists to extract performance information from the executing program, whether it is considered an experiment, as in [I] and [9], or a model. Observing the output at this window, the designer is able to observe and analyze data extracted from the executing prototype, including operational parameters, performance indicators, and whatever other instrumentation is incorporated into the prototype, and track the progress of the prototype. The second part of our hypothesis concerns the flexibility of the model and the ways in which RBP can benefit the computer software development process. Consideration of this issue is the area in which our work has the potential to make a contribution to the body of knowledge in computer science. Our goal in this part of our project is to provide new insights into prototyping and reconfiguration that can be used as reference material for further experimentation with the RBP approach.\nCentral to our contention that prototyping can be more effectively undertaken using our model is the presentation of a case-in-point. We will draw an example from the Baltimore Longitudinal Study of Aging",246
770,cd1523cc-178d-4498-a35b-bbf53fa4fe6e,1,education longitudinal study|national education longitudinal study,Education Longitudinal Study|National Education Longitudinal Study,0,650,678,"gration to community colleges, with Latinos attending because of the presence of their friends and family (Perez & McDonough, 2008; Person & Rosenbaum, 2006) .\nAnother theme in the literature is that Latinos go to community colleges because they lack academic preparation. Although school-age Latinos have had lower reading and math scores compared with other students, they have been making steady gains in terms of grade point averages, standardized test scores, and high school credits earned (Llagas & Snyder, 2003) . However, academic credentials that make one eligible for a 4-year college do not automatically lead to enrollment. The National Education Longitudinal Study (NELS:88; National Center for Education Statistics [NCES], n.d.-b) data showed that ""almost 40 percent of Latino students considered worthy academically for four-year studies failed to enroll at a four-year institution"" (Swail et al., 2004, p. 18 ). Kurlaender (2006) was able to demonstrate that Latinos scoring close to 100% on the NELS:88 math achievement test still had a 40% to 50% probability of choosing a 2-year over a 4-year college, compared with less than a 10% rate for African Americans and European Americans. Furthermore, other researchers have found that high-achieving Latinos in high school are the least likely of all racial/ethni",290
8652,01320f4d-0ee7-4baa-8476-167a80a25b69,1,baltimore longitudinal study of aging|baltimore longitudinal study of aging blsa,Baltimore Longitudinal Study of Aging|Baltimore Longitudinal Study of Aging (BLSA),0,0,44,"Baltimore Longitudinal Study of Aging (BLSA) [16] found that episodic memory declined at similar rates, with change points 8 years and up to 15 years prior to diagnosis, respectively. However, Riley and colleagues [20] found faster rates of cognitive decline in semantic memory and visuospatial construction in future AD patients' compared to individuals who remained cognitively healthy. Other studies have likewise reported change points in visuospatial functioning [21, 22] and semantic memory functioning [22] . Taken together, longitudinal studies of cognitive functioning in future AD patients suggest that bilateral medial and anterior temporal lobe as well as frontal lobe dysfunction underpin the earliest cognitive impairments in AD, consistent with the sites of beginning neurofibrillary pathology [23] , and that these changes appear approximately seven years prior to diagnosis.\nEarly pathological changes in AD may be manifested in subtle, qualitative neuropsychological dysfunction, before quantitative impairments emerge [2] . While most longitudinal studies have not addressed this hypothesis, the neuropsychological literature has documented qualitative abnormalities in AD patients' neuropsychological performance that represent potential candidates for qualitative preclinical mar",248
74209,92daff30-8ebe-41de-8806-6d811108dcba,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,1296,1300,"misclassified 24% of a sample when compared with comprehensive criteria that required 2 scores 1SD below the mean within a cognitive domain illustrates. 16 Although requiring 2 low scores can help overcome the effects of normal cognitive variability, the optimal number of tests to use depends upon the number and type of measures in the test battery.\nIndividuals with more low scores than expected for the BRLS would be more likely to have true impairment rather than simply exhibiting normal variability in cognitive performance. Making allowances for low scores in line with the BRLS may help to minimize false positives when diagnosing MCI. The aim of this study was to test this idea by comparing the risk-AD of participants classified with MCI using standard approaches and when the BRLS is considered. In using the BRLS for diagnosing MCI, we expected to find individuals with MCI to have a higher risk-AD than normal controls and better prediction of progression to AD than with standard criteria. Data were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database (adni.loni.usc.edu), launched in 2003 as a public-private partnership and led by Principal Investigator Michael W. Weiner, MD. The first ADNI period (ADNI1) was updated in two subsequent grant periods (ADNI",270
131766,90d61671-c4cb-4e2e-83d1-ab20dcc336ea,0,trends in international mathematics and science study,Trends in International Mathematics and Science Study,0,0,53,Trends in International Mathematics and Science Study,9
72808,fded0d8b-b940-4d05-a9b3-b95835330843,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,1296,1300,"sions of total cohort tends to the results arisen from MCI, which has the most case number. This might lead to untrue finding for the entire cohort. Hence, more studies are needed to perform to fill the gaps.\nIn general, it is possible that HMGCR (rs3846662) could be involved in the structural and functional modification of right entorhinal and left hippocampus throughout the AD physiopathological process. What's more, it influedced glucose metabolism of right temporal. HMGCR (rs3846662) plays an important role in AD-related neurodegenerative processes. Therefore, identification of specific factors that regulate HMGCR alternative splicing and elucidating the underlying mechanism may lead to a better understanding of its impact on regulating cellular cholesterol homeostasis and neurodegenerative processes. It is necessary to explore more clear understanding on the mechanisms in a larger sample, with longer follow-up and a wider range of people. Data used in this article were got from the ADNI database (www.loni.ucla.edu\ADNI). The ADNI is an ongoing, longitudinal, multicenter study aimed at developing clinical, genetic, serial magnetic resonance imaging (MRI), positron emission tomography (PET), and biochemical biomarkers to measure the progression of MCI and early AD [19] . ADNI",306
141091,be980a33-c9f3-469f-b39d-803c07961821,0,survey of earned doctorates,Survey of Earned Doctorates,0,1273,1300,"nted the removal of mandatory retirement in different years, many prior to becoming a federal law. Comparing and contrasting hiring and average age in these institutions in different time periods provide insights into the effects of late retirement on US faculty workforce. The second study confirms conclusions of the simulation model. Overall, this study offers a different perspective to explain a source of problems of early career scientists, and the interconnections in the science workforce, stressing the inadequacy of looking at the young faculty population in isolation. The study shows the magnified effects of late retirement on the growth in average academic age through two mechanisms-a longer stay of established professors and a decline in the hiring rate of newly minted PhDs. We discuss our findings and offer several policy implications to help early career scientists in the U.S. Data. We use the Survey of Doctorate Recipients (SDR), a longitudinal survey of the science workforce population with doctoral degrees in science, engineering, and health earned in the U.S (http://www.nsf.gov/statistics/srvydoctorates/). The SDR is sponsored by the National Science Foundation (NSF), and is usually administered every two years. The sampling frame is the Survey of Earned Doctorates",259
60,c754dec7-c5a3-4337-9892-c02158475064,1,education longitudinal study|national education longitudinal study,Education Longitudinal Study|National Education Longitudinal Study,0,0,37,"National Education Longitudinal Study (NELS), we estimate a value-added education production function that includes parental effort as an input. Parental effort equations are also estimated as a function of child, parent, household, and school characteristics. Our results suggest that parental effort has a strong positive effect on achievement that is large relative to the effect of school resources and is not captured by family background variables. Parents appear to reduce their effort in response to increased school resources, suggesting potential ''crowding out'' of school resources. There is a long-standing debate whether improving school financial resources will improve student achievement. Some have found positive effects (Hedges and Greenwald 1996; Krueger 1999) while others have found negligible or even negative effects (see Hanushek 1996) . Researchers have focused on specific factors such as teacher characteristics (for example, Rivkin, Hanushek, and Kain 2005) , peer effects (Hanushek et al. 2003) , class size (Angrist and Lavy 1999; Hoxby 2000) , or birth order and family size (Hanushek 1992) .\nIn this paper, we investigate another important factor in student achievementparental involvement-and the role it plays in student achievement. We also examine the factors ass",259
92505,35e7c21b-a4aa-43fb-89ee-551aacd078c8,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,ADNI investigators is available at www.loni.ucla.edu/ADNI/Collaboration/ADNI_Citatation.shtml.,34


In [11]:
# Persist the generated rows as parquet, without writing the index.
df.to_parquet("output/train.parquet", index=False)