In [1]:
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
from transformers import AutoTokenizer
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [2]:
# Percentile grid reused by later `describe()` calls.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Option keys are spelled out in full. pandas resolves abbreviated keys by
# pattern matching, which can start failing (or hit a different option) once a
# future release adds a similarly named option.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases;
# revisit this line when upgrading pandas.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Register tqdm's pandas integration.
tqdm.pandas()
# presumably seeds the RNGs used below (e.g. `random`) - confirm against scml
scml.seed_everything()

In [3]:
# Tokenizer is loaded from a local checkpoint directory; the maximum model
# length is passed explicitly instead of relying on the checkpoint's config.
model_max_length = 512
pretrained_dir = "pretrained/google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_dir,
    model_max_length=model_max_length,
)
# Show the tokenizer configuration and the input names it produces.
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [4]:
# Load the label table and print its schema / null counts.
train = pd.read_csv(
    "input/train.csv",
    engine="c",
    low_memory=False,
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [5]:
# Group labels by publication Id, preserving the row order of `train`:
#   gt_map: Id -> list of `cleaned_label` values
#   dl_map: Id -> list of `dataset_label` values
gt_map: Dict[str, List[str]] = {}
dl_map: Dict[str, List[str]] = {}
for record in tqdm(train.itertuples()):
    pub_id = record.Id
    gt_map.setdefault(pub_id, []).append(record.cleaned_label)
    dl_map.setdefault(pub_id, []).append(record.dataset_label)

19661it [00:00, 154887.09it/s]


In [6]:
# Build question-answering style training rows from the publication texts.
#
# For every occurrence of a dataset label in a publication, three context
# windows are emitted (answer at the left edge, at the right edge, and in the
# middle). In addition, up to `max_negative_examples` windows that contain no
# label occurrence are sampled per publication and marked `is_impossible`.
rows = []
sep = "|"                   # delimiter used when joining a publication's labels
m = 900                     # characters of context on each side of an answer
max_negative_examples = 10  # cap on answer-free windows per publication
max_length = 502            # maximum number of tokenizer input ids per context
step = 100                  # characters trimmed per side on each shrink pass
# Rejection sampling of negative windows is bounded so that a publication whose
# label occurrences leave no free window cannot stall the whole run.
max_sampling_attempts = 1000 * max_negative_examples


def _count_tokens(chunk):
    """Return the number of input ids the tokenizer produces for `chunk`."""
    return len(tokenizer(chunk)["input_ids"])


def _shrink_window(text, start, end, keep_start, keep_end, left_step, right_step):
    """Trim `text[start:end]` until it tokenizes to at most `max_length` ids.

    Each pass moves `start` right by `left_step` and `end` left by
    `right_step`, but never into the protected span `[keep_start, keep_end)`,
    so an answer located there always stays inside the window.

    Returns `(start, end, n_tokens)`. If the window cannot shrink any further,
    `n_tokens` may still exceed `max_length`.
    """
    n_tokens = _count_tokens(text[start:end])
    while n_tokens > max_length:
        new_start = min(start + left_step, keep_start)
        new_end = max(end - right_step, keep_end)
        if new_start == start and new_end == end:
            # Nothing left to trim without cutting into the protected span.
            break
        start, end = new_start, new_end
        n_tokens = _count_tokens(text[start:end])
    return start, end, n_tokens


def _make_row(meta, context, n_tokens, answer=None):
    """Assemble one output record with a fixed key order.

    `meta` is `(Id, is_multi, ground_truth, dataset_labels)`. `answer` is the
    `(start, end)` character span of the label inside `context`, or `None` for
    an answer-free window (stored as -1 / -1 with `is_impossible` = 1).
    """
    rid, is_multi, ground_truth, dataset_labels = meta
    answer_start, answer_end = (-1, -1) if answer is None else answer
    return {
        "Id": rid,
        "is_multi": is_multi,
        "ground_truth": ground_truth,
        "dataset_labels": dataset_labels,
        "is_impossible": 1 if answer is None else 0,
        "answer_start": answer_start,
        "answer_end": answer_end,
        "context": context,
        "context_token_length": n_tokens,
    }


for rid, gts in tqdm(gt_map.items()):
    is_multi = 1 if len(gts) > 1 else 0
    ground_truth = sep.join(sorted(gts))
    dataset_labels = sep.join(sorted(dl_map[rid]))
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(f"input/train/{rid}.json", encoding="utf-8") as in_file:
        sections = json.load(in_file)
    text = " ".join(section["text"] for section in sections).strip()
    # presumably converts the text to plain ASCII - confirm against scml.nlp.
    # All character offsets below refer to this converted text.
    text = snlp.to_ascii_str(text)
    meta = (rid, is_multi, ground_truth, dataset_labels)

    # (start, end) character spans of every label occurrence found in `text`.
    spans = []
    for dl in dl_map[rid]:
        if not dl:
            # An empty label matches at every position and would never advance.
            continue
        search_from = 0
        while True:
            p = text.find(dl, search_from)
            if p == -1:
                break
            q = p + len(dl)
            spans.append((p, q))
            search_from = q  # continue after this (non-overlapping) match

            # Right context: answer is on the left edge of the window.
            a, b, n_tokens = _shrink_window(text, p, p + 2 * m, p, q, 0, 2 * step)
            rows.append(_make_row(meta, text[a:b], n_tokens, (0, len(dl))))

            # Left context: answer is on the right edge of the window.
            a, b, n_tokens = _shrink_window(
                text, max(0, q - 2 * m), q, p, q, 2 * step, 0
            )
            rows.append(_make_row(meta, text[a:b], n_tokens, (p - a, q - a)))

            # Left and right context: answer is in the middle of the window.
            # The offset is derived from the match position itself, so it
            # points at this occurrence even when the label also appears
            # earlier inside the same window.
            a, b, n_tokens = _shrink_window(
                text, max(0, p - m), q + m, p, q, step, step
            )
            rows.append(_make_row(meta, text[a:b], n_tokens, (p - a, q - a)))

    # Negative examples: random 2*m-character windows in which no label
    # occurrence starts or ends.
    wanted = min(max_negative_examples, len(text) // (2 * m) - len(spans))
    last_start = len(text) - 1 - 2 * m  # largest admissible window start
    found = 0
    attempts = 0
    while found < wanted and last_start >= 0 and attempts < max_sampling_attempts:
        attempts += 1
        a = random.randint(0, last_start)
        b = a + 2 * m
        if any(a <= s <= b or a <= e <= b for s, e in spans):
            continue
        # Trim symmetrically towards the window centre if over the token limit.
        centre = a + m
        a, b, n_tokens = _shrink_window(text, a, b, centre, centre, step, step)
        rows.append(_make_row(meta, text[a:b], n_tokens))
        found += 1

  0%|          | 33/14316 [00:00<04:20, 54.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 14316/14316 [05:12<00:00, 45.83it/s]


In [7]:
# Materialise the collected rows, then downcast the flag columns to int8 and
# the offset / length columns to int16 to reduce memory usage.
df = pd.DataFrame.from_records(rows)
for dtype, cols in (
    (np.int8, ["is_multi", "is_impossible"]),
    (np.int16, ["answer_start", "answer_end", "context_token_length"]),
):
    df[cols] = df[cols].astype(dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302398 entries, 0 to 302397
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Id                    302398 non-null  object
 1   is_multi              302398 non-null  int8  
 2   ground_truth          302398 non-null  object
 3   dataset_labels        302398 non-null  object
 4   is_impossible         302398 non-null  int8  
 5   answer_start          302398 non-null  int16 
 6   answer_end            302398 non-null  int16 
 7   context               302398 non-null  object
 8   context_token_length  302398 non-null  int16 
dtypes: int16(3), int8(2), object(4)
memory usage: 11.5+ MB


In [8]:
# Share of rows whose publication has more than one label.
is_multi_share = df["is_multi"].value_counts(normalize=True)
is_multi_share

1    0.516892
0    0.483108
Name: is_multi, dtype: float64

In [9]:
# Share of answer-free (negative) rows versus rows that contain an answer.
is_impossible_share = df["is_impossible"].value_counts(normalize=True)
is_impossible_share

0    0.605183
1    0.394817
Name: is_impossible, dtype: float64

In [10]:
# Every context must fit within the token limit; then summarise the
# token-length distribution over the configured percentile grid.
token_lengths = df["context_token_length"]
assert token_lengths.le(max_length).all()
token_lengths.describe(percentiles=percentiles)

count    302398.000000
mean        381.133083
std          57.314879
min           4.000000
1%          114.000000
5%          317.000000
10%         334.000000
20%         350.000000
30%         362.000000
40%         373.000000
50%         383.000000
60%         394.000000
70%         406.000000
80%         421.000000
90%         443.000000
95%         461.000000
99%         488.000000
max         502.000000
Name: context_token_length, dtype: float64

In [11]:
# Eyeball 20 random rows (sampling already returns exactly 20 rows).
df.sample(n=20)

Unnamed: 0,Id,is_multi,ground_truth,dataset_labels,is_impossible,answer_start,answer_end,context,context_token_length
22944,7d060b54-4ccb-4a03-8b9e-6c76a78baac1,0,agricultural resource management survey,Agricultural Resource Management Survey,0,900,939,"d more than 70 percent of production on feeder-to-finish farms were under production contracts in 2004. Likewise, 67 percent of specialized farrowing operations and more than 90 percent of specialized weanling operations used contractual arrangements. In contrast, contract production was virtually nonexistent on farrow-to-finish farms.\nThe average size of hog operations increased from 1992 to 2004 but grew the fastest for operations producing under contract ( fig. 6 ). Contract feeder pig-to-finish operations averaged about 1,000 more head produced in 1992 than did other operations. By 1998, contract operations averaged 3,700 more head than other operations, and the difference reached 4,500 head in 2004. Size of hog-finishing operations by business arrangement Head of hogs sold/removed per farm Sources: USDA, ERS using data from USDA's 1992 Farm Costs and Returns Survey and USDA 's 1998 Agricultural Resource Management Survey. 1998 1992 Other operations 0 2,000 4,000 6,000\n8,000\nContract hog-finishing operations produced 7,000 head annually on average in 2004, compared with 2,500 head on other operations.\nAmong farms with hogs, the average value of farm production from hog enterprises increased from 46 to 71 percent during 1992-2004 (see table 1 ). That is, hogs generated 71 percent of the total value of farm production on these farms. The share of farm product value from hogs increased most rapidly, from 35 to 72 percent, on hog-finishing operations. Sources of hog feed also indicate increasing specialization in hog production. Grain produced on the same farm accounted for half of the feed consumed by hogs in 1992 but fell below 20 percent by 2004. Again, specialized hogfinishing operations accounted for the fastest change (from 45 to 15 percent). The resulting farms-with greater shares of production value",378
58589,b17853de-dd78-47ce-b152-72f47f97bfb7,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNIGO/2 on each plate. Scaling was done by dividing the global average of each metabolite level by its average within the plate. These batch effect adjusted values are included in the Intermediate Data Level 2. Overall, this scaling factor was small (typically less than 10%), as can be seen by the raw reported NIST SRM-1950 values reported in Online-only Table 2 .\nThe next step of QC involved filtering based on quality metrics. We routinely applied filter criteria to each of the metabolites (based on the blinded ADNI 1 duplicates or ADNI 2/GO triplicates) to allow only the most robust analytes to be included in downstream analysis. Separately for each cohort, we used a coefficient of variation (CV) <30% across plates to filter out metabolites with limited variation and therefore statistical power for analysis. Next, we used an intraclass correlation coefficient (ICC) between the values for the blinded duplicate (or triplicate) analyes >0.6. Finally, analytes with >40% of measurements below the lower limit of detection (<LOD) were filtered out. This filtered data represents the Level 3 data matrix. This filtering reduced the total number of analytes reported from 20 analytes (Level 2) to 15 analytes (Level 3). The filter QC results are presented in detail in the Supplementary Table 1 .\nThe next step in data processing performs missing value replacement, by imputing any values reported as reported as '<LOD' were using LOD/2 value for each specific analyte. Additionally, we screened for outliers for removal prior to analysis. In ADNI 1, there were a total of 71 samples identified as outliers based on the following criteria: 69 samples identified as non-fasting, 2 samples lacking corresponding body mass index (BMI) values, and 1 for which no baseline medication record was",399
23165,8f26f413-cd04-4408-a52e-d4b74a5b851d,1,agricultural resource management survey|census of agriculture,Agricultural Resource Management Survey|Census of Agriculture,0,1761,1800,"factors beyond their control, such as weather (Carter 1998 , Fuller 1990 , Veeck, Che, and Veeck 2006 . Many agritourism activities make use of land unsuitable for crop and livestock production. Consequently, adding agritourism to a farm's operations does not necessarily displace crop and livestock production. In addition, agritourism mainly involves domestic suppliers and consumers; it suffers less than other agricultural commodities from global competition (Veeck, Che, and Veeck 2006) .\nThe local focus of agritourism can benefi t surrounding communities. Farm visitors who purchase local goods and services (including farm products) stimulate the community's economy 1 In our analysis explaining farmers' involvement in agritourism, we ran variations of the model that included individual dummy variables to represent farms specializing in grain, vegetables, fruits, cattle and calves, and dairy production. None of the resulting coeffi cients (shown with t-values in parentheses) was signifi cant: grains, 0.0047 (0.0114); cattle and calves, -0.047 (-0.192); dairy, -0.127 (-0.187); vegetables, -0.121 (-0.156); fruits, 0.183 (0.332) . These results suggest that agritourism does not signifi cantly displace these crop and livestock operations. (Saxena et al. 2007 ). As agritourism attracts urban dwellers to rural areas, urban tourists, especially teens and young adults, have an opportunity to develop an appreciation for rural people and places through their interactions with rural residents. In addition, agritourism can enhance a ""sense of place"" for local residents, giving them more reasons to stay and invest in their communities.\nThe magnitude of the local economic impacts of agritourism farms can be gauged by examining data from the 2007 Agricultural Resource Management Survey",404
1391,d0903d3f-9d76-4ad6-a145-0b10d4da1b46,1,education longitudinal study|national education longitudinal study,Education Longitudinal Study|National Education Longitudinal Study,1,-1,-1,"e as Group 4 except that the SAT-Math requirement is lowered to 600 (or 26 on ACT-Math), and the AP exam criterion is lowered to include those with scores of 3 or higher. This increases the group size to 28,000 and lowers the percent completing STEM bachelor's degrees to 79%-only a slight decline.\nGroup 6 is the same as Group 5 except that an AP score of 3 or more is needed-we do not substitute a science fair award for an AP score. Although this causes the group size to fall to 23,000, this more restrictive group does not have a higher percentage completing a STEM degree, but rather the percentage completing STEM degrees falls slightly to 77%. This suggests that a science fair award in high school is a useful predictor of STEM bachelor's degree completion. Further, it suggests that Group 6 is unnecessary. Group 5 is closer to the desired size and uses a relevant variable, science fair awards, that is not used in Group 6.\nThese findings are consistent with Lubinski and Benbow's (2006) study of mathematically precocious youth. For 13-year-old students testing in the top 1% of mathematic ability (approximately 30,000 students nationwide), Lubinski and Benbow found 62% of males and 54% of females eventually earned STEM baccalaureates. Statistics for those testing in the top 0.5% (approximately 15,000 students nationally) that earn STEM degrees are 76% of males and 61% respectively. Depending on definition, somewhere between 60% to 82% of likely PACE winners are completing STEM degrees without the PACE program. The direct marginal impact of the program would depend on the portion of non-STEM degree recipients who would complete a STEM degree because of PACE.\nSome, but not all, of the best students can be influenced to complete a STEM degree by prestigious scholarship awards.",386
195275,bd369b98-cb2e-4752-9950-a8a7c8f85e0e,0,alzheimer s disease neuroimaging initiative adni,Alzheimer's Disease Neuroimaging Initiative (ADNI),1,-1,-1,"ter-relationships between modalities. The proposed model is applied to evaluate the predictive power of MRI and CSF proteomic measurements towards cognitive outcomes. The empirical results demonstrate significant improvements over the state-of-the-arts competing models, and also yield stable multimodal biomarkers across cross-validation trials. We write matrices as boldface uppercase letters and vectors as boldface lowercase letters. Given a matrix M = (m ij ), its i-th row and j-th column are denoted as m i and m j respectively. The Frobenius norm and 2,1 -norm (also called as\nWe focus on multi-task learning paradigm, where MRI and CSF measures are used to predict one or more cognitive outcomes. Let {x 1 , , x n } d be MRI and CSF measures and {y 1 , , y n } c cognitive outcomes, where n is the number of samples, d is the number of predictors (feature dimensionality) and c is the number of response variables (tasks). Let X = [x 1 , . . . , x n ] and Y = [y 1 , . . . , y n ].\nThe 2,1 norm [4] is a multi-task version of traditional lasso. While lasso only focuses on the feature level sparsity, The 2,1 norm is proposed to couple multiple tasks together in addition to the original sparsity property: Yet in this model the rows of W are equally treated, which ignores the structures among predictors. Group-Sparse Multitask Regression and Feature Selection (G-SMuRFS) method [5] was proposed to exploit the structures within and between the predictors and response variables. It assumes 1) a partition scheme exists among predictors, and 2) predictors within one partition should have similar weights. G-SMuRFS can be thought of as a multi-task version of group lasso.\nIn practice, the relationship among predictors may not be as simple as a straightforward partition used by G",423
26432,aff7adeb-a6c0-46d6-b10f-ed57a679ce03,1,beginning postsecondary students|beginning postsecondary students longitudinal study,Beginning Postsecondary Students|Beginning Postsecondary Students Longitudinal Study,1,-1,-1,"n terms of high school math classes taken, high school grades, and SAT scores.\nCeteris paribus, more undergraduates who enter community college stop out-fail to enroll for a semester or more-during the first 6 years after entering college (a 10.47 percentage point difference). Two-year entrants are also significantly more likely to have a job during their first year of college, and (among those who do have a job) to work longer hours per week during term time. We have demonstrated that community college students are substantially less likely to earn a bachelor's degree than similar 4-year students after matching on a broad set of observable characteristics. But how sensitive are these results to the presence of an unmeasured confounder?\nWhen conducting a sensitivity analysis, we ask two questions simultaneously. The first has to do with the size of the hypothetical selection effect: How much more common would an unmeasured characteristic have to be in the treatment rather than the control group (or vice versa) to eliminate the observed treatment effect? We are asking here about the impact of the unobserved variable on assignment to treatment. The second has to do with the effect on the outcome: How much more common must an unmeasured confounder be among ""successes"" (BA-earners) than ""failures"" (non-earners) to account for observed differences? The unobserved characteristic has to affect both selection into treatment and the outcome (probability of ""success"") to be a confounder. A variable which has a strong relationship with the outcome but which is present in equal quantities in the treatment and control groups would not have any impact on treatment effects. 
Neither would a variable which is more commonly encountered in the treatment than in the control group but which",370
129614,0299b2af-66a8-47df-8f8c-11ddeabf22aa,0,adni,ADNI,0,1796,1800,"tive and accurate methods to detect AD at earlier stages and mark its progress through biomarkers. The initial goal of ADNI was to recruit 800 subjects, ages 55-90, including 200 normal controls, 400 individuals with MCI, and 200 subjects with mild AD at approximately 50 sites in the United States and Canada for longitudinal follow up. ADNI also aims to accurately track progression of the disease and devise tests to measure the effectiveness of potential interventions. Currently, the study involves over 1,000 participants, including people without memory problems, those with MCI, and patients with diagnosed AD. Early diagnosis of AD is key to the development, assessment, and monitoring of new treatments for AD. Approaches to characterize AD progression will help researchers and clinicians to develop new treatments and monitor their effectiveness. Further, being able to understand disease progression will increase the safety and efficacy of drug development and potentially decrease the time and cost of clinical trails. The ADNI project is a longitudinal study, where selected subjects are categorized into three BL diagnostic groups: CN, MCI, and AD, repeatedly over a 6-month or 1-year interval. The date when the subjects are scheduled to perform the screening becomes BL after approval and the time point for the follow-up visits is denoted by the duration starting from the BL. We use the notation Month 6 (M6) to denote the time point half year after the first visit. Currently, ADNI has up to Month 48 follow-up data available for some patients. However, many patients drop out from the study for many reasons.\nIn this work, we conduct empirical evaluation for the proposed methods on MRI data. The MRI features used in our experiments are based on the imaging data from the ADNI",355
107136,b5fecf00-46a7-4a62-b03c-b79599a68cb8,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI-1 in other independent data sets [37] .\nPrior PMD has received research grants and/or advisory fees from several government agencies, advocacy groups and pharmaceutical/imaging companies. PMD received a grant from ADNI to support data collection for this study and he owns stock in Sonexa, Maxwell, Adverse Events and Clarimedix, whose products are not discussed here.",81
169885,ce895b10-9305-4baf-97e7-473a69c77d9f,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),1,-1,-1,"han 1.5). The curves in 269 black are average values (1 standard deviation) of the individualized inferred seeds. The curves in red 270 correspond to the inferred seed from the population average atrophy-pattern. The discrepancy between 271 curves for higher lambda values indicate the steps of the algorithm do not commute, i.e., that finding seeds 272 and calculating their average (black) will lead to results that are significantly different than averaging the 273 atrophies and inferring a seed from that averaged value. This strongly advocates for seed inference using 274 patient-specific atrophy patterns as opposed to population averages, although the two curves are not too 275 far apart for our selected value of = 0.25. All inferred seeds presented in our plots and tables were 276 found using this value. i.e., that finding seeds and calculating their average (black) will lead to results that are significantly 288 different than averaging the atrophies and inferring a seed from that averaged value. Notice that an 289 increase in sparsity is correlated with an increase in the data/model mismatch and vice-versa. Thus, we 290 choose lambda = 0.25 as it yields both a tractable number of non-zero entries in the seeds vector (6 out 291 of 78) and a tolerable mismatch (value = 0.63) 292 3.3 Individualized seeds are more predictive of future atrophy 293 The inferred seed vectors varied significantly from patient to patient. In this subsection, we demonstrate 294 that inferred seed are more predictive of atrophy pattern than a common seed located at a single region, The algorithm is as follows: respectively. The R-max values associated with (i) are typically higher than the ones associated with (ii) 314 and (iii), demonstrating that inferred individualized seed patterns lead to",403
189617,c9023ee5-25c3-453a-96de-9564767c34e1,0,trends in international mathematics and science study,Trends in International Mathematics and Science Study,0,1747,1800,"probability of failure as opposed to just a statement which tells them that they are 'at risk' or that they are not 'at risk'. It provides a student with some insight into the extent to which they may need extra mathematical support, if at all, as opposed to telling them that they do or do not need it. In addition to this, the discriminant function is a classification method which is evidence based as opposed to the diagnostic test classification which is based on a subjective expert opinion only. The prediction analysis carried on in this research therefore did improve somewhat on the current system that was in place (i.e the diagnostic test). The discriminant analysis also helped to refine the cut-off point of the diagnostic test slightly by establishing that a student who receives 18/40 or below is 'at risk' of failing service mathematics as opposed to the 19/40 cut-off point which is currently in operation. The examination of students' probabilities of failure in service mathematics on entry to higher education and allowing for appropriate mathematics remediation to be put in for specific cohorts of students is one way of attempting to maintain degree standards in higher education.\nThe fourth and final research question contained within this research was an examination into what effect (if any) gender had on mathematics attainment in 3 rd level education. The effect of gender on mathematics attainment and consequently its strength as a predictor of service mathematics performance was found to be negligible.\nThis finding reiterates what was outlined in a meta-analysis of 100 studies between 1963-1988 in which gender differences in mathematics performance were not found to be significant [61] . A more recent TIMSS (Trends in International Mathematics and Science Study",352


In [12]:
# Persist the training rows without the pandas index.
output_path = "output/train.parquet"
df.to_parquet(output_path, index=False)