In [1]:
import pandas as pd
import json
import numpy as np
from utils import *
from utils import _jaccard_similarity
from tqdm.auto import tqdm
from transformers import *
from fuzzywuzzy import fuzz
import pickle
tqdm.pandas()

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
window_size = 32
max_sequence_length = 256
model_name = "gpt"
use_lower = False

!mkdir ../input/pickled
!mkdir ./models

In [9]:
# Spliting train data to overlapping windows
df = pd.read_csv("../input/train.csv")
train_df = pd.DataFrame()
texts = []
pub_titles = []
labels = []
ids = []
for idx, row in tqdm(df.iterrows(),total=len(df)):
    x = json.load(open(f"../input/train/{row.Id}.json","rt"))
    article = ""
    for section in x:
        raw_text = " ".join(section["text"].replace("\n", " ").split())
        article += raw_text
        article += " "
#     article =  clean_text(article)
    input_ids = tokenizer.encode(article, add_special_tokens=False)
    n_samples = math.ceil(len(input_ids)/(max_sequence_length - window_size))
    label = row.dataset_label
    for sample_idx in range(n_samples):
        start = max(0, (max_sequence_length - window_size)*sample_idx)
        end = start + max_sequence_length
        curr_ids = input_ids[start: end]
        curr_text = tokenizer.decode(curr_ids)
        texts.append(curr_text)
        if label in curr_text:
            labels.append(label)
        else:
            labels.append("")
        pub_titles.append(row.pub_title)
        ids.append(row.Id)
train_df["Id"] = ids
train_df["pub_title"] = pub_titles
train_df["text"] = texts
train_df["label"] = labels
train_df = train_df.fillna("")
train_df.to_csv(f"../input/train_processed_{max_sequence_length}.csv",index=False)

  0%|          | 0/19661 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2072 > 1024). Running this sequence through the model will result in indexing errors


In [10]:
train_df = pd.read_csv(f"../input/train_processed_{max_sequence_length}.csv")
train_df = train_df.fillna("")
train_df.head()

Unnamed: 0,Id,pub_title,text,label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,This study used data from the National Educati...,National Education Longitudinal Study
1,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,These programs are intended to improve colleg...,
2,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,author also examined whether students who ear...,
3,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,research described in this report meets WWC e...,
4,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,B. The study was based on secondary data from...,National Education Longitudinal Study


In [11]:
# # Finding missing abbreviation of training labels

acronyms = {
    "alzheimer's disease neuroimaging initiative": "adni",
    "alzheimer's disease cooperative study": "adcs",
    "baltimore longitudinal study of aging":  "blsa",
    "covid-19 open research dataset": "cord-19",
#     "census of agriculture": "coa",
    "characterizing health associated risks and your baseline disease in sars cov 2": "charybdis",
    "nces core of common data": "ccd",
    "complexity science hub covid 19 control strategies list": "cccsl",
    "coastal change analysis program": "c-cap",
    "noaa national water level observation network": "nwlon",
    "noaa sea lake and overland surges from hurricanes": "slosh",
    "international best-track archive for climate stewardship": "ibtracs",
    "optimum interpolation sea surface temperature": "oisst",
    "rural-urban continuum codes": "ruccs",
    "north american breeding bird survey": "bbs",
    "aging integrated database": "agid",
    "national institute on aging genetics of alzheimer s disease data storage site": "niagads",
#     "agricultural and resource management survey": "arm",
    "baccalaureate and beyond longitudinal study": "b&b",
    "early childhood longitudinal study": "ecls",
    "national longitudinal transition study": "nlts",
    "national education longitudinal studies": "nels",
    "high school longitudinal study": "hsls",
    "national assessment of education progress": "naep",
    "noaa world ocean database": "wod",
    "survey of doctorate recipients": "sdr",
    "international survey of doctoral recipients": "isdr",
#     "survey of earned doctorates": "sed",
    "survey of industrial research and development": "sird",
    "national teacher and principal survey": "ntps",
    "international assessment of adult competencies": "piaac",
    "rsna international covid 19 open radiology database": "ricord",
    "survey of doctorate recipients": "sdr",
    "school survey on crime and safety": "ssocs",
    "trends in international mathematics and science survey": "timss",
    "national postsecondary student aid survey": "npsas",
#     "postsecondary longitudinal studies" : "pls"
}
for key, val in acronyms.items():
    acronyms[key] = val.upper()

In [12]:
# Splitting to train and val
val_groups = [clean_text(x.strip()) for x in open("../input/val_groups.txt","rt").readlines()]

train_groups_orig = [x for x in train_df.label.unique() if clean_text(x) not in val_groups and len(x) > 0]
train_groups_orig.extend([x for x in acronyms.values()])
train_groups_orig  = list(set(train_groups_orig))
train_groups_orig = np.array(train_groups_orig)
train_groups_orig = train_groups_orig[np.argsort([len(x) for x in train_groups_orig])][::-1]

train_groups = [clean_text(x) for x in train_df.label.unique() if clean_text(x) not in val_groups and len(x) > 0]
train_groups.extend([clean_text(x) for x in acronyms.values()])
train_group  = list(set(train_groups))

val_groups = np.array(val_groups)
val_groups = val_groups[np.argsort([len(x) for x in val_groups])][::-1]
train_groups = np.array(train_groups)
train_groups = train_groups[np.argsort([len(x) for x in train_groups])][::-1]

In [13]:
def find_valid(row):
    text = clean_text(row.text)
    has_train = False
    has_val = False
    for label in train_groups:
        if label in text:
            if len(label.split()) > 1 or " "+label in text:
                has_train = True
                break
    for label in val_groups:
        if label in text:
            has_val = True
            break  
    # invalid samples are the ones that belong to both train and val set
    if has_train and has_val:
        return None
    if has_train and (not has_val):
        return "train"
    if has_val and (not has_train):
        return "val"
    # random negative sampling
    if not(has_train or has_val):
        return "train" if np.random.rand() > 0.2 else "val"
    
train_df["group"] = train_df.progress_apply(find_valid, axis=1)

In [15]:
val_df = train_df[train_df.group == "val"]
train_df = train_df[train_df.group == "train"]

In [16]:
def find_val_label(x):
    labels = []
    x = clean_text(x)
    for label in val_groups:
        if label in x:
            labels.append(label)
    return "|".join(labels)
val_df.label = val_df.text.progress_apply(find_val_label)
val_df = val_df.fillna("")
val_df.to_csv(f"../input/val_processed_{max_sequence_length}.csv",index=False)

  0%|          | 0/197751 [00:00<?, ?it/s]

In [14]:
def find_train_label(x):
    for label in train_groups_orig:
        if label in x:
            if len(label.split()) > 1 or " "+label+" " in x:
                return label
    return ""
train_df.label = train_df.text.progress_apply(find_train_label)

  0%|          | 0/816811 [00:00<?, ?it/s]

In [17]:
# We take all the positive samples and 5% of all negative samples
train_df["will_take"] = train_df.label.apply(lambda x: len(x) > 0 or np.random.rand() < 0.05)
train_df[train_df.will_take][train_df.columns[:-1]].to_csv(f"../input/train_sampled_{max_sequence_length}.csv",index=False)
train_df.head()

Unnamed: 0,Id,pub_title,text,label,group,will_take
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,This study used data from the National Educati...,National Education Longitudinal Study,train,True
1,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,These programs are intended to improve colleg...,,train,False
2,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,author also examined whether students who ear...,,train,False
3,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,research described in this report meets WWC e...,,train,False
4,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,B. The study was based on secondary data from...,National Education Longitudinal Study,train,True


In [18]:
from utils import *

In [19]:
# Tokenizing the validation set
val_df = pd.read_csv(f"../input/val_processed_{max_sequence_length}.csv")
val_df = val_df.fillna("")
if use_lower:
    val_df.text = val_df.text.apply(lambda x: x.lower())
outputs, type_outputs = convert_lines(tokenizer,val_df,is_test=True,max_sequence_length=max_sequence_length)
pickle.dump((outputs, type_outputs), open(f"../input/pickled/val_{model_name}_{max_sequence_length}.pkl","wb"))

  0%|          | 0/197751 [00:00<?, ?it/s]

In [4]:
## Data augmentation
train_df  = pd.read_csv(f"../input/train_sampled_{max_sequence_length}.csv").fillna("")
if use_lower:
    train_df.text = train_df.text.apply(lambda x: x.lower())
    train_df.label = train_df.label.apply(lambda x: x.lower())
    
# Most of augmentated titles come from Kaggle
kaggle_df = pd.read_csv("../input/kaggle_datasets.csv")
kaggle_df = kaggle_df[(kaggle_df.keywords.str.len() > 0) & (kaggle_df.license_name != "Unknown")]

def is_valid(x):
    return x[0].isalpha() and x[0].istitle() and "_" not in x and len(x.split()) > 1
db_names = list(set([x for x in kaggle_df.title.values if is_valid(x)]))[:7000]

# https://github.com/awesomedata/awesome-public-datasets
db_names.extend([x.strip() for x in open("../input/db_names.txt") if is_valid(x)])

# https://www.usda.gov/sites/default/files/documents/data.json
col_data = json.load(open("../input/data_set_citations.json","rt"))
all_mentions = []
for citation in col_data:
    mentions = citation["mention_list"]
    all_mentions.extend(mentions)
all_mentions = [x.encode("ascii","ignore") for x in all_mentions if len(x.split()) > 2 or np.random.rand() < 0.1]
db_names.extend(all_mentions)
db_names = list(set(db_names))
db_names = np.array(db_names)

if use_lower:
    db_names = [x.lower() for x in db_names]
print(len(db_names),db_names[-10:])

10226 ['National Survey of Midlife in the United States'
 'Study to Understand Prognoses and Preferences for Outcomes and Risks of Treatments'
 'World Values Surveys and European Values Surveys'
 'drug abuse treatment outcome study DATOS' 'MTS'
 'The Atlas of Economic Complexity' 'Active for Life'
 'National Survey of Adolescents'
 'Hispanic Health and Nutrition Examination Survey'
 'Community Tracking Study (CTS) household survey']


In [23]:
# Replacing original labels with augmented labels

new_texts = []
new_labels = []
for idx, row in tqdm(train_df.iterrows(),total=len(train_df)):
    if len(row.label.strip()) == 0:
        new_texts.append(row.text)
        new_labels.append("")
        continue
    try:
        start = row.text.index(row.label.strip())
    except Exception as e:
        new_texts.append(row.text)
        new_labels.append("")
        print(e)
        continue
    end = start + len(row.label.strip())
    new_label = row.label.strip()
    if np.random.rand() < 0.95:
        new_label = np.random.choice(db_names,size = 1)[0]
    try:
        new_texts.append(row.text[:start]+new_label+row.text[end:])
        new_labels.append(new_label)
    except Exception as e:
        print(e)
        break

  0%|          | 0/80656 [00:00<?, ?it/s]

substring not found
substring not found


In [24]:
train_aug = train_df.copy()
train_aug.text = new_texts
train_aug.label = new_labels

In [26]:
# Tokenizing the training set
outputs, type_outputs, position_outputs, offset_outputs, df = convert_lines(tokenizer,train_aug,max_sequence_length=max_sequence_length)
pickle.dump((outputs, type_outputs, position_outputs, offset_outputs), open(f"../input/pickled/train_aug_{model_name}_{max_sequence_length}.pkl","wb"))

  0%|          | 0/80656 [00:00<?, ?it/s]

[2351, 5882, 35118, 7276, 28095, 893] 2
[5401, 5652, 1012, 2334, 286, 27049, 406, 21963, 1112, 376, 8789] 2
[32519, 399, 19930] 2
[3611, 5483, 28095, 893] 2
[8108, 37169, 12481, 1594, 43240, 290, 7795, 2079, 37306, 28095, 893, 290, 7281, 1891, 28095, 893] 2
[29576, 273, 31084, 707, 1872] 2


In [2]:
!ls -halt ../input/pickled

In [28]:
val_df = pd.read_csv(f"../input/val_processed_{max_sequence_length}.csv")
val_df = val_df.fillna("")
for x in val_df.label.unique():
    for y in train_df.label.unique():
        if len(x) == 0 or len(y) == 0:
            continue
        if _jaccard_similarity(x,y) >= 0.5:
            print(x,"|",y)