In [1]:
import math
import re 
import string
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaModel
import torch

TODO:

    - Check how acronyms are handled when matching dataset names
    - Avoid overfitting on dataset names

## 1 - Chunking Documents

In [2]:
# Load the raw train / test frames; the first CSV column is used as the index.
coleridge_df = pd.read_csv('./data/dataset/coleridge_df.csv', index_col = 0)
coleridge_test = pd.read_csv('./data/dataset/coleridge_test.csv', index_col = 0)
# The columns keep their original names (`body`, `dataset_label`): the cells
# below read `coleridge_df.body`, `coleridge_df.dataset_label` and
# `coleridge_test.body`. The previous `.rename({...})` calls passed the mapping
# positionally, which targets the *index* labels rather than the columns, so
# they never renamed anything and have been removed.

In [3]:
# Preview the training frame (rich display of the last expression in the cell).
coleridge_df

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,body
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This study used data from the National Educati...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Dropping out of high school is not necessarily...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,", stress satisfactory outcomes for all youth,..."
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Federal Reserve Bank of Richmond S1. Accountin...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This article investigates an important factor ...
...,...,...,...,...,...,...
19656,b3498176-8832-4033-aea6-b5ea85ea04c4,RSNA International Trends: A Global Perspectiv...,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID Open Radiology Database,rsna international covid open radiology database,Our lives have been fundamentally altered this...
19657,f77eb51f-c3ac-420b-9586-cb187849c321,MCCS: a novel recognition pattern-based method...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,"The outbreak of the coronavirus disease 2019 ,..."
19658,ab59bcdd-7b7c-4107-93f5-0ccaf749236c,Quantitative Structure–Activity Relationship M...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,The ongoing COVID-19 pandemic has challenged t...
19659,fd23e7e0-a5d2-4f98-992d-9209c85153bb,A ligand-based computational drug repurposing ...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,deployment of approximative mathematical model...


### Train - Dev

In [1]:
def replace_misspells_answer(document, answer) :
    """Normalise how a dataset name is written inside a document.

    The function first applies a few hand-written replacements for known
    problematic mentions (Alzheimer, NOAA, BLSA), then rewrites every
    occurrence of ``answer`` so that it is spelled exactly like ``answer``
    and is surrounded by single spaces.

    Parameters
    ----------
    document : str
        Raw document text.
    answer : str
        Dataset label expected in the document. It is matched literally
        (regex metacharacters are escaped).

    Returns
    -------
    str
        The normalised document. When ``answer`` contains no token, only the
        hand-written replacements and the double-space collapse are applied.
    """
    # Handle special cases by hand
    if 'Alzheimer' in document :
        document = document.replace("'s", 's')

    if 'NOAA' in document :
        document = document.replace('NOAA / National', 'NOAA National')
        document = document.replace('water-level', 'water level')
        # Already covered by the first replacement above; kept for safety.
        document = document.replace('NOAA / National Water Level Observation Network',  'NOAA National Water Level Observation Network')

    if 'Baltimore Longitudinal Study of Aging' in document :
        document = document.replace('Baltimore Longitudinal Study of Aging, BLSA', 'Baltimore Longitudinal Study of Aging (BLSA)')
        document = document.replace('Baltimore Longitudinal Study of Aging (BLSA; represented below the gene)', 'Baltimore Longitudinal Study of Aging (BLSA)')
        document = document.replace('Baltimore Longitudinal Study of Aging (BLSA-3T)', 'Baltimore Longitudinal Study of Aging (BLSA)')
        document = document.replace('BALTIMORE LONGITUDINAL STUDY OF AGING', 'Baltimore Longitudinal Study of Aging (BLSA)')

    answer_tokens = answer.split()
    if not answer_tokens :
        # Nothing to look for: an empty pattern would match everywhere and
        # there is no first word to treat as an acronym.
        return document.replace('  ', ' ')

    # Escape the whole label so that characters such as '(', '+', '[' or '.'
    # are matched literally instead of being interpreted as regex syntax.
    answer_pattern = re.escape(answer)

    # Replace the character on each side of the dataset name with a space.
    # '.' matches any character (not only punctuation), so the name must have
    # a neighbour on both sides to be matched here. Callables are used as
    # replacement so that backslashes in the label are inserted verbatim.
    document = re.sub('.' + answer_pattern + '.', lambda _m: ' ' + answer + ' ', document)

    # Case-insensitive match: force the exact casing of the label
    document = re.sub(answer_pattern, lambda _m: answer, document, flags = re.I)

    # Same surrounding-character cleaning for the first word of the label
    # (treated as the acronym).
    accr = answer_tokens[0]
    document = re.sub('.' + re.escape(accr) + '.', lambda _m: ' ' + accr + ' ', document)

    # Replace double whitespaces
    document = document.replace('  ', ' ')

    return document

def clean_text(text):
    """Lower-case ``text`` and strip punctuation, digits and isolated letters.

    Steps, in order: every punctuation character except the apostrophe is
    replaced by a space, digits are replaced by spaces, single word
    characters surrounded by spaces are removed, runs of whitespace (and
    single letters caught between them) are collapsed, and the result is
    re-joined with single spaces.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Keep the apostrophe so that possessives / contractions stay in one token.
    punct = string.punctuation.replace("'","")
    text = re.sub('[%s]' % re.escape(punct), ' ', str(text).lower())
    # Raw strings: '\d', '\w' and '\s' are regex escapes, not Python ones.
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r' \w ', ' ', text)
    text = re.sub(r'\s+([a-zA-Z]\s+)*', ' ', text)
    return " ".join(text.split())

def find_answer(document, answer) :
    """Locate every occurrence of ``answer`` in ``document`` at word level.

    Both strings are split on whitespace and the answer tokens are searched
    as a contiguous sub-sequence of the document tokens.

    Parameters
    ----------
    document : str
        Text to search in.
    answer : str
        Text to search for.

    Returns
    -------
    list of [int, int]
        ``[start, end]`` token positions (end exclusive) of each match, in
        document order. Empty when there is no match or when ``answer``
        contains no token.
    """
    doc_tokens = document.split()
    answer_tokens = answer.split()
    span = len(answer_tokens)

    # An empty answer would otherwise "match" at every position.
    if span == 0 :
        return []

    return [
        [start, start + span]
        for start in range(len(doc_tokens) - span + 1)
        if doc_tokens[start:start + span] == answer_tokens
    ]

def chunking_documents(documents, answers, tokenizer, max_seq_length, sliding_window) :
    """Split each document into overlapping token windows and locate its answer.

    Each document is first normalised with ``replace_misspells_answer``,
    tokenised, cut into windows of ``max_seq_length`` tokens whose starts are
    ``sliding_window`` tokens apart, and every window is decoded back to text.
    The answer is then searched in the cleaned text of each window.

    Parameters
    ----------
    documents : iterable of str
        Raw document texts.
    answers : iterable of str
        Dataset label associated with each document (same order).
    tokenizer : object
        Must expose ``encode(text, add_special_tokens=False)`` returning a
        list of ids and ``decode(ids)`` returning a string.
    max_seq_length : int
        Number of tokens per window.
    sliding_window : int
        Stride, in tokens, between the starts of two consecutive windows.

    Returns
    -------
    tuple of three lists, each with one entry per document:
        * the decoded text of every chunk,
        * the answer repeated once per chunk,
        * for every chunk, the ``[start, end]`` word positions of the answer
          in the cleaned chunk text (empty list when absent).
    """
    documents_chunked = []
    aug_answers = []
    all_answers_pos = []
    for document, answer in tqdm(zip(documents, answers)) :

        document = replace_misspells_answer(document, answer)

        input_ids = tokenizer.encode(document, add_special_tokens = False)
        document_length = len(input_ids)

        # Always emit at least one window: the raw formula goes to zero for
        # documents of at most `max_seq_length - sliding_window` tokens,
        # which silently dropped short documents.
        n_windows = max(1, math.ceil((document_length - max_seq_length) / sliding_window) + 1)

        # The cleaned answer does not depend on the window.
        cleaned_answer = clean_text(answer)

        chunks, chunks_answers_pos = [], []
        for i in range(n_windows) :
            start = i * sliding_window
            chunk_doc = tokenizer.decode(input_ids[start:start + max_seq_length])

            chunks.append(chunk_doc)
            chunks_answers_pos.append(find_answer(clean_text(chunk_doc), cleaned_answer))

        all_answers_pos.append(chunks_answers_pos)
        documents_chunked.append(chunks)
        aug_answers.append([answer] * len(chunks))

    return documents_chunked, aug_answers, all_answers_pos

In [5]:
# Raw texts and their dataset labels, in the same row order.
documents = coleridge_df.body.to_numpy()
answers = coleridge_df.dataset_label.to_numpy()

# Windows of 490 tokens whose starts are 440 tokens apart, i.e. consecutive
# chunks share 50 tokens.
# NOTE(review): 490 is presumably chosen to stay under the 512-token limit
# mentioned in the tokenizer warning below — confirm.
max_seq_length = 490
overlapping_size = 50
sliding_window = max_seq_length - overlapping_size
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# NOTE: `answers` is rebound here from the label array to the per-document
# list of labels repeated once per chunk.
documents_chunked, answers, answers_pos = chunking_documents(documents, answers, tokenizer, max_seq_length, sliding_window)

# One row per document; every cell holds a list with one entry per chunk.
chunked_dataset = {
    'document' : documents_chunked,
    'answer' : answers,
    'answer_pos' : answers_pos
}

chk_coleridge_df = pd.DataFrame(chunked_dataset)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Token indices sequence length is longer than the specified maximum sequence length for this model (2072 > 512). Running this sequence through the model will result in indexing errors





In [6]:
# Turn the per-document lists into one row per chunk; the index keeps the
# row number of the source document, so it repeats across chunks.
# NOTE: re-running this cell explodes the already exploded frame again.
chk_coleridge_df = chk_coleridge_df.explode(['document', 'answer', 'answer_pos'])
chk_coleridge_df

Unnamed: 0,document,answer,answer_pos
0,This study used data from the National Educati...,National Education Longitudinal Study,"[[6, 10]]"
0,Appendix D for more information). The study a...,National Education Longitudinal Study,[]
0,collected in 2000. For a more detailed descri...,National Education Longitudinal Study,"[[71, 75], [103, 107]]"
0,family income and prior achievement. In respo...,National Education Longitudinal Study,[]
0,the improvement index represents the gain or ...,National Education Longitudinal Study,[]
...,...,...,...
19660,to evaluate compound similarity by quantifyin...,CAS COVID-19 antiviral candidate compounds data,[]
19660,"used as an input. For example, remdesivir was...",CAS COVID-19 antiviral candidate compounds data,[]
19660,(2 unique Murcko scaffolds) with activity < 1...,CAS COVID-19 antiviral candidate compounds data,[]
19660,"COVID-19 pandemic, this course was conceptual...",CAS COVID-19 antiviral candidate compounds data,[]


In [7]:
def check_has_answer(answer_pos) :
    """Tell whether a chunk contains at least one answer span.

    Parameters
    ----------
    answer_pos : list or scalar
        List of ``[start, end]`` positions for the chunk. Values without a
        length (e.g. the NaN that ``DataFrame.explode`` emits for an empty
        list) are treated as "no answer".

    Returns
    -------
    bool
    """
    try :
        return len(answer_pos) > 0
    except TypeError :
        # Not a sized container (NaN / None): there is no span to count.
        return False
    
# Flag every chunk according to whether its list of answer positions is non-empty.
chk_coleridge_df['has_answer'] = chk_coleridge_df['answer_pos'].apply(check_has_answer)

In [8]:
# Persist the chunked training frame; the context manager closes the file
# handle, which a bare `open(...)` passed to `pkl.dump` never did.
with open('./data/dataset/chunked_coleridge_df.pkl', 'wb') as f :
    pkl.dump(chk_coleridge_df, f)

### Test

In [14]:
def chunking_documents_inference(documents, tokenizer, max_seq_length, sliding_window) :
    """Split each document into overlapping token windows (no answer lookup).

    Every document is tokenised, cut into windows of ``max_seq_length``
    tokens whose starts are ``sliding_window`` tokens apart, and every window
    is decoded back to text.

    Parameters
    ----------
    documents : iterable of str
        Raw document texts.
    tokenizer : object
        Must expose ``encode(text, add_special_tokens=False)`` returning a
        list of ids and ``decode(ids)`` returning a string.
    max_seq_length : int
        Number of tokens per window.
    sliding_window : int
        Stride, in tokens, between the starts of two consecutive windows.

    Returns
    -------
    list of list of str
        For each document, the decoded text of its chunks (at least one).
    """
    documents_chunked = []
    for document in tqdm(documents) :

        input_ids = tokenizer.encode(document, add_special_tokens = False)
        document_length = len(input_ids)

        # Always emit at least one window: the raw formula goes to zero for
        # documents of at most `max_seq_length - sliding_window` tokens,
        # which silently dropped short documents.
        n_windows = max(1, math.ceil((document_length - max_seq_length) / sliding_window) + 1)

        chunks = [
            tokenizer.decode(input_ids[i * sliding_window:(i * sliding_window) + max_seq_length])
            for i in range(n_windows)
        ]

        documents_chunked.append(chunks)

    return documents_chunked

In [18]:
# NOTE: `documents`, the window settings and `tokenizer` are rebound here and
# shadow the values used for the training frame.
documents = coleridge_test.body.to_numpy()

# Same windowing as for the training frame: 490-token chunks, 50 tokens shared
# between consecutive chunks.
max_seq_length = 490
overlapping_size = 50
sliding_window = max_seq_length - overlapping_size
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

test_documents_chunked = chunking_documents_inference(documents, tokenizer, max_seq_length, sliding_window)

# One row per document, each holding the list of its chunks ...
test_chunked_dataset = {
    'document' : test_documents_chunked,
}

# ... then one row per chunk; the index keeps the source document row number.
test_chk_coleridge_df = pd.DataFrame(test_chunked_dataset)
test_chk_coleridge_df = test_chk_coleridge_df.explode(['document'])
test_chk_coleridge_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (5285 > 512). Running this sequence through the model will result in indexing errors





Unnamed: 0,document
0,Cognitive deficits and reduced educational ach...
0,performance is associated with lower educatio...
0,with completion of college (rs11584700 and rs...
0,"et al., 2013) The next (and ongoing) wave of ..."
0,have genotypes for the SNPs of interest were ...
...,...
3,"grains, poultry and cured meats. In this sect..."
3,characteristics and locations of our sample o...
3,account for some of the differences in our fi...
3,shoppers in the NE region. We may also be wit...


In [21]:
# NOTE(review): './data/dataset/test.pkl' is also the path the train/dev/test
# split is dumped to further down (`pkl.dump(test, ...)`), so this file gets
# overwritten by the held-out split — confirm which one consumers expect.
# The context manager closes the file handle once the dump is done.
with open('./data/dataset/test.pkl', 'wb') as f :
    pkl.dump(test_chk_coleridge_df, f)

## 2 - Splitting train, dev & test sets

In [2]:
# Reload the chunked frame written in section 1; the context manager closes
# the file handle once the frame is loaded.
with open('./data/dataset/chunked_coleridge_df.pkl', 'rb') as f :
    chk_coleridge_df = pkl.load(f)

In [3]:
# The split is done on the unique dataset labels, not on rows, so a given
# label ends up in exactly one of train / dev / test.
SPLIT_SEED = 42  # fixed seed so that the label split is reproducible

answer = chk_coleridge_df['answer'].unique()
has_answer = chk_coleridge_df['has_answer'].to_numpy()

# 20% of the labels go to dev, then 20% of the remaining labels go to test.
train_answer, dev_answer = train_test_split(answer, test_size = 0.2, random_state = SPLIT_SEED)
train_answer, test_answer = train_test_split(train_answer, test_size = 0.2, random_state = SPLIT_SEED)

In [13]:
# Chunks whose dataset label was assigned to the training split.
train = chk_coleridge_df.loc[chk_coleridge_df['answer'].isin(train_answer)]
print(f"Number of texts that contain an answer : {train['has_answer'].eq(True).sum()}")
print(f"Number of texts that do not contain an answer : {train['has_answer'].eq(False).sum()}")

Number of texts that contain an answer : 29057
Number of texts that do not contain an answer : 319156


In [14]:
# Chunks whose dataset label was assigned to the dev split.
dev = chk_coleridge_df.loc[chk_coleridge_df['answer'].isin(dev_answer)]
print(f"Number of texts that contain an answer : {dev['has_answer'].eq(True).sum()}")
print(f"Number of texts that do not contain an answer : {dev['has_answer'].eq(False).sum()}")

Number of texts that contain an answer : 6819
Number of texts that do not contain an answer : 110384


In [15]:
# Chunks whose dataset label was assigned to the held-out test split.
test = chk_coleridge_df.loc[chk_coleridge_df['answer'].isin(test_answer)]
print(f"Number of texts that contain an answer : {test['has_answer'].eq(True).sum()}")
print(f"Number of texts that do not contain an answer : {test['has_answer'].eq(False).sum()}")

Number of texts that contain an answer : 3536
Number of texts that do not contain an answer : 57861


In [27]:
# Persist the three splits; each context manager closes its file handle.
with open('./data/dataset/train.pkl', 'wb') as f :
    pkl.dump(train, f)
with open('./data/dataset/dev.pkl', 'wb') as f :
    pkl.dump(dev, f)
# NOTE(review): this path is the same one the chunked inference set
# (`test_chk_coleridge_df`) was dumped to in section 1, so that file is
# overwritten here — confirm which content consumers expect.
with open('./data/dataset/test.pkl', 'wb') as f :
    pkl.dump(test, f)

## 3 - Downsampling negatives (chunks without an answer)

In [49]:
# Reload the three splits; each context manager closes its file handle.
with open('./data/dataset/train.pkl', 'rb') as f :
    train = pkl.load(f)
with open('./data/dataset/dev.pkl', 'rb') as f :
    dev = pkl.load(f)
with open('./data/dataset/test.pkl', 'rb') as f :
    test = pkl.load(f)

In [50]:
def downsampling_FN(dataframe, random_state = None) :
    """Randomly drop rows without an answer to balance the two classes.

    Every row with ``has_answer == True`` is kept. Each row with
    ``has_answer == False`` is dropped independently with probability
    ``1 - n_true / n_false``, so that on average as many negative rows as
    positive rows remain. When there are at least as many positive rows as
    negative ones, nothing is dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``has_answer`` column. Its index is expected to be
        unnamed, as ``reset_index`` must produce an ``index`` column.
    random_state : int, optional
        Seed for a dedicated random generator. With the default ``None`` the
        draws come from NumPy's global random state, as before.

    Returns
    -------
    pandas.DataFrame
        The kept rows, in their original order, indexed by their original
        index values (the index is named ``index``).
    """
    reset_df = dataframe.reset_index()
    negatives = reset_df[reset_df.has_answer == False]
    positives = reset_df[reset_df.has_answer == True]

    n_true = len(positives)
    n_false = len(negatives)

    # Probability of dropping a negative row. Without negative rows there is
    # nothing to drop (and the ratio would divide by zero).
    drop_ratio = 1 - (n_true / n_false) if n_false else 0.0

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # One uniform draw per negative row; the row is kept when the draw is not
    # below the drop probability.
    keep = rng.random_sample(n_false) >= drop_ratio

    keep_index = np.concatenate((negatives.index.to_numpy()[keep], positives.index.to_numpy()))

    downsample_df = reset_df[reset_df.index.isin(keep_index)]

    # Restore the original index values saved by `reset_index`.
    return downsample_df.set_index('index')
    

In [51]:
# `downsampling_FN` draws from NumPy's global random state; seeding it here
# makes the three downsampled sets reproducible across kernel restarts.
np.random.seed(42)

train_ds = downsampling_FN(train)
dev_ds = downsampling_FN(dev)
test_ds = downsampling_FN(test)

In [52]:
# Persist the downsampled splits; each context manager closes its file handle.
with open('./data/dataset/train_ds.pkl', 'wb') as f :
    pkl.dump(train_ds, f)
with open('./data/dataset/dev_ds.pkl', 'wb') as f :
    pkl.dump(dev_ds, f)
with open('./data/dataset/test_ds.pkl', 'wb') as f :
    pkl.dump(test_ds, f)