# Preprocessing for PubMed Dataset into a Incremental manner

## 1. Load Dataset

In [4]:
import pandas as pd
train_df = pd.read_json('../datasets/pubmed-dataset-copy/train.json', lines=True)
val_df = pd.read_json('../datasets/pubmed-dataset-copy/val.json', lines=True)
test_df = pd.read_json('../datasets/pubmed-dataset-copy/test.json', lines=True)

## 2. Dataset Preprocessing

In [1]:
segmentation_keyword_table = {
    # Introduction and Literature
    'part_1': ['introduction', 'case', 'objectives', 'purposes', 
               'objective', 'purpose', 'background', 'literature',
               'aim', 'aims'],
    
    # Methods
    'part_2': ['material and methods',
               'materials and methods', 'methods', 'techniques', 'methodology',
               'materials', 'research design', 'study design'],
    
    # Results
    'part_3': ['result', 'results', 'experiments', 'observations'],
    
    # Discussion and Conlusion
    'part_4': ['discussion', 'limitation', 'conclusions', 
               'conclusion', 'concluding', 'comment', 'comments', 
               'summary', 'concluding remarks'],
}

In [5]:
def sec_join(df):
    # 每一行
    sample_list = []
    for sample in df['sections']:
        # 每一个sample的每一个section
        sections = []
        for section in sample: 
            temp = " ".join(section)
            sections.append(temp)
        
        sample_list.append(sections)
    
    return sample_list

In [6]:
for dataset in [train_df, val_df, test_df]:
    dataset['sections'] = sec_join(dataset)

In [7]:
# 首先需要删除labels列, 原数据集里面的labels列是空的
for dataset in [train_df, val_df, test_df]:
    dataset.drop(columns=['labels'], inplace=True)

In [8]:
# 删除有空值的行
for dataset in [train_df, val_df, test_df]:
    dataset.dropna(inplace=True)

In [9]:
train_df = train_df[train_df['article_text'].apply(lambda x: x != [""])]
val_df = val_df[val_df['article_text'].apply(lambda x: x != [""])]
test_df = test_df[test_df['article_text'].apply(lambda x: x != [""])]

In [10]:
train_df.iloc[0]['sections']

["a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states there are also some reports regarding school feeding programs in developing countries . in vietnam 

In [69]:
train = train_df.copy()
test = test_df.copy()
val = val_df.copy()

In [70]:
import re
from tqdm import tqdm

def keyword_matching_and_re_section(dataset):
    for idx, sample in tqdm(dataset.iterrows()):
        # Reset for each sample
        sections = [
            [], # part_1
            [], # part_2
            [], # part_3
            [], # part_4
        ]
        
        section_flag = [False] * len(sample['section_names'])
        
        for sec_num, (sec_name, sec) in enumerate(zip(sample['section_names'], sample['sections'])):
            for id, values in enumerate(segmentation_keyword_table.values()):
                if not section_flag[sec_num]:
                    # Use regular expression to check if section name contains any of the keywords
                    if any(keyword.lower() in sec_name.lower() for keyword in values):
                        sections[id].append(sec)
                        section_flag[sec_num] = True
        
        for id, value in enumerate(sections):
            sections[id] = " ".join(sections[id])

        dataset['sections'].loc[idx] = sections

In [71]:
for dataset in [train, val, test]:
    keyword_matching_and_re_section(dataset)

117112it [02:43, 714.38it/s]
6633it [00:01, 3734.46it/s]
6658it [00:01, 3771.17it/s]


In [72]:
train = train[train['sections'].apply(lambda x: all(s != '' for s in x))]
test = test[test['sections'].apply(lambda x: all(s != '' for s in x))]
val = val[val['sections'].apply(lambda x: all(s != '' for s in x))]

In [83]:
index = 0

In [84]:
test.iloc[index]['section_names']

['1. Introduction', '2. Methods', '3. Results', '4. Discussion']

In [85]:
test.iloc[index]['sections']

["anxiety affects quality of life in those living with parkinson 's disease ( pd ) more so than overall cognitive status , motor deficits , apathy , and depression [ 13 ] . although anxiety and depression are often related and coexist in pd patients , recent research suggests that anxiety rather than depression is the most prominent and prevalent mood disorder in pd [ 5 , 6 ] . yet , our current understanding of anxiety and its impact on cognition in pd , as well as its neural basis and best treatment practices , remains meager and lags far behind that of depression . overall , neuropsychiatric symptoms in pd have been shown to be negatively associated with cognitive performance . for example , higher depression scores have been correlated with lower scores on the mini - mental state exam ( mmse ) [ 8 , 9 ] as well as tests of memory and executive functions ( e.g. , attention ) [ 1014 ] . likewise , apathy and anhedonia in pd patients have been associated with executive dysfunction [ 1

In [79]:
train.to_json('../datasets/pubmed-dataset-incremental/train.json', orient='records', lines=True)
val.to_json('../datasets/pubmed-dataset-incremental/val.json', orient='records', lines=True)
test.to_json('../datasets/pubmed-dataset-incremental/test.json', orient='records', lines=True)

In [132]:
from datasets import load_dataset
import pandas as pd
train = pd.read_json('../datasets/pubmed-dataset-incremental/train.json', lines=True)
val = pd.read_json('../datasets/pubmed-dataset-incremental/val.json', lines=True)
test = pd.read_json('../datasets/pubmed-dataset-incremental/test.json', lines=True)

In [134]:
test.iloc[347]['abstract_text']

['<S> formaldehyde - fixed , paraffin - embedded ( ffpe ) tissue repositories \n represent a valuable resource for the retrospective study of disease \n progression and response to therapy . </S>',
 '<S> however , the proteomic analysis \n of ffpe tissues has been hampered by formaldehyde - induced protein \n modifications , which reduce protein extraction efficiency and may \n lead to protein misidentification . here , we demonstrate the use of \n heat augmented with high hydrostatic pressure ( 40,000 psi ) as a novel \n method for the recovery of intact proteins from ffpe mouse liver . \n </S>',
 '<S> when ffpe mouse liver was extracted using heat and elevated pressure , \n there was a 4-fold increase in protein extraction efficiency , a 3-fold \n increase in the extraction of intact proteins , and up to a 30-fold \n increase in the number of nonredundant proteins identified by mass \n spectrometry , compared to matched tissue extracted with heat alone . \n </S>',
 '<S> more importan

In [135]:
def remove_tag(dataset):
    for idx, sample in dataset.iterrows():
        # reset for each sample
        abstract_text = " ".join([text.replace('<S>', '').replace('</S>', '').strip() for text in sample['abstract_text']])
        dataset['abstract_text'].loc[idx] = abstract_text

In [136]:
for dataset in [train, val, test]:
    remove_tag(dataset)

In [139]:
test.iloc[224]['abstract_text']

'purposeour purpose was to systematically investigate the expression pattern and role of olig1 in neural cells during rat spinal cord development.animals and methodsspinal cord tissues were dissected from sprague  dawley rats at embryonic day 14.5 ( e14.5 ) and e18.5 , postnatal day 0 ( p0 ) , p3 , p7 , postnatal 2 weeks ( p2w ) , p4w , and adults ( more than 2 months after birth ) , respectively . the expression of olig1 was determined by western blot and immunostaining . to observe expression of olig1 in different neural cell types , a double immunohistochemical staining was performed using antibodies against olig1 with o4 , -tubulin , glial fibrillary acidic protein ( gfap ) , and myelin basic protein , respectively.resultsthe expression of olig1 protein shows a significant level change in rat spinal cord at different developmental time points . starting with e14.5 , the expression gradually increased and peaked at e18.5 . olig1 decreased gradually from p3 and reached its lowest lev

In [140]:
import nltk
import re
import wordninja
from tqdm import tqdm
nltk.download("punkt", quiet=True)

def sentences_split(dataset):
    for idx, sample in tqdm(dataset.iterrows()):
        # reset for each sample
        # 以.作为分割符，但是保留.
        abstract_text = re.split(r'(?<=\.)', sample['abstract_text'])
        # 移除列表中的空字符串
        abstract_text = [text.strip() for text in abstract_text if text]
        
        sample = []
        for sent in abstract_text:
            words = nltk.word_tokenize(sent)
            
            # 对每个单词应用wordninja，保留原始的标点符号
            split_words = [wordninja.split(word) if word.isalpha() else [word] for word in words]
            
            # 将嵌套的列表展平
            flat_split_words = [item for sublist in split_words for item in sublist]

            # 检查是否为标点符号，如果是就不用空格做连接
            sent = '' # .join(flat_split_words)
            for i, word in enumerate(flat_split_words):
                if i == 0 or i == len(flat_split_words) - 1 or re.match(r'^\W+$', word):
                    sent += word
                else:
                    sent += f' {word}'
            sample.append(sent)
            
        dataset.at[idx, 'abstract_text'] = sample

In [141]:
for dataset in [train, val, test]:
    sentences_split(dataset)

50248it [08:36, 97.30it/s] 
2794it [00:28, 96.50it/s] 
2913it [00:30, 96.75it/s] 


In [142]:
test.iloc[224]['abstract_text']

['purpose our purpose was to systematically investigate the expression pattern and role of olig1 in neural cells during rat spinal cord development.',
 'animals and methods spinal cord tissues were dissected from s prague daw ley rats at embryonic day 14.',
 '5( e14.',
 '5) and e18.',
 '5, postnatal day 0( p0), p3, p7, postnatal 2 weeks( p2w), p4w, and adults( more than 2 months after birth), respectively.',
 'the expression of olig1 was determined by western blot and immuno staining.',
 'to observe expression of olig1 in different neural cell types, a double immuno his to chemical staining was performed using antibodies against olig1 with o4, -tubulin, glial fi brill ary acidic protein( gfa p), and myelin basic protein, respectively.',
 'results the expression of olig1 protein shows a significant level change in rat spinal cord at different developmental time points.',
 'starting with e14.',
 '5, the expression gradually increased and peaked at e18.',
 '5.',
 'olig1 decreased graduall

In [143]:
segmentation_keyword_table = {
    # Introduction and Literature
    'part_1': ['introduction', 'case', 'objectives', 'purposes', 
               'objective', 'purpose', 'background', 'literature',
               'aim', 'aims'],
    
    # Methods
    'part_2': ['material and methods',
               'materials and methods', 'methods', 'techniques', 'methodology',
               'materials', 'research design', 'study design'],
    
    # Results
    'part_3': ['result', 'results', 'experiments', 'observations'],
    
    # Discussion and Conlusion
    'part_4': ['discussion', 'limitation', 'conclusions', 
               'conclusion', 'concluding', 'comment', 'comments', 
               'summary', 'concluding remarks'],
}

In [144]:
import re
from tqdm import tqdm

def keyword_matching_and_re_abstract(dataset):
    for idx, sample in tqdm(dataset.iterrows()):
        # Reset for each sample
        abstract_parts = [[] for _ in range(4)]
        current_part = 0  # Initialize to part_1

        for abs in sample['abstract_text']:
            # Splitting the abstract_text into 4 parts
            if abs != "":
                match = re.search(r'\b(\S+)\b', abs)
                if match:
                    first_word = match.group(1).lower()
                else:
                    # 处理没有匹配到单词的情况
                    # print(f'No word matched in {abs}')
                    first_word = ""
            else:
                # 处理空字符串的情况
                # print('Empty abstract_text')
                first_word = ""

            # Check if the current sentence contains the keyword for the next part
            for id, values in enumerate(segmentation_keyword_table.values()):
                if any(keyword.lower() in first_word for keyword in values):
                    # Move to the next part
                    current_part = id

            # Append the current sentence to the corresponding part
            abstract_parts[current_part].append(abs)

        # Joining the parts
        for id, value in enumerate(abstract_parts):
            abstract_parts[id] = " ".join(abstract_parts[id])

        dataset['abstract_text'].loc[idx] = abstract_parts
        # print(f'{dataset["abstract_text"].iloc[idx]=}')

In [145]:
for dataset in [train, val, test]:
    keyword_matching_and_re_abstract(dataset)

50248it [00:29, 1686.78it/s]
2794it [00:00, 2976.36it/s]
2913it [00:00, 2969.31it/s]


In [146]:
train = train[train['abstract_text'].apply(lambda x: all(s != '' for s in x))]
val = val[val['abstract_text'].apply(lambda x: all(s != '' for s in x))]
test = test[test['abstract_text'].apply(lambda x: all(s != '' for s in x))]

In [147]:
test.iloc[346]['abstract_text']

['background two hepatitis e virus( he v) outbreaks occurred in algeria( 1979- 1980 and 1987- 1988). however, to date, no study on the prevalence of anti- he v antibodies has been conducted in algeria, and the genotype of the circulating strains remains unclear. objectives this study was conducted to investigate the presence of anti- he v antibodies among outpatients and blood donors in three different hospitals in northern algeria and to determine the genotype of the circulating strains through the characterization of the immuno reactivity of anti- he v antibodies.',
 'methods a total of 590 blood samples( 379 from blood donors and 211 from outpatients) were collected in three health facilities in northern algeria and assessed for anti- he v antibodies using an in- house double- antigen sandwich immuno as say. he v open reading frame 2 recombinant proteins p166( a a 452- 617) generated from the four he v genotypes were used as antigens. the genotype of the strains circulating in alger

In [148]:
def incremental_concat(dataset):
    for idx, sample in tqdm(dataset.iterrows()):
        for id, value in enumerate(sample['abstract_text']):
            if id != 0:
                sample['abstract_text'][id] = sample['abstract_text'][id-1] + " " + sample['abstract_text'][id]

In [149]:
for dataset in [train, val, test]:
    incremental_concat(dataset)

24816it [00:01, 12928.16it/s]
1393it [00:00, 13117.18it/s]
1430it [00:00, 13072.35it/s]


In [151]:
index = 11235

In [152]:
train.iloc[index]['abstract_text']

['purpose self- regulatory processes play an important role in mediating between the disease and the health outcomes, and potentially also work outcomes. this systematic review aims to explore the relationship between illness perceptions and work participation in patients with somatic diseases and complaints.',
 'purpose self- regulatory processes play an important role in mediating between the disease and the health outcomes, and potentially also work outcomes. this systematic review aims to explore the relationship between illness perceptions and work participation in patients with somatic diseases and complaints. methods the bibliographic databases med line, psy c info and em base were searched from inception to march 2008. included were cross- sectional or longitudinal studies, patients with somatic diseases or complaints, illness perceptions based on at least four dimensions of the common sense model of self- regulation, and work participation.',
 'purpose self- regulatory process

In [153]:
train.to_json('../datasets/pubmed-dataset-incremental/train.json', orient='records', lines=True)
val.to_json('../datasets/pubmed-dataset-incremental/val.json', orient='records', lines=True)
test.to_json('../datasets/pubmed-dataset-incremental/test.json', orient='records', lines=True)

In [154]:
import pandas as pd 

train = pd.read_json('../datasets/pubmed-dataset-incremental/train.json', lines=True)
val = pd.read_json('../datasets/pubmed-dataset-incremental/val.json', lines=True)
test = pd.read_json('../datasets/pubmed-dataset-incremental/test.json', lines=True)

In [156]:
train.iloc[0]['sections']

["a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states there are also some reports regarding school feeding programs in developing countries . in vietnam 

In [157]:
import nltk
from tqdm import tqdm
nltk.download("punkt", quiet=True)

True

In [158]:
# calculate the avg token size of dataset['sections']:
def avg_token_size(dataset):
    token_size = 0
    for idx, sample in tqdm(dataset.iterrows()):
        for sec in sample['sections']:
            token_size += len(nltk.word_tokenize(sec))
    return token_size / len(dataset)

In [159]:
for dataset in [train, val, test]:
    print(avg_token_size(dataset))

24816it [09:13, 44.82it/s]


2713.747018052869


1393it [00:31, 44.44it/s]


2730.2605886575734


1430it [00:31, 45.03it/s]

2705.4384615384615





In [None]:
# calculate the avg token size of dataset['abstract_text']:
def avg_token_size_abstract(dataset):
    token_size = 0
    for idx, sample in tqdm(dataset.iterrows()):
        for abs in sample['abstract_text']:
            token_size += len(nltk.word_tokenize(abs))
    return token_size / len(dataset)

In [None]:
for dataset in [train, val, test]:
    print(avg_token_size_abstract(dataset))