In [1]:
# !pip install spacy-transformers

In [1]:
import ast
import csv
import json

import pandas as pd
import spacy

In [2]:
# Load the "en_core_web_trf" spaCy pipeline once; tokenize() and pos() below
# both run their input through this shared `nlp` object.
nlp = spacy.load("en_core_web_trf")

# Function for tokenising text, keeping whitespace as separate tokens

In [3]:
#tokenize text

def tokenize(raw):
    """Split ``raw`` into token strings, keeping whitespace as its own items.

    The text is run through the notebook-level ``nlp`` pipeline. For every
    token its text is emitted, followed by the token's ``whitespace_`` string
    whenever that string is non-empty.

    Args:
        raw: Text to tokenise.

    Returns:
        list of str: Token texts interleaved with their following whitespace.
    """
    pieces = []
    for tok in nlp(raw):
        # Empty whitespace strings are dropped so they never become tokens.
        pieces.extend([tok.text, tok.whitespace_] if tok.whitespace_ else [tok.text])
    return pieces

# Function for obtaining the POS tag of each token

In [4]:
def pos(raw):
    """Build a lookup from token text to its spaCy ``pos_`` tag.

    Args:
        raw: Text to analyse with the notebook-level ``nlp`` pipeline.

    Returns:
        dict: ``str(token)`` -> ``str(token.pos_)``. Keys are token text, so a
        token string that occurs several times keeps the tag of its last
        occurrence.
    """
    return {str(tok): str(tok.pos_) for tok in nlp(raw)}

# Function for obtaining the character offsets of each token

In [5]:
def get_tok_idx(tokens):
    """Map the character span of each token to the token's text.

    The tokens are treated as consecutive pieces of one string, so a token's
    start offset is the combined length of all tokens before it.

    Args:
        tokens: Iterable of token strings in text order (whitespace tokens
            included, so that offsets line up with the original text).

    Returns:
        dict: Keys are the string form of ``[start, end]`` (end exclusive),
        e.g. ``'[0, 3]'``; values are the token text. Entries follow the
        order of ``tokens``.
    """
    token_dict = {}
    # Running start offset: avoids re-summing every earlier token length for
    # each token, which made the original quadratic in the number of tokens.
    offset = 0
    for token in tokens:
        end = offset + len(token)
        token_dict[str([offset, end])] = token
        offset = end
    return token_dict

# Illustrative example

In [6]:
def json2dataframe(filename):
    """Load a UTF-8 JSON file and wrap its contents in a DataFrame.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        pandas.DataFrame built directly from the parsed JSON object.
    """
    with open(filename, encoding='utf-8') as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

In [29]:
# Read the raw PLOD TSV file.
f_name = '../PLODv1/unfiltered_data/PLOS-test15-unfiltered'
# csv.QUOTE_NONE (numeric value 3, which was hard-coded here before) tells
# pandas not to treat quote characters as field delimiters.
data = pd.read_csv(f_name, sep="\t", quoting=csv.QUOTE_NONE, encoding='utf8')
data.shape

(24399, 11)

In [30]:
#uncomment the line below to read and convert SDU@AAAI-22-SharedTask to a dataframe
#data = pd.concat([json2dataframe("../SDU/train.json"), json2dataframe("../SDU/dev.json")], ignore_index=True)

In [31]:
data.head()

Unnamed: 0,IDs,text,ac_indexes,lf_indexes,ac_text,lf_text,source_file,source_journal,auto_valid,ac_dic,lf_dic
0,1183268,The diffusing capacity of the lungs for carbon...,"[[57, 61], [102, 105], [135, 138], [94, 96], [...","[[4, 55]]","['DLCO', 'IPF', 'IPF', 'LC', 'LC']",['diffusing capacity of the lungs for carbon m...,PMC7323957.nxml,PLoS_One,0,"['DLCO', 'IPF', 'IPF', 'LC', 'LC']",['diffusing capacity of the lungs for carbon m...
1,510643,The unadjusted prevalence of blindness in the ...,"[[113, 115], [128, 132]]","[[92, 111]]","['CI', 'DEFF']",['confidence interval'],PMC1904464.nxml,PLoS_Med,0,"['CI', 'DEFF']",['confidence interval']
2,955227,Effects of 25 μM of resveratrol (RSV) on the e...,"[[33, 36], [115, 117], [155, 157], [198, 200]]","[[84, 101], [120, 137], [163, 180], [84, 113],...","['RSV', '3S', '3G', '4G']","['trans-resveratrol', 'trans-resveratrol', 'tr...",PMC5617156.nxml,PLoS_One,0,"['RSV', '3S', '3G', '4G']","['Trans-resveratrol', 'Trans-resveratrol', 'Tr..."
3,1137321,"For evaluating depressive symptoms, the self-c...","[[82, 85], [191, 195], [324, 330]]","[[55, 80], [256, 322]]","['BDI', 'MINI', 'DSM-IV']","['Beck Depression Inventory', 'Diagnostic and ...",PMC6936775.nxml,PLoS_One,0,"['BDI', 'MINI', 'DSM-IV']","['Beck Depression Inventory', 'Diagnostic and ..."
4,249100,Filamentous actin (F-actin) was stained with p...,"[[100, 103], [19, 26]]","[[0, 17]]","['CFm', 'F-actin']",['Filamentous actin'],PMC5562439.nxml,PLoS_Biol,0,"['CFm', 'F-actin']",['filamentous actin']


In [32]:
# Tokenise every text; each cell becomes a list of token and whitespace strings.
data['tokenized'] =data['text'].apply(tokenize)

In [33]:
# Build a token-text -> POS-tag dict for every text.
# NOTE(review): this runs the spaCy pipeline over each text a second time
# (tokenize() already did so once) — consider sharing one pass if runtime matters.
data['pos_dict'] = data['text'].apply(pos)

In [34]:
# Turn each token list into a dict keyed by the token's '[start, end]' character span.
data['tokens_dict'] = data['tokenized'].apply(get_tok_idx)

In [35]:
# Rename the span columns to "longforms" / "acronyms" so that pos_bio() can be
# applied to the rows directly.
# For SDU@AAAI-22-SharedTask the long-form column is called "long-forms":
# map {"long-forms": "longforms"} instead of "lf_indexes".
data = data.rename(columns={"lf_indexes": "longforms", "ac_indexes": "acronyms"})
data.head()

Unnamed: 0,IDs,text,acronyms,longforms,ac_text,lf_text,source_file,source_journal,auto_valid,ac_dic,lf_dic,tokenized,pos_dict,tokens_dict
0,1183268,The diffusing capacity of the lungs for carbon...,"[[57, 61], [102, 105], [135, 138], [94, 96], [...","[[4, 55]]","['DLCO', 'IPF', 'IPF', 'LC', 'LC']",['diffusing capacity of the lungs for carbon m...,PMC7323957.nxml,PLoS_One,0,"['DLCO', 'IPF', 'IPF', 'LC', 'LC']",['diffusing capacity of the lungs for carbon m...,"[The, , diffusing, , capacity, , of, , the...","{'The': 'DET', 'diffusing': 'VERB', 'capacity'...","{'[0, 3]': 'The', '[3, 4]': ' ', '[4, 13]': 'd..."
1,510643,The unadjusted prevalence of blindness in the ...,"[[113, 115], [128, 132]]","[[92, 111]]","['CI', 'DEFF']",['confidence interval'],PMC1904464.nxml,PLoS_Med,0,"['CI', 'DEFF']",['confidence interval'],"[The, , unadjusted, , prevalence, , of, , ...","{'The': 'DET', 'unadjusted': 'ADJ', 'prevalenc...","{'[0, 3]': 'The', '[3, 4]': ' ', '[4, 14]': 'u..."
2,955227,Effects of 25 μM of resveratrol (RSV) on the e...,"[[33, 36], [115, 117], [155, 157], [198, 200]]","[[84, 101], [120, 137], [163, 180], [84, 113],...","['RSV', '3S', '3G', '4G']","['trans-resveratrol', 'trans-resveratrol', 'tr...",PMC5617156.nxml,PLoS_One,0,"['RSV', '3S', '3G', '4G']","['Trans-resveratrol', 'Trans-resveratrol', 'Tr...","[Effects, , of, , 25, , μM, , of, , resve...","{'Effects': 'NOUN', 'of': 'ADP', '25': 'NUM', ...","{'[0, 7]': 'Effects', '[7, 8]': ' ', '[8, 10]'..."
3,1137321,"For evaluating depressive symptoms, the self-c...","[[82, 85], [191, 195], [324, 330]]","[[55, 80], [256, 322]]","['BDI', 'MINI', 'DSM-IV']","['Beck Depression Inventory', 'Diagnostic and ...",PMC6936775.nxml,PLoS_One,0,"['BDI', 'MINI', 'DSM-IV']","['Beck Depression Inventory', 'Diagnostic and ...","[For, , evaluating, , depressive, , symptom...","{'For': 'ADP', 'evaluating': 'VERB', 'depressi...","{'[0, 3]': 'For', '[3, 4]': ' ', '[4, 14]': 'e..."
4,249100,Filamentous actin (F-actin) was stained with p...,"[[100, 103], [19, 26]]","[[0, 17]]","['CFm', 'F-actin']",['Filamentous actin'],PMC5562439.nxml,PLoS_Biol,0,"['CFm', 'F-actin']",['filamentous actin'],"[Filamentous, , actin, , (, F, -, actin, ), ...","{'Filamentous': 'ADJ', 'actin': 'NOUN', '(': '...","{'[0, 11]': 'Filamentous', '[11, 12]': ' ', '[..."


In [36]:
# Uncomment the two lines below to convert the span lists to strings, so that the column types are the same across the different input files
#data["longforms"] = data["longforms"].apply(str)
#data["acronyms"] = data["acronyms"].apply(str)

# The function for annotating with BIO and POS

In [15]:
def _parse_spans(spans):
    """Return ``spans`` as a list, parsing it first when it is a string literal."""
    if isinstance(spans, str):
        return ast.literal_eval(spans)
    return list(spans)


def pos_bio(tokens_dict, longforms, acronyms, pos_dict):
    """Label every non-whitespace token with its POS tag and a BIO tag.

    Args:
        tokens_dict: Mapping from the string form of a token's character span
            (``'[start, end]'``) to the token text, in text order.
        longforms: Long-form character spans, either as a list of
            ``[start, end]`` pairs or as the string literal of such a list.
        acronyms: Acronym character spans, in the same two accepted forms.
        pos_dict: Mapping from token text to POS tag. Tokens missing from it
            are tagged ``'PUNCT'``. The mapping is not modified.

    Returns:
        tuple: ``(sent_labeled, unannotated)``.
        ``sent_labeled`` is a list of ``(token, pos, label)`` tuples where
        label is one of ``'B-LF'``, ``'I-LF'``, ``'B-AC'`` or ``'B-O'``.
        ``unannotated`` is empty when the number of counted long-form and
        acronym starts equals the number of given spans; otherwise it holds
        ``"<given>-<counted>"`` strings under ``"unmatched long forms"`` and/or
        ``"unmatched acronyms"``.
    """
    # Parse (and sort) the span lists once instead of once per token.
    longform_spans = _parse_spans(longforms)
    acronym_spans = _parse_spans(acronyms)
    sorted_longforms = sorted(longform_spans)

    sent_labeled = []
    unannotated = {}
    lf_counter = 0
    abbr_counter = 0

    for key, text in tokens_dict.items():
        if text.isspace():
            # Whitespace tokens only exist to keep the offsets aligned.
            continue

        tok_start, tok_end = ast.literal_eval(key)
        # Same fallback for every label: a token without a POS entry is
        # tagged 'PUNCT' instead of raising KeyError on the B-LF/B-AC paths.
        tag = pos_dict.get(text, 'PUNCT')
        label = None

        # Long forms take precedence over acronyms.
        for span in sorted_longforms:
            lf_start, lf_end = span[0], span[1]
            if lf_start == tok_start:
                # Token starts exactly where the long form starts.
                label = 'B-LF'
                lf_counter += 1
            elif lf_start < tok_start < lf_end:
                # Token starts strictly inside the long form.
                label = 'I-LF'
            elif lf_start < tok_end < lf_end:
                # Token starts before the long form but ends inside it.
                label = 'B-LF'
                lf_counter += 1
            if label is not None:
                break

        if label is None:
            for span in acronym_spans:
                ac_start, ac_end = span[0], span[1]
                acronym_in_token = ac_start >= tok_start and ac_end <= tok_end
                token_opens_acronym = tok_start == ac_start and tok_end < ac_end
                if acronym_in_token or token_opens_acronym:
                    label = 'B-AC'
                    abbr_counter += 1
                elif tok_start > ac_start and tok_end <= ac_end:
                    # Later piece of an acronym split over several tokens:
                    # tagged B-AC as well, but not counted as a new acronym.
                    label = 'B-AC'
                if label is not None:
                    break

        # Anything that is neither a long form nor an acronym is outside.
        sent_labeled.append((text, tag, label if label is not None else 'B-O'))

    if len(longform_spans) != lf_counter:
        unannotated["unmatched long forms"] = str(len(longform_spans)) + "-" + str(lf_counter)
    if len(acronym_spans) != abbr_counter:
        unannotated["unmatched acronyms"] = str(len(acronym_spans)) + "-" + str(abbr_counter)

    return sent_labeled, unannotated

In [16]:
data.shape

(113860, 14)

# Apply the function to the dataframe directly


In [37]:
# Annotate every row; each result is the (labelled tokens, unmatched counts)
# pair returned by pos_bio().
plos_data_pos_bio = data.apply(
    lambda row: pos_bio(row.tokens_dict, row.longforms, row.acronyms, row.pos_dict),
    axis=1,
)

# Save the POS/BIO annotations to a CoNLL-style text file: one space-separated `token POS BIO` triple per line, with a blank line after each sentence

In [18]:
plos_data_pos_bio[3]

([('We', 'PRON', 'B-O'),
  ('developed', 'VERB', 'B-O'),
  ('a', 'DET', 'B-O'),
  ('variant', 'NOUN', 'B-O'),
  ('of', 'ADP', 'B-O'),
  ('gene', 'NOUN', 'B-LF'),
  ('set', 'NOUN', 'I-LF'),
  ('enrichment', 'NOUN', 'I-LF'),
  ('analysis', 'NOUN', 'I-LF'),
  ('(', 'PUNCT', 'B-O'),
  ('GSEA', 'PROPN', 'B-AC'),
  (')', 'PUNCT', 'B-O'),
  ('to', 'PART', 'B-O'),
  ('determine', 'VERB', 'B-O'),
  ('whether', 'SCONJ', 'B-O'),
  ('a', 'DET', 'B-O'),
  ('genetic', 'ADJ', 'B-O'),
  ('pathway', 'NOUN', 'B-O'),
  ('shows', 'VERB', 'B-O'),
  ('evidence', 'NOUN', 'B-O'),
  ('for', 'ADP', 'B-O'),
  ('age', 'NOUN', 'B-O'),
  ('regulation', 'NOUN', 'B-O'),
  ('[', 'PUNCT', 'B-O'),
  ('23', 'NUM', 'B-O'),
  (']', 'PUNCT', 'B-O'),
  ('.', 'PUNCT', 'B-O')],
 {})

In [38]:
# Write the annotations next to the input file: one "token POS label" line per
# token and an empty line after every sentence.
output_pos_bio = f_name + "bio.conll"
with open(output_pos_bio, 'w', encoding="utf-8") as outfile:
    for sentence in plos_data_pos_bio:
        # sentence[0] is the list of (token, pos, label) tuples; sentence[1]
        # (the unmatched-span report) is not written out.
        outfile.writelines(
            " ".join(str(field) for field in row) + '\n' for row in sentence[0]
        )
        outfile.write('\n')