In [1]:
# Fetch the PubMed Central article from the PubTator API in BioC JSON format
import urllib3
import json
import csv
import requests
import pandas as pd
#import nltk
#nltk.download('punkt')
#from nltk.tokenize import word_tokenize
from itertools import combinations

# PubMed Central id of the article to annotate.
pmcid = 'PMC2837563'

http = urllib3.PoolManager()

# Ask the PubTator export endpoint for the article in BioC JSON format.
r = http.request('GET', f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmcids={pmcid}')
# Stop here with a clear message when the service answers with an error,
# instead of failing later with a JSON decode error or a KeyError.
if r.status != 200:
    raise RuntimeError(f'PubTator request for {pmcid} failed with HTTP status {r.status}')
# Parsed response; later cells read data['passages'].
data = json.loads(r.data.decode('utf-8'))

In [2]:
# Pretty-printed (4-space indented) JSON text of the response; uncomment the
# print below to inspect the structure described in the note that follows.
output = json.dumps(data, indent=4)
#print(output)

'''
Every PMC id has passages.
Every passage has many {infons, offset, text, sentences, annotations, relations}.
Here, text is the actual text we have to annotate. 
Every annotations has {id, infons, text, locations}. 
Here infons has {identifier, type} (optional: ncbi-homologene if type is gene).
Also locations has {offset, length}.

Passages:
    a)infons - data realted article id, author, etc..
    b)offset - location index
    c)text - whole medical data (sentence) in which medical terms (gene name or disease name) are to be annotated.
    d)sentences - not required here
    e)annotations:
        1)id - key index
        2)infons:
              a)identifier
              b)type - "Gene" or "Disease" etc.,
        3)text - gene name or disease name etc., (Eg: "K-Ras")
        4)locations:
            a)offset - location index
            b)length - length of text. (Eg: len("tumours") = 7) 
    f)relations
'''




'\nEvery PMC id has passages.\nEvery passage has many {infons, offset, text, sentences, annotations, relations}.\nHere, text is the actual text we have to annotate. \nEvery annotations has {id, infons, text, locations}. \nHere infons has {identifier, type} (optional: ncbi-homologene if type is gene).\nAlso locations has {offset, length}.\n\nPassages:\n    a)infons - data realted article id, author, etc..\n    b)offset - location index\n    c)text - whole medical data (sentence) in which medical terms (gene name or disease name) are to be annotated.\n    d)sentences - not required here\n    e)annotations:\n        1)id - key index\n        2)infons:\n              a)identifier\n              b)type - "Gene" or "Disease" etc.,\n        3)text - gene name or disease name etc., (Eg: "K-Ras")\n        4)locations:\n            a)offset - location index\n            b)length - length of text. (Eg: len("tumours") = 7) \n    f)relations\n'

In [3]:
# Two empty result tables:
#   df1 -> 'index', 'sentence'
#   df2 -> 'index', 'sentence', 'entity_1', 'entity_2'
df1, df2 = (
    pd.DataFrame(columns=column_names)
    for column_names in (
        ['index', 'sentence'],
        ['index', 'sentence', 'entity_1', 'entity_2'],
    )
)

**Annotations for Gene and Mutation**

In [4]:
def _mask_entity(sentence, start, length, entity_type):
    """Return `sentence` with `length` characters from `start` replaced by '@<entity_type>$'."""
    return sentence[:start] + '@' + entity_type + '$' + sentence[start + length:]


def extract_entity_pairs(passages, entity_types=('Gene', 'Mutation')):
    """Collect every co-occurring pair of differently typed entities, sentence by sentence.

    For each passage that is not a table, the text is split on '. ' and every
    pair of annotations with different types (both drawn from `entity_types`)
    that falls inside the same sentence produces one row. The same sentence is
    therefore repeated once per pair found in it.

    Parameters
    ----------
    passages : list of dict
        Passages of the BioC JSON document. Each one is read through the keys
        'infons', 'text', 'offset' and 'annotations'.
    entity_types : tuple of str
        Annotation types to pair up. The entity whose type equals
        `entity_types[0]` is written to the 'entity_1' column.

    Returns
    -------
    masked_rows : list of [index, sentence]
        Sentence with both entities replaced by '@<type>$' placeholders.
    original_rows : list of [index, sentence, entity_1, entity_2]
        Unmodified sentence plus the two entity texts.
    pair_texts : dict
        Maps the row index to (first entity text, second entity text) in the
        order the entities appear in the document.
    """
    masked_rows = []
    original_rows = []
    pair_texts = {}
    pair_index = 0

    for passage in passages:
        # Filter out the table segments. `.get` keeps passages that carry no
        # section_type at all instead of raising a KeyError on them.
        if passage['infons'].get('section_type') == 'TABLE':
            continue

        # Keep only the requested entity types, ordered by position in the document.
        annotations = sorted(
            (a for a in passage['annotations'] if a['infons']['type'] in entity_types),
            key=lambda a: a['locations'][0]['offset'],
        )
        # All pairs of differently typed entities. `combinations` preserves the
        # input order, so the first element of each pair never starts after the second.
        pairs = [
            (first, second)
            for first, second in combinations(annotations, 2)
            if first['infons']['type'] != second['infons']['type']
        ]
        if not pairs:
            continue

        # Annotation offsets are relative to the whole document, so track where
        # each sentence starts on the same scale.
        sentence_start = passage['offset']
        for sentence in passage['text'].split('. '):
            sentence_len = len(sentence)

            for first, second in pairs:
                first_pos = first['locations'][0]['offset'] - sentence_start
                second_pos = second['locations'][0]['offset'] - sentence_start

                # The "+ 1" lets an entity end one character past the sentence
                # text, i.e. on the '.' that the split removed.
                first_fits = 0 <= first_pos <= sentence_len - len(first['text']) + 1
                second_fits = 0 <= second_pos <= sentence_len - len(second['text']) + 1
                if not (first_fits and second_fits):
                    continue

                first_type = first['infons']['type']
                second_type = second['infons']['type']
                first_length = first['locations'][0]['length']
                second_length = second['locations'][0]['length']

                # Mask left to right. Replacing the first entity changes the
                # sentence length, so the second position is shifted by that amount.
                # NOTE(review): assumes the two entities do not overlap - confirm.
                masked = _mask_entity(sentence, first_pos, first_length, first_type)
                shift = first_length - len('@' + first_type + '$')
                masked = _mask_entity(masked, second_pos - shift, second_length, second_type)

                pair_texts[pair_index] = (first['text'], second['text'])
                masked_rows.append([pair_index, masked])
                if first_type == entity_types[0]:
                    original_rows.append([pair_index, sentence, first['text'], second['text']])
                else:
                    original_rows.append([pair_index, sentence, second['text'], first['text']])
                pair_index += 1

            # Skip the sentence plus the two-character '. ' separator.
            sentence_start += sentence_len + 2

    return masked_rows, original_rows, pair_texts


masked_rows, original_rows, sentence_entities = extract_entity_pairs(data['passages'])

# Build both tables in one go. Re-running this cell rebuilds them from scratch
# instead of appending duplicate rows (and duplicate 'index' values).
df1 = pd.DataFrame(masked_rows, columns=['index', 'sentence'])
df2 = pd.DataFrame(original_rows, columns=['index', 'sentence', 'entity_1', 'entity_2'])

# Number of generated rows; kept under the name this cell has always defined.
sentence_index = len(masked_rows)

In [5]:
import os
# Switch the working directory to the relation-extraction folder; the cells
# below read and write files using paths relative to it.
# NOTE(review): the path is relative to the current working directory, so this
# only works from the expected start folder and not when the cell is re-run.
os.chdir('../biobert-pytorch/relation-extraction')

In [6]:
# DataFrame.to_csv does not create missing folders, so make sure 'input' exists.
os.makedirs('input', exist_ok=True)

# Unmodified sentences together with the two entity texts of each pair.
df2.to_csv('pub_original_sentences.tsv', sep="\t", index=False)
# Masked sentences, written into the folder passed to run_re.py as --data_dir.
df1.to_csv('input/test.tsv', sep="\t", index=False)

In [28]:
# Read the saved tab-separated table back and show its column labels.
df = pd.read_table('pub_original_sentences.tsv')
df.columns

Index(['index', 'sentence', 'entity_1', 'entity_2'], dtype='object')

**BioBERT Setup and Prediction**

In [32]:
import os
# Show the current working directory before changing it.
print(os.getcwd())
# Move one level up. The move is relative, so every re-run of this cell goes
# up one more level.
os.chdir('..')

/home/skirupa/Desktop/sem8/RSL-Lab/RSLLab-20230318T070434Z-001/RSLLab/biobert-pytorch/relation-extraction


In [4]:
# Download all datasets including NER/RE/QA.
# The script is looked up in the current working directory.
!bash ./download.sh

BIOBERT_DATA not set; downloading to default path ('data').
--2023-03-18 13:34:45--  http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz
Resolving nlp.dmis.korea.edu (nlp.dmis.korea.edu)... 163.152.163.168
Connecting to nlp.dmis.korea.edu (nlp.dmis.korea.edu)|163.152.163.168|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29610233 (28M) [application/x-gzip]
Saving to: ‘./data.tar.gz’


2023-03-18 13:35:56 (413 KB/s) - ‘./data.tar.gz’ saved [29610233/29610233]

datasets/
datasets/RE/
datasets/RE/GAD/
datasets/RE/GAD/6/
datasets/RE/GAD/6/test.tsv
datasets/RE/GAD/6/dev.tsv
datasets/RE/GAD/6/train.tsv
datasets/RE/GAD/7/
datasets/RE/GAD/7/test.tsv
datasets/RE/GAD/7/dev.tsv
datasets/RE/GAD/7/train.tsv
datasets/RE/GAD/5/
datasets/RE/GAD/5/test.tsv
datasets/RE/GAD/5/dev.tsv
datasets/RE/GAD/5/train.tsv
datasets/RE/GAD/8/
datasets/RE/GAD/8/test.tsv
datasets/RE/GAD/8/dev.tsv
datasets/RE/GAD/8/train.tsv
datasets/RE/GAD/4/
datasets/RE/GAD/4/test.tsv


In [14]:
# Move into the sibling relation-extraction folder (relative to the current
# working directory).
os.chdir('../relation-extraction')

# Preprocess the downloaded datasets; the script is looked up in that folder.
!bash ./preprocess.sh

*****  euadr  Preprocessing Start *****
*****  euadr  Preprocessing Done *****
*****  GAD  Preprocessing Start *****
*****  GAD  Preprocessing Done *****


In [6]:
!pip install scikit-learn
!pip install pandas

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp36-cp36m-manylinux2010_x86_64.whl (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 85 kB/s  eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=0.19.1
  Downloading scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 51 kB/s eta 0:00:013     |████████████████                | 12.9 MB 751 kB/s eta 0:00:18
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-0.24.2 scipy-1.5.4 threadpoolctl-3.1.0
Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 7.4 kB/s eta 0:00:01     |██████████████▋                 | 4.3 MB 3.5 MB/s eta 0:00:02
Collecting pytz>=2017.2
  Downloading pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
[K     |████████████████████████████████| 499 kB 1

In [10]:
#!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-1.10.2-cp36-cp36m-manylinux1_x86_64.whl (881.9 MB)
[K     |████████████████████████████████| 881.9 MB 3.0 kB/s eta 0:00:011   |▌                               | 13.9 MB 1.5 MB/s eta 0:09:56     |█▎                              | 35.8 MB 3.5 MB/s eta 0:04:03     |███                             | 80.6 MB 897 kB/s eta 0:14:53     |██████████████▎                 | 393.6 MB 1.8 MB/s eta 0:04:25     |███████████████▌                | 425.9 MB 1.9 MB/s eta 0:04:03     |████████████████▌               | 454.2 MB 475 kB/s eta 0:15:00     |██████████████████▌             | 509.5 MB 2.0 MB/s eta 0:03:11     |███████████████████▉            | 548.0 MB 1.7 MB/s eta 0:03:18     |████████████████████▏           | 557.0 MB 2.5 MB/s eta 0:02:11     |█████████████████████▊          | 597.4 MB 2.0 MB/s eta 0:02:26     |███████████████████████████▉    | 766.6 MB 1.8 MB/s eta 0:01:05     |███████████████████████████████ | 853.7 MB 1.0 MB/s eta 0:00:28
[?25hCollectin

In [15]:
%env SAVE_DIR=./output
%env DATA="GAD"
%env SPLIT="1"
%env DATA_DIR=./input
%env ENTITY=${DATA}-${SPLIT}

%env MAX_LENGTH=128
%env BATCH_SIZE=32
%env NUM_EPOCHS=3
%env SAVE_STEPS=1000
%env SEED=1

env: SAVE_DIR=./output
env: DATA="GAD"
env: SPLIT="1"
env: DATA_DIR=./input
env: ENTITY=${DATA}-${SPLIT}
env: MAX_LENGTH=128
env: BATCH_SIZE=32
env: NUM_EPOCHS=3
env: SAVE_STEPS=1000
env: SEED=1


In [16]:
# Run the relation-extraction script in prediction-only mode (--do_predict, no
# --do_train), reading from DATA_DIR and writing to SAVE_DIR.
# NOTE(review): only DATA_DIR and SAVE_DIR are passed to the script; the
# MAX_LENGTH, BATCH_SIZE, NUM_EPOCHS, SAVE_STEPS and SEED variables set with
# %env are not forwarded.
# NOTE(review): the model is the base dmis-lab/biobert-base-cased-v1.1
# checkpoint and nothing is trained here - confirm that a fine-tuned
# classification head is actually loaded before relying on the predictions.
!python run_re.py --task_name SST-2 --config_name bert-base-cased --model_name_or_path dmis-lab/biobert-base-cased-v1.1 \
        --do_predict --data_dir ${DATA_DIR} \
        --output_dir ${SAVE_DIR} \
        --overwrite_output_dir


03/18/2023 15:04:59 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./output', overwrite_output_dir=True, do_train=False, do_eval=False, do_predict=True, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Mar18_15-04-52_skirupa', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name='.

In [30]:
# Load the two tab-separated files combined in the next cell: the model
# predictions and the original sentences with their entity pairs.
predictions = pd.read_table('output/test_results.txt')
original_sentences = pd.read_table('pub_original_sentences.tsv')

#df = pd.read_csv('pub_original_sentences.tsv', sep="\t")
#df.columns

In [31]:
# Attach the model predictions to the original sentences through the shared
# 'index' column. The left join keeps every original sentence row, including
# rows that have no matching prediction.
final_re_output = original_sentences.merge(predictions, how='left', on='index')
final_re_output

Unnamed: 0,index,sentence,entity_1,entity_2,prediction
0,0,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Leu19Phe,1
1,1,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Lys117Asn,1
2,2,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Ala146Thr,1
3,3,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Arg164Gln,1
4,4,Lys117Asn and Ala146Thr had phenotypes similar...,K-Ras,Lys117Asn,1
...,...,...,...,...,...
115,115,"(B) The transforming potential of L19F, K117N,...",K-Ras,A146T,1
116,116,"(B) The transforming potential of L19F, K117N,...",K-Ras,R164Q,1
117,117,"(B) The transforming potential of L19F, K117N,...",K-Ras,G12V,1
118,118,The K-Ras G12V construct was included as a pos...,K-Ras,G12V,1
