In [1]:
# Fetch the PubMed Central (PMC) article, with its PubTator annotations, in BioC JSON format
import urllib3
import json
import csv
import requests
import pandas as pd
from itertools import combinations

# PMC identifier of the article whose PubTator annotations are downloaded.
pmcid = 'PMC2837563'

http = urllib3.PoolManager()

# Export the article in BioC JSON format. A timeout keeps the cell from hanging
# forever on a stalled connection.
r = http.request(
    'GET',
    f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmcids={pmcid}',
    timeout=30.0,
)

# Fail with a clear message instead of an opaque JSON decoding error when the
# service rejects the request or returns nothing.
if r.status != 200:
    raise RuntimeError(f'PubTator request for {pmcid} failed with HTTP status {r.status}')
if not r.data:
    raise RuntimeError(f'PubTator returned an empty response for {pmcid}')

# Parsed BioC JSON document; later cells read data['passages'].
data = json.loads(r.data.decode('utf-8'))

In [2]:
# Pretty-printed copy of the parsed response, useful for inspecting its structure.
output = json.dumps(data, indent=4)
#print(output)

# Structure of the BioC JSON document. These notes are kept as comments: a bare
# string literal at the end of a cell is echoed back as the cell's output.
#
# Every PMC id has passages.
# Every passage has {infons, offset, text, sentences, annotations, relations}.
# Here, text is the actual text we have to annotate.
# Every annotation has {id, infons, text, locations}.
# Here infons has {identifier, type} (optional: ncbi-homologene if type is gene).
# Also, each location has {offset, length}.
#
# Passages:
#     a) infons - data related to the article id, author, etc.
#     b) offset - location index
#     c) text - whole medical text (sentences) in which medical terms
#        (gene name or disease name) are to be annotated.
#     d) sentences - not required here
#     e) annotations:
#         1) id - key index
#         2) infons:
#               a) identifier
#               b) type - "Gene" or "Disease" etc.
#         3) text - gene name or disease name etc. (e.g. "K-Ras")
#         4) locations:
#               a) offset - location index
#               b) length - length of text (e.g. len("tumours") = 7)
#     f) relations

In [3]:
# Empty result tables, filled by the annotation cell below.
# df1: sentences with the entity pair replaced by placeholders.
# df2: the untouched sentences together with the gene and mutation text.
_masked_columns = ['index', 'sentence']
_original_columns = _masked_columns + ['Gene', 'Mutation']

df1 = pd.DataFrame(columns=_masked_columns)
df2 = pd.DataFrame(columns=_original_columns)

**Annotations for Gene and Mutation**

In [4]:
# Build one row per (sentence, Gene/Mutation pair) that co-occurs in a sentence.
# df1 receives the sentence with both mentions replaced by "@<Type>$"
# placeholders; df2 receives the untouched sentence plus the gene and mutation
# text. Both frames are rebuilt from scratch so that re-running this cell does
# not append duplicate rows with repeated indices.

_ENTITY_TYPES = ('Gene', 'Mutation')
_SENTENCE_SEPARATOR = '. '


def _first_offset(annotation):
    """Return the offset stored in the annotation's first location."""
    return annotation['locations'][0]['offset']


def _mask_entity_pair(sentence, sentence_start, first, second):
    """Replace two mentions inside ``sentence`` with ``@<Type>$`` placeholders.

    Parameters:
        sentence: sentence text, without the trailing separator.
        sentence_start: offset of the sentence's first character, in the same
            coordinate system as the annotation offsets.
        first, second: annotation dicts, ``first`` starting no later than
            ``second``.

    Returns:
        The masked sentence, or ``None`` when either mention does not start
        inside the sentence or when the two mentions overlap.
    """
    sentence_len = len(sentence)
    first_start = _first_offset(first) - sentence_start
    second_start = _first_offset(second) - sentence_start

    # Same containment test as the original code: the mention must start inside
    # the sentence, with one character of slack at the end.
    for entity, start in ((first, first_start), (second, second_start)):
        if not 0 <= start <= (sentence_len - len(entity['text'])) + 1:
            return None

    first_length = first['locations'][0]['length']
    second_length = second['locations'][0]['length']

    # Overlapping mentions cannot both be replaced without corrupting the
    # first placeholder, so such pairs are skipped.
    if second_start < first_start + first_length:
        return None

    first_tag = '@' + first['infons']['type'] + '$'
    second_tag = '@' + second['infons']['type'] + '$'

    # Replace right-to-left so the first mention's position is not shifted by
    # the length change of the second replacement.
    masked = sentence[:second_start] + second_tag + sentence[second_start + second_length:]
    masked = masked[:first_start] + first_tag + masked[first_start + first_length:]
    return masked


sentence_index = 0
sentence_entities = {}
masked_rows = []
original_rows = []

for passage in data['passages']:
    # Skip table passages; passages without a section_type are still processed.
    if passage['infons'].get('section_type') == 'TABLE':
        continue

    # Keep only Gene/Mutation annotations that carry a location, ordered by
    # their position in the document.
    mentions = sorted(
        (
            annotation
            for annotation in passage['annotations']
            if annotation['infons'].get('type') in _ENTITY_TYPES and annotation.get('locations')
        ),
        key=_first_offset,
    )

    # Every Gene-Mutation pair. combinations() preserves the sorted order, so
    # the first element of each pair never starts after the second one.
    pairs = [
        (first, second)
        for first, second in combinations(mentions, 2)
        if first['infons']['type'] != second['infons']['type']
    ]

    # Sentences are approximated by splitting on ". "; the running offset adds
    # the separator length back after each sentence.
    sentence_start = passage['offset']
    for sentence in passage['text'].split(_SENTENCE_SEPARATOR):
        # A sentence is emitted once per pair found in it.
        for first, second in pairs:
            masked = _mask_entity_pair(sentence, sentence_start, first, second)
            if masked is None:
                continue

            sentence_entities[sentence_index] = (first['text'], second['text'])
            masked_rows.append([sentence_index, masked])

            # df2 always lists the gene before the mutation, whatever their
            # order in the sentence.
            if first['infons']['type'] == 'Gene':
                gene, mutation = first, second
            else:
                gene, mutation = second, first
            original_rows.append([sentence_index, sentence, gene['text'], mutation['text']])

            sentence_index += 1

        sentence_start += len(sentence) + len(_SENTENCE_SEPARATOR)

df1 = pd.DataFrame(masked_rows, columns=['index', 'sentence'])
df2 = pd.DataFrame(original_rows, columns=['index', 'sentence', 'Gene', 'Mutation'])

In [5]:
import os

# Remember the directory the notebook started in the first time this cell runs.
# Re-running the cell then resolves the target from that same directory instead
# of from wherever the working directory has been moved to in the meantime.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

print(os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, '..', 'biobert-pytorch', 'relation-extraction'))

/home/skirupa/Desktop/sem8/RSL-Lab/RSLLab-20230318T070434Z-001/RSLLab/project-sample


In [6]:
import os

# Untouched sentences with their gene and mutation text.
df2.to_csv('pub_original_sentences_GM.tsv', sep="\t", index=False)

# Masked sentences for the model. to_csv does not create missing directories,
# so make sure the input folder exists first.
os.makedirs('inputGM', exist_ok=True)
df1.to_csv('inputGM/test.tsv', sep="\t", index=False)

In [7]:
# Number of rows collected in df2.
len(df2.index)

13

**BioBERT Relation Extraction (Prediction)**

In [8]:
import os

# Show the current working directory, then step up to its parent.
current_dir = os.getcwd()
print(current_dir)
os.chdir(os.pardir)

/home/skirupa/Desktop/sem8/RSL-Lab/RSLLab-20230318T070434Z-001/RSLLab/biobert-pytorch/relation-extraction


In [9]:
# Download all datasets including NER/RE/QA
#!bash ./download.sh

In [10]:
print(os.getcwd())
os.chdir('relation-extraction')
print(os.getcwd())
#To preprocess the datasets downloaded
!bash ./preprocess.sh

/home/skirupa/Desktop/sem8/RSL-Lab/RSLLab-20230318T070434Z-001/RSLLab/biobert-pytorch
/home/skirupa/Desktop/sem8/RSL-Lab/RSLLab-20230318T070434Z-001/RSLLab/biobert-pytorch/relation-extraction
*****  euadr  Preprocessing Start *****
*****  euadr  Preprocessing Done *****
*****  GAD  Preprocessing Start *****
*****  GAD  Preprocessing Done *****


In [11]:
#!pip install scikit-learn
#!pip install pandas



In [12]:
%env SAVE_DIR=./outputGM
%env DATA="GAD"
%env SPLIT="1"
%env DATA_DIR=./inputGM
%env ENTITY=${DATA}-${SPLIT}

%env MAX_LENGTH=128
%env BATCH_SIZE=32
%env NUM_EPOCHS=3
%env SAVE_STEPS=1000
%env SEED=1

env: SAVE_DIR=./outputDM
env: DATA="GAD"
env: SPLIT="1"
env: DATA_DIR=./inputDM
env: ENTITY=${DATA}-${SPLIT}
env: MAX_LENGTH=128
env: BATCH_SIZE=32
env: NUM_EPOCHS=3
env: SAVE_STEPS=1000
env: SEED=1


In [None]:
# Run run_re.py in prediction-only mode (--do_predict) with the
# dmis-lab/biobert-base-cased-v1.1 checkpoint. The shell expands ${DATA_DIR}
# and ${SAVE_DIR} from the environment variables set with %env above.
# NOTE(review): MAX_LENGTH, BATCH_SIZE, NUM_EPOCHS, SAVE_STEPS and SEED are set
# above but are not passed on this command line - confirm whether run_re.py
# reads them some other way.
!python run_re.py --task_name SST-2 --config_name bert-base-cased --model_name_or_path dmis-lab/biobert-base-cased-v1.1 \
        --do_predict --data_dir ${DATA_DIR} \
        --output_dir ${SAVE_DIR} \
        --overwrite_output_dir


In [None]:
# Load the saved sentences and the prediction results; both files are
# tab-separated.
_tab = "\t"
original_sentences = pd.read_csv('pub_original_sentences_GM.tsv', sep=_tab)
predictions = pd.read_csv('outputGM/test_results.txt', sep=_tab)


In [None]:
# Attach the predictions to the original sentences. A left join on 'index'
# keeps every original sentence row.
merge_options = {'on': 'index', 'how': 'left'}
final_re_output = pd.merge(original_sentences, predictions, **merge_options)
final_re_output