In [1]:
#We extract the pubmed document in BioCJSON format
import urllib3
import json
import csv
import requests
import pandas as pd
import io
import os
#import nltk
#nltk.download('punkt')
#from nltk.tokenize import word_tokenize
from itertools import combinations

http = urllib3.PoolManager()

# Read the PMC IDs from the input file (one per line). Blank lines are skipped
# so that no request is issued with an empty `pmcids=` parameter.
with open("pmcid.txt", "r") as id_file:
    pmcids = [line.strip() for line in id_file if line.strip()]

# One parsed BioC JSON document per PMC ID, in the same order as `pmcids`.
results = []

for pmcid in pmcids:
    r = http.request('GET', f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmcids={pmcid}')
    # Fail with the offending ID instead of letting an error page reach
    # json.loads and surface as an opaque decode error.
    if r.status != 200:
        raise RuntimeError(f'PubTator request for {pmcid} failed with HTTP status {r.status}')
    data = json.loads(r.data.decode('utf-8'))

    # Add the results to the list
    results.append(data)

In [2]:
# Pretty-printed JSON of one document, kept for manual inspection.
# `data` is the loop variable left over from the fetch cell, so this is only
# the LAST document that was downloaded, not the whole `results` list.
output = json.dumps(data, indent=4)
# Uncomment to dump the formatted document into the cell output.
#print(output)

'''
Every PMC id has passages.
Every passage has many {infons, offset, text, sentences, annotations, relations}.
Here, text is the actual text we have to annotate. 
Every annotations has {id, infons, text, locations}. 
Here infons has {identifier, type} (optional: ncbi-homologene if type is gene).
Also locations has {offset, length}.

Passages:
    a)infons - data realted article id, author, etc..
    b)offset - location index
    c)text - whole medical data (sentence) in which medical terms (gene name or disease name) are to be annotated.
    d)sentences - not required here
    e)annotations:
        1)id - key index
        2)infons:
              a)identifier
              b)type - "Gene" or "Disease" etc.,
        3)text - gene name or disease name etc., (Eg: "K-Ras")
        4)locations:
            a)offset - location index
            b)length - length of text. (Eg: len("tumours") = 7) 
    f)relations
'''




'\nEvery PMC id has passages.\nEvery passage has many {infons, offset, text, sentences, annotations, relations}.\nHere, text is the actual text we have to annotate. \nEvery annotations has {id, infons, text, locations}. \nHere infons has {identifier, type} (optional: ncbi-homologene if type is gene).\nAlso locations has {offset, length}.\n\nPassages:\n    a)infons - data realted article id, author, etc..\n    b)offset - location index\n    c)text - whole medical data (sentence) in which medical terms (gene name or disease name) are to be annotated.\n    d)sentences - not required here\n    e)annotations:\n        1)id - key index\n        2)infons:\n              a)identifier\n              b)type - "Gene" or "Disease" etc.,\n        3)text - gene name or disease name etc., (Eg: "K-Ras")\n        4)locations:\n            a)offset - location index\n            b)length - length of text. (Eg: len("tumours") = 7) \n    f)relations\n'

In [3]:
# Empty frames for the two outputs of the annotation step:
#   df1 - one row per entity pair with the sentence after placeholder masking
#   df2 - the same rows with the untouched sentence plus the gene and mutation text
df1, df2 = (
    pd.DataFrame(columns=column_names)
    for column_names in (
        ['index', 'sentence'],
        ['index', 'sentence', 'Gene', 'Mutation'],
    )
)

**Annotations for Gene and Mutation**

In [4]:
# Annotation types that are paired with each other.
_PAIR_TYPES = ('Gene', 'Mutation')


def _mask_entity_pair(sentence, sentence_start, first, second):
    """Replace two annotated entities in `sentence` with `@<type>$` placeholders.

    Parameters
    ----------
    sentence : str
        One sentence of a passage, as produced by splitting on '. '.
    sentence_start : int
        Offset of the first character of `sentence`, in the same coordinate
        system as the annotation offsets.
    first, second : dict
        Annotations with `text`, `infons['type']` and a non-empty `locations`
        list; `first` must not start after `second`.

    Returns
    -------
    str or None
        The masked sentence, or None when either entity does not lie inside
        the sentence or the two spans overlap.
    """
    first_loc = first['locations'][0]
    second_loc = second['locations'][0]
    first_start = first_loc['offset'] - sentence_start
    second_start = second_loc['offset'] - sentence_start

    # Both entities must start inside the sentence. The "+ 1" slack is kept
    # from the original check; presumably it admits an entity whose last
    # character is the '.' consumed by the '. ' split -- TODO confirm.
    for start, entity in ((first_start, first), (second_start, second)):
        if not 0 <= start <= len(sentence) - len(entity['text']) + 1:
            return None

    # Overlapping spans cannot both be replaced: the second replacement would
    # cut into the first placeholder and corrupt the sentence.
    if second_start < first_start + first_loc['length']:
        return None

    # Replace right-to-left so the earlier offset stays valid without any
    # length-difference bookkeeping.
    masked = sentence
    for start, entity, location in ((second_start, second, second_loc),
                                    (first_start, first, first_loc)):
        placeholder = '@' + entity['infons']['type'] + '$'
        masked = masked[:start] + placeholder + masked[start + location['length']:]
    return masked


# Rows are collected in plain lists and turned into frames once at the end.
# This avoids growing a DataFrame row by row and makes the cell safe to
# re-run: a second execution rebuilds df1/df2 instead of appending duplicates.
masked_rows = []     # [index, masked sentence]            -> df1
original_rows = []   # [index, sentence, gene, mutation]   -> df2
sentence_entities = {}
sentence_index = 0

for document in results:
    for passage in document['passages']:
        # Filter out the table segments.
        if passage['infons'].get('section_type') == 'TABLE':
            continue

        # Keep only Gene / Mutation annotations that carry a location,
        # ordered by where they start in the document.
        annotations = sorted(
            (
                annotation for annotation in passage['annotations']
                if annotation['infons']['type'] in _PAIR_TYPES and annotation.get('locations')
            ),
            key=lambda annotation: annotation['locations'][0]['offset'],
        )

        # Every Gene-Mutation pair of the passage. `combinations` preserves
        # the sorted order, so pair[0] never starts after pair[1].
        pairs = [
            pair for pair in combinations(annotations, 2)
            if pair[0]['infons']['type'] != pair[1]['infons']['type']
        ]

        sentence_start = passage['offset']
        for sentence in passage['text'].split('. '):
            # Point to note: a sentence is emitted once per pair it contains.
            for first, second in pairs:
                masked = _mask_entity_pair(sentence, sentence_start, first, second)
                if masked is None:
                    continue

                if first['infons']['type'] == 'Gene':
                    gene, mutation = first, second
                else:
                    gene, mutation = second, first

                sentence_entities[sentence_index] = (first['text'], second['text'])
                masked_rows.append([sentence_index, masked])
                original_rows.append([sentence_index, sentence, gene['text'], mutation['text']])
                sentence_index += 1

            # Advance past this sentence and the '. ' separator removed by split.
            sentence_start += len(sentence) + 2

df1 = pd.DataFrame(masked_rows, columns=['index', 'sentence'])
df2 = pd.DataFrame(original_rows, columns=['index', 'sentence', 'Gene', 'Mutation'])

In [5]:
import os
# Show the directory the notebook is running from before moving.
print(os.getcwd())
# Switch to the relation-extraction folder; the relative output paths used by
# the following cell are resolved against it.
# NOTE(review): the target is relative to the current directory, so running
# this cell a second time starts from the new location -- restart the kernel
# before re-running.
os.chdir('../biobert-pytorch/relation-extraction')

/home/skirupa/Desktop/sem8/RSL-Lab/RSL Final/GDM-BioBert/project-sample


In [6]:
# Write both views of the extracted pairs as tab-separated files:
#   pub_original_sentences_GM.tsv - original sentences with gene / mutation text
#   inputGM/test.tsv              - masked sentences
# to_csv does not create missing directories, so make sure inputGM exists.
os.makedirs('inputGM', exist_ok=True)
df2.to_csv('pub_original_sentences_GM.tsv', sep="\t", index=False)
df1.to_csv('inputGM/test.tsv', sep="\t", index=False)

In [17]:
# Number of rows in df1, the frame exported to inputGM/test.tsv.
# NOTE(review): this cell carries execution count 17, i.e. it was run after
# the later cells, so the displayed value may not match a fresh top-to-bottom run.
len(df1)

178

**BioBert Training**

In [7]:
import os
# Show the current directory before moving.
print(os.getcwd())
# Step up one directory level; presumably so the commented-out download.sh in
# the next cell can be run from the repository root -- TODO confirm.
# NOTE(review): relative move, so the result depends on the cells run before it.
os.chdir('..')

/home/skirupa/Desktop/sem8/RSL-Lab/RSL Final/GDM-BioBert/biobert-pytorch/relation-extraction


In [8]:
# Download all datasets including NER/RE/QA
#!bash ./download.sh

In [9]:
# Undo the earlier os.chdir('..'): step back into relation-extraction,
# printing the working directory before and after the move.
print(os.getcwd())
os.chdir('relation-extraction')
print(os.getcwd())
# Run the preprocessing script found in this directory; the recorded output
# shows it handling the euadr and GAD datasets.
!bash ./preprocess.sh

/home/skirupa/Desktop/sem8/RSL-Lab/RSL Final/GDM-BioBert/biobert-pytorch
/home/skirupa/Desktop/sem8/RSL-Lab/RSL Final/GDM-BioBert/biobert-pytorch/relation-extraction
*****  euadr  Preprocessing Start *****
*****  euadr  Preprocessing Done *****
*****  GAD  Preprocessing Start *****
*****  GAD  Preprocessing Done *****


In [10]:
#!pip install scikit-learn
#!pip install pandas

In [11]:
%env SAVE_DIR=./outputGM
%env DATA="GAD"
%env SPLIT="1"
%env DATA_DIR=./inputGM
%env ENTITY=${DATA}-${SPLIT}

%env MAX_LENGTH=128
%env BATCH_SIZE=32
%env NUM_EPOCHS=3
%env SAVE_STEPS=1000
%env SEED=1

env: SAVE_DIR=./outputGM
env: DATA="GAD"
env: SPLIT="1"
env: DATA_DIR=./inputGM
env: ENTITY=${DATA}-${SPLIT}
env: MAX_LENGTH=128
env: BATCH_SIZE=32
env: NUM_EPOCHS=3
env: SAVE_STEPS=1000
env: SEED=1


In [12]:
# Run run_re.py in prediction-only mode (--do_predict, no --do_train) on the
# data in ${DATA_DIR}, writing its output to ${SAVE_DIR}.
# Only DATA_DIR and SAVE_DIR are passed on; MAX_LENGTH, BATCH_SIZE, NUM_EPOCHS,
# SAVE_STEPS and SEED from the environment cell are not forwarded, and the
# logged TrainingArguments show the script's own values (seed=42,
# save_steps=500, batch size 8).
# NOTE(review): the model is loaded from dmis-lab/biobert-base-cased-v1.1 with
# no training step in this notebook -- confirm that this checkpoint already
# has a fine-tuned classification head, otherwise the predictions are not meaningful.
!python run_re.py --task_name SST-2 --config_name bert-base-cased --model_name_or_path dmis-lab/biobert-base-cased-v1.1 \
        --do_predict --data_dir ${DATA_DIR} \
        --output_dir ${SAVE_DIR} \
        --overwrite_output_dir


05/02/2023 11:29:11 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./outputGM', overwrite_output_dir=True, do_train=False, do_eval=False, do_predict=True, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/May02_11-29-02_skirupa', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=

In [13]:
# Load the two tab-separated files back from disk: the exported original
# sentences and the result file found under outputGM.
original_sentences, predictions = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        'pub_original_sentences_GM.tsv',
        'outputGM/test_results.txt',
    )
)


In [14]:
# Attach each prediction to its source sentence through the shared 'index'
# column. The left join keeps every sentence row; rows without a matching
# prediction end up with NaN in the prediction column.
final_re_output = original_sentences.merge(predictions, how='left', on='index')
final_re_output

Unnamed: 0,index,sentence,Gene,Mutation,prediction
0,0,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Leu19Phe,1.0
1,1,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Lys117Asn,1.0
2,2,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Ala146Thr,1.0
3,3,Four additional K-Ras mutations (Leu19Phe (1 o...,K-Ras,Arg164Gln,1.0
4,4,Lys117Asn and Ala146Thr had phenotypes similar...,K-Ras,Lys117Asn,1.0
...,...,...,...,...,...
173,173,In order to reconstitute HCC1937 cells with wi...,BRCA1,S11N,
174,174,This work was funded by the Spanish Ministries...,FIS,C1287/A8874,
175,175,The 185delAG mutation (c.68_69delAG) in the BR...,BRCA1,185delAG,
176,176,The 185delAG mutation (c.68_69delAG) in the BR...,BRCA1,c.68_69delAG,


In [15]:
#store the result into csv
#os.chdir('outputGM')
#final_re_output.to_csv('final_GM_output.tsv', sep="\t", index=False)

In [16]:
#store the dataframe into psql table
import os

import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# The database password is no longer embedded in the notebook. Either set
# GDMBIOBERT_DB_URL to a full connection URL, or keep the default below and
# supply the password through libpq's PGPASSWORD variable or ~/.pgpass file.
conn_string = os.environ.get(
    'GDMBIOBERT_DB_URL',
    'postgresql://postgres@localhost:5432/gdmbiobert',
)

db = create_engine(conn_string)

# engine.begin() commits when the block succeeds, rolls back on error and
# releases the connection either way, so the write is persisted and not leaked.
with db.begin() as sa_connection:
    final_re_output.to_sql('gm', con=sa_connection, if_exists='replace', index=False)

# Read the table back through psycopg2 and print every stored row.
conn = psycopg2.connect(conn_string)
try:
    conn.autocommit = True
    cursor = conn.cursor()

    sql1 = '''select * from gm;'''
    cursor.execute(sql1)
    for row in cursor.fetchall():
        print(row)
finally:
    # Close the connection even if the query fails.
    conn.close()


(0, 'Four additional K-Ras mutations (Leu19Phe (1 out of 106 tumours), Lys117Asn (1 out of 106), Ala146Thr (7 out of 106) and Arg164Gln (1 out of 106)) were identified', 'K-Ras', 'Leu19Phe', 1.0)
(1, 'Four additional K-Ras mutations (Leu19Phe (1 out of 106 tumours), Lys117Asn (1 out of 106), Ala146Thr (7 out of 106) and Arg164Gln (1 out of 106)) were identified', 'K-Ras', 'Lys117Asn', 1.0)
(2, 'Four additional K-Ras mutations (Leu19Phe (1 out of 106 tumours), Lys117Asn (1 out of 106), Ala146Thr (7 out of 106) and Arg164Gln (1 out of 106)) were identified', 'K-Ras', 'Ala146Thr', 1.0)
(3, 'Four additional K-Ras mutations (Leu19Phe (1 out of 106 tumours), Lys117Asn (1 out of 106), Ala146Thr (7 out of 106) and Arg164Gln (1 out of 106)) were identified', 'K-Ras', 'Arg164Gln', 1.0)
(4, 'Lys117Asn and Ala146Thr had phenotypes similar to the hotspot mutations, whereas Leu19Phe had an attenuated phenotype and the Arg164Gln mutation was phenotypically equivalent to wt K-Ras', 'K-Ras', 'Lys117Asn