In [1]:
!sudo apt install tesseract-ocr
!sudo apt-get install poppler-utils 
!pip install pytesseract pdf2image zero-shot-re neo4j

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 39 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 2s (2,777 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl

In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive; force_remount=True is passed so
# the call remounts even when a mount is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/bioNLP

/content/drive/MyDrive/bioNLP


In [3]:
import requests
import pdf2image
import pytesseract

pdf = '/content/drive/MyDrive/bioNLP/2111.09414.pdf'
# Rasterise the PDF; `doc` holds one image per page.
doc = pdf2image.convert_from_path(pdf)

# Only the first MAX_BODY_PAGES pages are kept as article text; the pages
# after them are treated as references and skipped.
# NOTE(review): the original comment said "sixth page are only references"
# while the code kept pages 0-5 -- confirm the intended cut-off.
MAX_BODY_PAGES = 6

# Get the article text. Slicing before OCR avoids running tesseract on pages
# that would be thrown away anyway.
article = [pytesseract.image_to_string(page_data) for page_data in doc[:MAX_BODY_PAGES]]
article_txt = " ".join(article)


In [4]:
article_txt

"Developmental Status and Perspectives for Tissue Engineering in Urology.\n\nElcin Huseyn\nAzerbaijan State Oil and Industry\nUniversity, Baku, Azerbaijan\n\nelcin.huseyn@asoiu.edu.az\n\nAbstract: Tissue engineering technology and tissue cell-based stem cell research have made\ngreat strides in treating tissue and organ damage, correcting tissue and organ dysfunction, and\nreducing surgical complications. In the past, traditional methods have used biological substitutes for\ntissue repair materials, while tissue engineering technology has focused on merging sperm cells\nwith biological materials to form biological tissues with the same structure and function as their\nown tissues. The advantage is that tissue engineering technology can overcome donors. Material\nprocurement restrictions can effectively reduce complications. The aim of studying tissue\nengineering technology is to find sperm cells and suitable biological materials to replace the\noriginal biological functions of tissues

## Text preprocessing
Now that we have the article content available, we will go ahead and remove section titles and figure descriptions from the text. Next, we will split the text into sentences.

In [5]:
import nltk
nltk.download('punkt')

def clean_text(text):
  """Drop short rows and figure-caption rows from newline-separated text.

  A row is kept only when splitting it on single spaces yields more than
  three pieces and it does not start with "(a)" or "Figure". The surviving
  rows are joined back together with newlines.
  """
  kept_rows = []
  for line in text.split("\n"):
    # Rows with three or fewer space-separated pieces are treated as titles.
    if len(line.split(" ")) <= 3:
      continue
    # Rows opening with "(a)" or "Figure" are treated as figure descriptions.
    if line.startswith(("(a)", "Figure")):
      continue
    kept_rows.append(line)
  return "\n".join(kept_rows)

# Keep everything after the FIRST "Abstract" marker. maxsplit=1 prevents a
# later occurrence of the word from truncating the article, and [-1] falls
# back to the whole text when the marker is absent instead of raising.
text = article_txt.split("Abstract", 1)[-1]
# Strip titles / figure captions, then break the remainder into sentences.
ctext = clean_text(text)
sentences = nltk.tokenize.sent_tokenize(ctext)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
import hashlib
import requests

#http://bern2.korea.ac.kr/documentation

def query_raw(text, url="http://bern2.korea.ac.kr/plain", timeout=60):
    """POST `text` as JSON to the annotation endpoint and return the decoded reply.

    Args:
        text: The text to send under the "text" key of the JSON payload.
        url: Endpoint that receives the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If no response arrives within `timeout`.
    """
    response = requests.post(url, json={'text': text}, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing decode/KeyError later.
    response.raise_for_status()
    return response.json()

# Example usage:
# if __name__ == '__main__':
#     text = "Autophagy maintains tumour growth through circulating arginine."
#     print(query_raw(text))

# Annotate every sentence except the final one, which is invalid
entity_list = [query_raw(sentence) for sentence in sentences[:-1]]

def _text_digest(text):
  """Return the hex SHA-256 digest of `text` encoded as UTF-8."""
  return hashlib.sha256(text.encode('utf-8')).hexdigest()

# Turn each raw annotation response into a flat record per sentence.
parsed_entities = []
for response in entity_list:
  annotations = response.get('annotations')
  # Sentences without annotations get a record with no 'entities' key
  if not annotations:
    sentence_text = response['text']
    parsed_entities.append({'text': sentence_text, 'text_sha256': _text_digest(sentence_text)})
    continue

  mentions = []
  for annotation in annotations:
    ids = annotation['id']
    other_ids = [ident for ident in ids if not ident.startswith("BERN")]
    entity_type = annotation['obj']
    span = annotation['span']
    entity_name = response['text'][span['begin']:span['end']]
    # Use the first id starting with "BERN"; fall back to the surface form
    entity_id = next((ident for ident in ids if ident.startswith("BERN")), entity_name)
    mentions.append({'entity_id': entity_id, 'other_ids': other_ids, 'entity_type': entity_type, 'entity': entity_name})

  sentence_text = response['text']
  parsed_entities.append({'entities': mentions, 'text': sentence_text, 'text_sha256': _text_digest(sentence_text)})

In [7]:
import json
# Save the OCR text. The encoding is pinned to UTF-8 because OCR output may
# contain non-ASCII characters that the platform default codec cannot encode.
with open('article.txt', 'w', encoding='utf-8') as f:
    f.write(article_txt)


# Save the per-sentence entity records as a single JSON document.
json_string = json.dumps(parsed_entities)
with open('parsed_entities.json', 'w', encoding='utf-8') as outfile:
    outfile.write(json_string)

In [8]:
from transformers import AutoTokenizer
from zero_shot_re import RelTaggerModel, RelationExtractor

# Checkpoint identifiers for the relation tagger and its tokenizer
RELATION_MODEL_NAME = "fractalego/fewrel-zero-shot"
TOKENIZER_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Relation labels handed to the extractor
relations = ['associated', 'interacts']

model = RelTaggerModel.from_pretrained(RELATION_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
extractor = RelationExtractor(model, tokenizer, relations)

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import itertools
# Score the top-ranked relation must exceed for a pair to be recorded
REL_SCORE_THRESHOLD = 0.85

# Candidate sentences: those where more than a single entity is present
candidates = [s for s in parsed_entities if (s.get('entities')) and (len(s['entities']) > 1)]
predicted_rels = []
for c in candidates:
  mentions = [{'name': x['entity'], 'id': x['entity_id']} for x in c['entities']]
  # Every unordered pair of entity mentions in the sentence is scored once
  for head, tail in itertools.combinations(mentions, 2):
    try:
      ranked_rels = extractor.rank(text=c['text'].replace(",", " "), head=head['name'], tail=tail['name'])
      # ranked_rels[0] is taken as the most probable (relation, score) pair
      # NOTE(review): presumably sorted best-first -- confirm against zero_shot_re
      if ranked_rels and ranked_rels[0][1] > REL_SCORE_THRESHOLD:
        predicted_rels.append({'head': head['id'], 'tail': tail['id'], 'type': ranked_rels[0][0], 'source': c['text_sha256']})
    except Exception as exc:
      # Still best-effort: one failing pair must not abort the whole run, but
      # report it instead of hiding it. Catching Exception (not a bare except)
      # lets KeyboardInterrupt stop the cell.
      print(f"Skipping pair ({head['name']!r}, {tail['name']!r}): {exc!r}")


In [None]:
# Serialise the predicted relations and write them to predicted_rels.json
# in the current working directory.
json_string = json.dumps(predicted_rels)
with open('predicted_rels.json', 'w') as outfile:
    outfile.write(json_string)