In [4]:
### imports
import json
from copy import deepcopy

from nltk.tokenize import sent_tokenize, TweetTokenizer
from string import punctuation
import re
import regex
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stopW = stopwords.words('english')
# stopW = ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']
# print(sorted(stopW))

In [7]:
### a method to Read a json file with img captions and article titles and parse it into set of documents

def getDocs(sourcePath):
    medDocs = []
    sourceFile = open(sourcePath, 'r')

    count = 1
    for line in sourceFile:
        medDocs.append(json.loads(line))
        count += 1

    sourceFile.close()
    
    # for k in range(1,count):
    #     print(str(k) + " : " + medDocs[k]['docId'])
    return medDocs


sourcePath = 'assets/medImages_v0.jl'
medDocs = getDocs(sourcePath)
medDocs[262]

{'articleId': 'PMC7282154',
 'image_path': 'PMC7282154-6_i2376-0605-6-2-e50-f06.jpg',
 'image_url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/bin/i2376-0605-6-2-e50-f06.jpg',
 'articleUrl': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/',
 'articleTitle': 'RECURRENT INVASIVE DUCTAL BREAST CARCINOMA PRESENTING AS PRIMARY ADRENAL INSUFFICIENCY WITH ADRENAL CRISIS',
 'caption': 'Histologic section of adrenal metastatic disease (hematoxylin and eosin, Г—100). Tumor cells arranged in solid nests infiltrating lymph node tissue.'}

In [8]:
# method that cleans the title and caption text in each document
# the result is tokenized titles and caption without unwanted brackets, numbers, spaces, tags, unicode charts;
# result includes no punctuation and stopwards and tokens with length less than 2 symbols
# all tokens are lower case


def preprocessContent(content):
    sentences = sent_tokenize(content)
    tokens = []
    for sent in sentences:
        # removes unicode
        cleanedTokens = re.sub(r'[^\x00-\x7F]+', ' ', sent)
        # removes menions
        cleanedTokens = re.sub(r'@\w+', '', cleanedTokens)
        # removes numbers
#         cleanedTokens = re.sub('[0-9]+', '', cleanedTokens)
        # removes nested brackets 
        cleanedTokens = regex.sub(r'\([^()]*+(?:(?R)[^()]*)*+\)', '', cleanedTokens) 
        # removes nested currly brackets
        cleanedTokens = regex.sub('\{(?:[^}{]|\{[^}{]*\})*\}', '', cleanedTokens) 
        # removes html tags
        cleanedTokens = re.sub('\<+/*\w*/*\>+', '', cleanedTokens)
        # removes punctuation
        cleanedTokens = re.sub(r'[%s]' % re.escape(punctuation), ' ', cleanedTokens)
        # removes doubled spaces
        cleanedTokens = re.sub(r'\s{2,}', ' ', cleanedTokens)
        sentTokens = tokenizer.tokenize(cleanedTokens)
        # lower case each token and removes tokens with length less than 2
        sentTokens = [tok.lower() for tok in sentTokens if len(tok) >2]
        # removes stop words 
        sentTokens2 = [tok for tok in sentTokens if not tok.lower() in stopW ]
        tokens += sentTokens2
    return tokens


def cleanDocs(medDocs):
    cleanedDocs = deepcopy(medDocs)
    for i, doc in enumerate(cleanedDocs):
        doc['articleTitle'] = preprocessContent(doc['articleTitle'] )
        doc['caption'] = preprocessContent(doc['caption'])
    return cleanedDocs

cleanedDocs = cleanDocs(medDocs)

In [9]:
print(len(cleanedDocs))
print(medDocs[262]['caption'])
print(cleanedDocs[262]['caption'])
# cleanedDocs[262]

263
Histologic section of adrenal metastatic disease (hematoxylin and eosin, Г—100). Tumor cells arranged in solid nests infiltrating lymph node tissue.
['histologic', 'section', 'adrenal', 'metastatic', 'disease', 'tumor', 'cells', 'arranged', 'solid', 'nests', 'infiltrating', 'lymph', 'node', 'tissue']


In [19]:
# constructing inverted index
def getInvertedIdx(medDocs):
    invertedIdx = {}

    for i, doc in enumerate(medDocs):
        for term in doc['articleTitle']:
            if term in invertedIdx:
                invertedIdx[term].add((doc['articleId'],'t'))
            else:
                invertedIdx[term] = {(doc['articleId'],'t')}
        for term in doc['caption']:
            if term in invertedIdx:
                invertedIdx[term].add((doc['articleId'],'c'))
            else:
                invertedIdx[term] = {(doc['articleId'],'c')}
    return invertedIdx
           
termInvIdx = getInvertedIdx(cleanedDocs)
termInvIdx


{'novel': {('PMC4706273', 't'),
  ('PMC4706278', 't'),
  ('PMC4706287', 't'),
  ('PMC4779106', 'c'),
  ('PMC4779108', 't'),
  ('PMC7279775', 't'),
  ('PMC7279778', 't'),
  ('PMC7282159', 't')},
 'use': {('PMC4706274', 'c'),
  ('PMC4706274', 't'),
  ('PMC4779104', 'c'),
  ('PMC7279778', 't')},
 'stereotactic': {('PMC7279778', 'c'), ('PMC7279778', 't')},
 'radiotherapy': {('PMC7279778', 'c'), ('PMC7279778', 't')},
 'remnant': {('PMC7279778', 't')},
 'adrenal': {('PMC7279778', 'c'),
  ('PMC7279778', 't'),
  ('PMC7282148', 'c'),
  ('PMC7282154', 'c'),
  ('PMC7282154', 't'),
  ('PMC7282157', 'c'),
  ('PMC7282157', 't')},
 'tissue': {('PMC4706291', 'c'),
  ('PMC7279776', 'c'),
  ('PMC7279778', 't'),
  ('PMC7282154', 'c'),
  ('PMC7282159', 'c')},
 'nelson': {('PMC4779109', 'c'), ('PMC7279778', 't')},
 'syndrome': {('PMC7279770', 't'),
  ('PMC7279778', 't'),
  ('PMC7282157', 't'),
  ('PMC7282159', 'c')},
 'radiation': {('PMC4706291', 'c'), ('PMC7279778', 'c')},
 'planning': {('PMC7279778', 'c'

In [20]:
termInvIdx['biopsy']


{('PMC7282159', 'c')}

In [21]:
# Boolean operations:
def orPostings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    n1 = len(posting1)
    n2 = len(posting2)
    while p1 < n1 and p2 <n2:
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] >posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < n1:
        result.append(posting1[p1])
        p1 += 1
    while p2 < n2:
        result.append(posting2[p2])
        p2 += 1
    return result

def andPostings(posting1, posting2):
    p1 = 0
    p2 = 0
    n1 = len(posting1)
    n2 = len(posting2)
    result = list()
    
    while p1 < n1 and p2 < n2:
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
            
    return result 
        

In [22]:

pl_1 = list(termInvIdx['biopsy'])
pl_2 = list(termInvIdx['tissues'])

#pl_3 = list(termInvIdx['histopatology'])
#pl_4 = list(termInvIdx['photomicrographs'])


pl_5 = list(termInvIdx['case'])
pl_6 = list(termInvIdx['report'])

orPostings(pl_1, pl_2)


[('PMC4706283', 'c'), ('PMC7282159', 'c')]

In [23]:

andPostings(pl_5, pl_6)

[('PMC7583549', 'c'), ('PMC4706292', 'c')]

In [24]:
orPostings(orPostings(pl_1, pl_2), andPostings(pl_5, pl_6))

[('PMC4706283', 'c'),
 ('PMC7282159', 'c'),
 ('PMC7583549', 'c'),
 ('PMC4706292', 'c')]