In [1]:
### imports
import json
import pandas as pd
from nltk.tokenize import sent_tokenize, TweetTokenizer
from string import punctuation
import re
import regex
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stopW = stopwords.words('english')
# stopW = ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']
# print(sorted(stopW))

In [2]:
### a method to Read a json file with img captions and article titles and parse it into set of documents

def getDocs(sourcePath):
    medDocs = []
    sourceFile = open(sourcePath, 'r')

    count = 1
    for line in sourceFile:
        medDocs.append(json.loads(line))
        count += 1

    sourceFile.close()
    
    # for k in range(1,count):
    #     print(str(k) + " : " + medDocs[k]['docId'])
    return medDocs


sourcePath = 'assets/medImages.jl'
medDocs = getDocs(sourcePath)
medDocs[262]

{'articleId': 'PMC7282154',
 'image_path': 'PMC7282154-6_i2376-0605-6-2-e50-f06.jpg',
 'image_url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/bin/i2376-0605-6-2-e50-f06.jpg',
 'articleUrl': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/',
 'articleTitle': 'RECURRENT INVASIVE DUCTAL BREAST CARCINOMA PRESENTING AS PRIMARY ADRENAL INSUFFICIENCY WITH ADRENAL CRISIS',
 'caption': 'Histologic section of adrenal metastatic disease (hematoxylin and eosin, Г—100). Tumor cells arranged in solid nests infiltrating lymph node tissue.'}

In [3]:
# method that cleans the title and caption text in each document
# the result is tokenized titles and caption without unwanted brackets, numbers, spaces, tags, unicode charts;
# result includes no punctuation and stopwards and tokens with length less than 2 symbols
# all tokens are lower case


def preprocessContent(content):
    sentences = sent_tokenize(content)
    tokens = []
    for sent in sentences:
        # removes unicode
        cleanedTokens = re.sub(r'[^\x00-\x7F]+', ' ', sent)
        # removes menions
        cleanedTokens = re.sub(r'@\w+', '', cleanedTokens)
        # removes numbers
        cleanedTokens = re.sub('[0-9]+', '', cleanedTokens)
        # removes nested brackets 
        cleanedTokens = regex.sub(r'\([^()]*+(?:(?R)[^()]*)*+\)', '', cleanedTokens) 
        # removes nested currly brackets
        cleanedTokens = regex.sub('\{(?:[^}{]|\{[^}{]*\})*\}', '', cleanedTokens) 
        # removes html tags
        cleanedTokens = re.sub('\<+/*\w*/*\>+', '', cleanedTokens)
        # removes punctuation
        cleanedTokens = re.sub(r'[%s]' % re.escape(punctuation), ' ', cleanedTokens)
        # removes doubled spaces
        cleanedTokens = re.sub(r'\s{2,}', ' ', cleanedTokens)
        sentTokens = tokenizer.tokenize(cleanedTokens)
        # lower case each token and removes tokens with length less than 2
        sentTokens = [tok.lower() for tok in sentTokens if len(tok) >2]
        # removes stop words 
        sentTokens2 = [tok for tok in sentTokens if not tok.lower() in stopW ]
        
        ### implement stemming! play = played and/or stemming ?!?
        tokens += sentTokens2
    return tokens


def cleanDocs(medDocs):
    cleanedDocs = medDocs
    for i, doc in enumerate(cleanedDocs):
        doc['articleTitle'] = preprocessContent(doc['articleTitle'] )
        doc['caption'] = preprocessContent(doc['caption'])
 
    return cleanedDocs

cleanedDocs = cleanDocs(medDocs)

In [4]:
print(len(cleanedDocs))
cleanedDocs[262]
# print(cleanedDocs[262]['articleTitle'])
# print(cleanedDocs[262]['caption'])

263


{'articleId': 'PMC7282154',
 'image_path': 'PMC7282154-6_i2376-0605-6-2-e50-f06.jpg',
 'image_url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/bin/i2376-0605-6-2-e50-f06.jpg',
 'articleUrl': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7282154/',
 'articleTitle': ['recurrent',
  'invasive',
  'ductal',
  'breast',
  'carcinoma',
  'presenting',
  'primary',
  'adrenal',
  'insufficiency',
  'adrenal',
  'crisis'],
 'caption': ['histologic',
  'section',
  'adrenal',
  'metastatic',
  'disease',
  'tumor',
  'cells',
  'arranged',
  'solid',
  'nests',
  'infiltrating',
  'lymph',
  'node',
  'tissue']}

In [5]:
### create a bag of words 
def createBagsOfWOrds(corpus):
    bagOfWordsTitle = []
    bagOfWordsCaption = []
    for doc in corpus:
        for term in doc['articleTitle']:
            if term not in bagOfWordsTitle:
                bagOfWordsTitle.append(term)
        for term in doc['caption']:
            if term not in bagOfWordsCaption:
                bagOfWordsCaption.append(term)
    
    return bagOfWordsTitle, bagOfWordsCaption

bagOfWordsTitle, bagOfWordsCaption = createBagsOfWOrds(cleanedDocs)

print(bagOfWordsTitle[:10], bagOfWordsCaption[:10])

                

['novel', 'use', 'stereotactic', 'radiotherapy', 'remnant', 'adrenal', 'tissue', 'nelson', 'syndrome', 'manifestations'] ['adrenal', 'radiation', 'planning', 'depicted', 'computed', 'tomography', 'scan', 'abdomen', 'planned', 'target']


In [6]:
# constructing inverted index
def getInvertedIdx(medDocs):
    invertedIdx = {}

    for i, doc in enumerate(medDocs):
        for term in doc['articleTitle']:
            if term in invertedIdx:
                invertedIdx[term].append((i,'t'))
            else:
                invertedIdx[term] = [(i,'t')]
        for term in doc['caption']:
            if term in invertedIdx:
                invertedIdx[term].append((i,'c'))
            else:
                invertedIdx[term] = [(i,'c')]
    return invertedIdx
           
termInvIdx = getInvertedIdx(cleanedDocs)
termInvIdx


{'novel': [(0, 't'),
  (1, 't'),
  (16, 't'),
  (22, 't'),
  (23, 't'),
  (24, 't'),
  (25, 't'),
  (26, 't'),
  (27, 't'),
  (96, 't'),
  (97, 't'),
  (98, 't'),
  (99, 't'),
  (100, 't'),
  (102, 't'),
  (103, 't'),
  (106, 't'),
  (133, 't'),
  (134, 't'),
  (135, 't'),
  (136, 't'),
  (137, 't'),
  (138, 't'),
  (139, 't'),
  (140, 't'),
  (145, 'c'),
  (199, 't'),
  (200, 't'),
  (201, 't'),
  (202, 't'),
  (252, 't'),
  (253, 't')],
 'use': [(0, 't'),
  (1, 't'),
  (141, 't'),
  (142, 't'),
  (142, 'c'),
  (143, 't'),
  (209, 'c')],
 'stereotactic': [(0, 't'), (0, 'c'), (0, 'c'), (1, 't'), (1, 'c')],
 'radiotherapy': [(0, 't'), (0, 'c'), (0, 'c'), (1, 't'), (1, 'c')],
 'remnant': [(0, 't'), (1, 't')],
 'adrenal': [(0, 't'),
  (0, 'c'),
  (0, 'c'),
  (0, 'c'),
  (1, 't'),
  (28, 't'),
  (28, 't'),
  (29, 't'),
  (29, 't'),
  (29, 'c'),
  (30, 't'),
  (30, 't'),
  (30, 'c'),
  (30, 'c'),
  (247, 'c'),
  (248, 'c'),
  (249, 'c'),
  (249, 'c'),
  (250, 'c'),
  (257, 't'),
  (257, 't'

In [7]:
termInvIdx['biopsy']
termInvIdx['histologic']


[(262, 'c')]

In [8]:
# sort the dict
sortedTermsPostings = {}
sortedTermsPostings = sorted(termInvIdx.items(),key=lambda x: x[0])

sortedTermsPostings = {k:v for k,v in sortedTermsPostings}

sortedTermsPostings


{'aaa': [(84, 'c'), (85, 'c'), (86, 'c'), (86, 'c')],
 'abbreviations': [(243, 'c')],
 'abdomen': [(0, 'c'), (4, 'c'), (253, 'c'), (253, 'c'), (253, 'c')],
 'abdominal': [(29, 'c')],
 'abel': [(165, 'c')],
 'ability': [(46, 'c'), (47, 'c'), (79, 'c')],
 'able': [(86, 'c')],
 'abnormal': [(252, 'c')],
 'abnormalities': [(252, 'c'), (252, 'c')],
 'abnormality': [(0, 'c')],
 'abs': [(9, 'c'), (111, 'c'), (111, 'c')],
 'absence': [(22, 'c'), (35, 'c'), (174, 'c'), (175, 'c')],
 'absolute': [(44, 'c')],
 'absolutely': [(110, 'c')],
 'absorbed': [(237, 'c')],
 'absorptiometry': [(252, 'c')],
 'absorption': [(64, 't'),
  (65, 't'),
  (66, 't'),
  (104, 'c'),
  (104, 'c'),
  (104, 'c'),
  (104, 'c'),
  (107, 'c'),
  (107, 'c'),
  (107, 'c'),
  (108, 'c'),
  (108, 'c'),
  (108, 'c'),
  (109, 'c'),
  (109, 'c'),
  (109, 'c'),
  (110, 'c'),
  (110, 'c'),
  (110, 'c'),
  (110, 'c'),
  (110, 'c'),
  (111, 'c'),
  (111, 'c'),
  (111, 'c'),
  (111, 'c'),
  (158, 'c'),
  (195, 'c'),
  (195, 'c'),
  (2

In [9]:
sortedTermsPostings['biopsy']

[(252, 'c'), (253, 'c')]

In [10]:
### TF - the number of times a term appears in a document/ devided by the total number of terms -v01

def computeTFperDoc(doc):
    titleN = len(doc['articleTitle'])
    captionN = len(doc['caption'])
    totalN = titleN + titleN
#     print(doc['articleTitle'])
#     print('****************************')
#     print(doc['caption'])
#     print('****************************')
    bagOfWords = doc['articleTitle'] + doc['caption']
#     print(bagOfWords)
    termFreq = {}
    for term in bagOfWords:
        termFreq[term] = (doc['articleTitle'].count(term)/titleN, doc['caption'].count(term)/captionN)
    return termFreq


tf_262 = computeTFperDoc(cleanedDocs[262])
tf_262

def computeTF(corpus):
    tfDict = []
    for doc in corpus: 
        tfDict.append(computeTFperDoc(doc))
    return tfDict

tfDict = computeTF(cleanedDocs)
# print(tfDict)
print(tfDict[0])

{'novel': (0.1111111111111111, 0.0), 'use': (0.1111111111111111, 0.0), 'stereotactic': (0.1111111111111111, 0.02857142857142857), 'radiotherapy': (0.1111111111111111, 0.02857142857142857), 'remnant': (0.1111111111111111, 0.0), 'adrenal': (0.1111111111111111, 0.04285714285714286), 'tissue': (0.1111111111111111, 0.0), 'nelson': (0.1111111111111111, 0.0), 'syndrome': (0.1111111111111111, 0.0), 'radiation': (0.0, 0.02857142857142857), 'planning': (0.0, 0.014285714285714285), 'depicted': (0.0, 0.014285714285714285), 'computed': (0.0, 0.014285714285714285), 'tomography': (0.0, 0.014285714285714285), 'scan': (0.0, 0.014285714285714285), 'abdomen': (0.0, 0.014285714285714285), 'planned': (0.0, 0.014285714285714285), 'target': (0.0, 0.02857142857142857), 'volume': (0.0, 0.014285714285714285), 'sbrt': (0.0, 0.02857142857142857), 'included': (0.0, 0.02857142857142857), 'safety': (0.0, 0.014285714285714285), 'margin': (0.0, 0.014285714285714285), 'around': (0.0, 0.014285714285714285), 'radiologic'

In [48]:
### Compute TF - v02
def computeTermFreq(term, doc):
    return (doc['articleTitle'].count(term), doc['caption'].count(term))

def computeTF(term,doc):
    sumFreqT = 0
    sumFreqC = 0
    totalN = 0
    docText = doc['articleTitle'] + doc['caption'] 
    totalN = len(docText)
    if totalN > 0:
        for t in docText:
            freq= computeTermFreq(t, doc)
            sumFreqT += freq[0]
            sumFreqC += freq[1]
        try:
            termFreq = computeTermFreq(term, doc)
            return (termFreq[0]/sumFreqT, termFreq[1]/sumFreqC)
        except Exception as e:
            print(e)
            return (termFreq[0]/totalN, termFreq[1]/totalN)
    else:
        print("The document is empty!")

tf_1 = computeTF('novel', cleanedDocs[0])
print(tf_1)

        
def computeTFforCorpus(corpus):
    tfs = {}
    for doc in corpus:
        docText = docText = doc['articleTitle'] + doc['caption'] 
        for term in docText:
            tfs[term] = computeTF(term, doc)
#             print(term, tfs[term])
    return tfs
medDocsTFs = computeTFforCorpus(cleanedDocs)
print(len(medDocsTFs))
medDocsTFs

(0.0625, 0.0)
2587


{'novel': (0.04, 0.0),
 'use': (0.0, 0.041666666666666664),
 'stereotactic': (0.09090909090909091, 0.04),
 'radiotherapy': (0.09090909090909091, 0.04),
 'remnant': (0.09090909090909091, 0.0),
 'adrenal': (0.13333333333333333, 0.0625),
 'tissue': (0.0, 0.0625),
 'nelson': (0.0, 0.014492753623188406),
 'syndrome': (0.0, 0.0020833333333333333),
 'radiation': (0.0, 0.021671826625386997),
 'planning': (0.0, 0.009900990099009901),
 'depicted': (0.0, 0.009900990099009901),
 'computed': (0.0, 0.09090909090909091),
 'tomography': (0.0, 0.09090909090909091),
 'scan': (0.0, 0.0015432098765432098),
 'abdomen': (0.0, 0.004629629629629629),
 'planned': (0.0, 0.009900990099009901),
 'target': (0.0, 0.004291845493562232),
 'volume': (0.0, 0.014492753623188406),
 'sbrt': (0.0, 0.04),
 'included': (0.0, 0.007633587786259542),
 'safety': (0.0, 0.009900990099009901),
 'margin': (0.0, 0.007692307692307693),
 'around': (0.0, 0.0015432098765432098),
 'radiologic': (0.0, 0.009900990099009901),
 'abnormality':

In [44]:
### compute IDF 
import math 

def computeIDF(medDocs):
    idfTitle = {}
    idfCaption = {}
    totalN = len(medDocs)
    for i, doc in enumerate(medDocs):
        for term in doc['articleTitle']:
            if term in idfTitle:
                idfTitle[term].add(i)
            else:
                idfTitle[term] = {i}
        
        for term in doc['caption']:
            if term in idfCaption:
                idfCaption[term].add(i)
            else:
                idfCaption[term] = {i}

    for term in idfTitle:
        idfTitle[term] = math.log10(totalN/len(idfTitle[term]))
    for term in idfCaption:
        idfCaption[term] = math.log10(totalN/len(idfCaption[term]))
    
    return idfTitle, idfCaption

# idfT, idfC = computeIDF(cleanedDocs)
# # print(idfT)
# print(idfT['tissue'], idfC['tissue'])
idfs = computeIDF(cleanedDocs)
print(len(idfs[0]), len(idfs[1]))
idfs


452 2429


({'novel': 0.9285940546554852,
  'use': 1.720985744153739,
  'stereotactic': 2.1189257528257768,
  'radiotherapy': 2.1189257528257768,
  'remnant': 2.1189257528257768,
  'adrenal': 1.378563063331533,
  'tissue': 2.1189257528257768,
  'nelson': 2.1189257528257768,
  'syndrome': 1.465713239050433,
  'manifestations': 1.8178957571617955,
  'glucagonoma': 1.8178957571617955,
  'case': 1.2738277128115199,
  'acquired': 1.9428344937700954,
  'generalized': 1.720985744153739,
  'lipodystrophy': 1.720985744153739,
  'associated': 1.5168657614978143,
  'pembrolizumab': 1.9428344937700954,
  'patient': 1.9428344937700954,
  'metastatic': 1.8178957571617955,
  'malignant': 1.9428344937700954,
  'melanoma': 1.9428344937700954,
  'recurrent': 1.5168657614978143,
  'painful': 2.1189257528257768,
  'hashimoto': 2.1189257528257768,
  'thyroiditis': 2.1189257528257768,
  'successfully': 2.1189257528257768,
  'treated': 1.5168657614978143,
  'thyroidectomy': 1.8178957571617955,
  'hypothyroidism': 1.942

In [40]:
###  compute TF - IDF
### TF-IDF(document) = TF-IDF(title) * alpha + TF-IDF(body) * (1-alpha)


def computeTFIDF(corpus):
    alpha = 0.35
#     bagOfWordsTitle, bagOfWordsCaption = createBagsOfWOrds(cleanedcorpusDocs)
    idfs = computeIDF(corpus)
    tfidf = {}
    idfT = 0
    idfC = 0 
    for doc in corpus:
        docText = doc['articleTitle'] + doc['caption'] 
        for term in docText:
                tf = computeTF(term, doc)
                idfT = idfs[0][term]
                idfC = idfs[1][term]
                tfidfT = tf[0]*idfT
                tfidfC = tf[1]*idfC
                tfidf[term] = tfidfT*alpha + tfidfC*(1-alpha)
    return tfidf
            
        
    

In [41]:
medDocs_tfidf = computeTFIDF(cleanedDocs)

KeyError: 'remnant'