In [1]:
import numpy as np
import pandas as pd
import os
import json

In [2]:
# Both input files live in the same directory, so spell that directory out once
SUBSET_DIR = r"D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset"
# .npy file with the embedding matrix
EMBEDS_PATH = rf"{SUBSET_DIR}\acute_abdpain_vectors.npy"
# JSON file with the chunk records
CHUNKS_PATH = rf"{SUBSET_DIR}\_pubmed_combined.json"

In [3]:
# Load the embedding matrix and the chunk records, then print both sizes so
# they can be compared.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects
# from the file; only keep it if the .npy file is trusted and really needs it.
embeds = np.load(EMBEDS_PATH, allow_pickle=True)
# Use a context manager so the JSON file handle is closed after parsing
with open(CHUNKS_PATH, "r") as chunks_file:
    chunks = json.load(chunks_file)
print(f"Embeds shape: {embeds.shape}")
print(f"Chunks length: {len(chunks)}")

Embeds shape: (124854, 768)
Chunks length: 124854


In [4]:
# Peek at a single record to see which fields a chunk carries.
# Take the first key straight from the iterator instead of copying every key
# into a list, and use it for the lookup so the name is not left unused.
first_key = next(iter(chunks))
print(chunks[first_key])

{'d': '19751001', 't': 'Meso-appendicular testis.', 'a': '', 'm': 'adult!|appendicitis!|appendicitis!complications|appendicitis!complications*|appendix!|appendix!abnormalities|appendix!abnormalities*|cryptorchidism!|humans!|male!|'}


In [5]:
# Walk every chunk and report: the number of chunks, the number of distinct
# values of the 'd' field (used here as the document id), and the average
# number of characters in the 'a' field per chunk.
# NOTE(review): the sample record printed earlier in this notebook has
# 'd': '19751001', which looks like a YYYYMMDD date rather than an id -
# confirm that 'd' really identifies a document.
seen_documents = set()
total_characters = 0
for value in chunks.values():
    seen_documents.add(value['d'])
    total_characters += len(value['a'])

total_chunks = len(chunks)
total_documents = len(seen_documents)
print(f"Total chunks: {total_chunks}")
print(f"Total documents: {total_documents}")
print(f"Total characters: {total_characters}")
# Guard the division so an empty chunk file reports 0.0 instead of raising
# ZeroDivisionError
average_characters_per_chunk = total_characters / total_chunks if total_chunks else 0.0
print(f"Average characters per chunk: {average_characters_per_chunk}")


Total chunks: 124854
Total documents: 10121
Total characters: 123419073
Average characters per chunk: 988.5071603633044


In [6]:
# Build, for each pathology, the list of document ids that have at least one
# chunk whose tag string mentions that pathology (plain substring match).
# A chunk that mentions none of them puts its document under "other" and its
# raw tag string into other_tags. Because the decision is made per chunk, a
# document can appear in several lists, so the per-label counts can add up to
# more than the number of unique documents.
# NOTE(review): 'd' is used as the document id here - confirm that it is one.
pathologies = ['appendicitis', 'cholecystitis', 'diverticulitis', 'pancreatitis']

# Lists keep first-seen order for the JSON dump; the parallel sets make the
# "already recorded?" check O(1) instead of a scan of the whole list per chunk.
counts = {label: [] for label in pathologies + ["other"]}
counted_ids = {label: set() for label in counts}

other_tags = []

for value in chunks.values():
    tags = value['m']
    document_id = value['d']

    labels = [pathology for pathology in pathologies if pathology in tags]
    if not labels:
        other_tags.append(tags)
        labels = ["other"]

    for label in labels:
        if document_id not in counted_ids[label]:
            counted_ids[label].add(document_id)
            counts[label].append(document_id)

for pathology in pathologies:
    print(f"Count of {pathology}: {len(counts[pathology])}")
print(f"Count of other pathologies: {len(counts['other'])}")
# seen_documents is the set of document ids collected by the statistics cell
print(f"Total unique documents: {len(seen_documents)}")
print(f"sum of all counts: {sum(len(v) for v in counts.values())}")
SAVE_PATH = r"D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset\_docs_tags.json"
with open(SAVE_PATH, "w") as f:
    json.dump(counts, f, indent=4)
print(f"Saved counts to {SAVE_PATH}")

Count of appendicitis: 5707
Count of cholecystitis: 4108
Count of diverticulitis: 2979
Count of pancreatitis: 8362
Count of other pathologies: 7946
Total unique documents: 10121
sum of all counts: 29102
Saved counts to D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset\_docs_tags.json


In [7]:
# Go over all the chunks and collect, per document id, the distinct tag
# strings seen on its chunks (each entry is a chunk's whole 'm' string, kept in
# first-seen order).
# NOTE(review): 'd' is used as the document id here - confirm that it is one.
documents_tags = {}

for value in chunks.values():
    tag_strings = documents_tags.setdefault(value['d'], [])
    if value['m'] not in tag_strings:
        tag_strings.append(value['m'])

# Histogram of how many distinct tag strings each document has. Every document
# gets at least one entry (possibly an empty string), so the 0 bucket only
# serves as a sanity check.
distinct_per_document = [len(tag_strings) for tag_strings in documents_tags.values()]
for n_strings in range(6):
    noun = "tag" if n_strings == 1 else "tags"
    print(f"Documents with {n_strings} {noun}:", distinct_per_document.count(n_strings))
# This bucket is strictly greater than 5; the old "5+" label overlapped with
# the "5 tags" line above.
print("Documents with more than 5 tags:", sum(1 for n_strings in distinct_per_document if n_strings > 5))

Documents with 0 tags: 0
Documents with 1 tag: 2018
Documents with 2 tags: 804
Documents with 3 tags: 509
Documents with 4 tags: 447
Documents with 5 tags: 407
Documents with 5+ tags: 5936


In [8]:
# Label every document with the pathologies mentioned anywhere in its tag
# strings, print summary counts, and save the mapping as JSON.
# other_tags is deliberately left untouched here: this cell does not use it,
# and resetting it would wipe the list collected by the earlier counting cell.
pathologies = ['appendicitis', 'cholecystitis', 'diverticulitis', 'pancreatitis']

# One searchable string per document: its distinct tag strings, each followed
# by a space. documents_tags itself is not modified.
documents_tags_cp = {
    document_id: "".join(tag_string + " " for tag_string in tag_strings)
    for document_id, tag_strings in documents_tags.items()
}

# Plain substring match of each pathology name against the joined tag text
documents = {
    document_id: [pathology for pathology in pathologies if pathology in joined_tags]
    for document_id, joined_tags in documents_tags_cp.items()
}

# Sort documents by the number of matched pathologies, then by document id
documents = dict(sorted(documents.items(), key=lambda item: (len(item[1]), item[0])))

# This total covers every document, including those with an empty label list
print(f"Total documents with tags: {len(documents)}")

# How many documents matched exactly 0, 1, 2, 3 or 4 pathologies
labels_per_document = [len(v) for v in documents.values()]
for n_labels, wording in enumerate(["no tags", "one tag", "two tags", "three tags", "four tags"]):
    print(f"Documents with {wording}: {labels_per_document.count(n_labels)}")

# Total number of (document, pathology) labels
total_single_tags = sum(labels_per_document)
print(f"Total single tags: {total_single_tags}")

# Number of documents carrying each pathology label
total_appendicitis = sum(1 for v in documents.values() if 'appendicitis' in v)
print(f"Total appendicitis tags: {total_appendicitis}")
total_cholecystitis = sum(1 for v in documents.values() if 'cholecystitis' in v)
print(f"Total cholecystitis tags: {total_cholecystitis}")
total_diverticulitis = sum(1 for v in documents.values() if 'diverticulitis' in v)
print(f"Total diverticulitis tags: {total_diverticulitis}")
total_pancreatitis = sum(1 for v in documents.values() if 'pancreatitis' in v)
print(f"Total pancreatitis tags: {total_pancreatitis}")

SAVE_PATH = r"D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset\_docs_tags_detailed.json"
with open(SAVE_PATH, "w") as f:
    json.dump(documents, f, indent=4)
print(f"Saved counts to {SAVE_PATH}")

Total documents with tags: 10121
Documents with no tags: 710
Documents with one tag: 2940
Documents with two tags: 2603
Documents with three tags: 2462
Documents with four tags: 1406
Total single tags: 21156
Total appendicitis tags: 5707
Total cholecystitis tags: 4108
Total diverticulitis tags: 2979
Total pancreatitis tags: 8362
Saved counts to D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset\_docs_tags_detailed.json


In [10]:
# Same labelling as for documents, but per chunk: map every chunk id to the
# pathologies whose name occurs in that chunk's tag string
chunks_tags = {
    chunk_id: [pathology for pathology in pathologies if pathology in record['m']]
    for chunk_id, record in chunks.items()
}

# Order the entries by number of matched pathologies, then by chunk id
chunks_tags = dict(sorted(chunks_tags.items(), key=lambda entry: (len(entry[1]), entry[0])))

# Persist the per-chunk labels
SAVE_PATH = r"D:\Users\orosh\Documents\Research\Datasets\guidelines\MedCPT\acute_abdpain_subset\_chunks_tags.json"
with open(SAVE_PATH, "w") as out_file:
    json.dump(chunks_tags, out_file, indent=4)

print(f"Total chunks with tags: {len(chunks_tags)}")

# Number of chunks that matched exactly 0, 1, 2, 3 or 4 pathologies
matches_per_chunk = [len(matched) for matched in chunks_tags.values()]
for n_matches, wording in enumerate(["no tags", "one tag", "two tags", "three tags", "four tags"]):
    print(f"Chunks with {wording}: {matches_per_chunk.count(n_matches)}")

# Total number of (chunk, pathology) labels
total_single_tags = sum(matches_per_chunk)
print(f"Total single tags in chunks: {total_single_tags}")

# Number of chunks carrying each pathology label
total_appendicitis = sum('appendicitis' in matched for matched in chunks_tags.values())
print(f"Total appendicitis tags in chunks: {total_appendicitis}")
total_cholecystitis = sum('cholecystitis' in matched for matched in chunks_tags.values())
print(f"Total cholecystitis tags in chunks: {total_cholecystitis}")
total_diverticulitis = sum('diverticulitis' in matched for matched in chunks_tags.values())
print(f"Total diverticulitis tags in chunks: {total_diverticulitis}")
total_pancreatitis = sum('pancreatitis' in matched for matched in chunks_tags.values())
print(f"Total pancreatitis tags in chunks: {total_pancreatitis}")


Total chunks with tags: 124854
Chunks with no tags: 49560
Chunks with one tag: 73842
Chunks with two tags: 1334
Chunks with three tags: 100
Chunks with four tags: 18
Total single tags in chunks: 76882
Total appendicitis tags in chunks: 15448
Total cholecystitis tags in chunks: 10649
Total diverticulitis tags in chunks: 4938
Total pancreatitis tags in chunks: 45847


In [21]:
# Print ten randomly chosen tag strings (drawn with replacement) from other_tags
import random
print("Ten random tags from other tags:")
if other_tags:
    for _ in range(10):
        print(random.choice(other_tags))
else:
    # random.choice raises IndexError on an empty sequence, so report the
    # empty case instead of crashing the cell
    print("(other_tags is empty - re-run the cell that populates it)")

Ten random tags from other tags:
adolescent!|biomarkers!|biomarkers!urine|child!|child, preschool!|female!|humans!|hypercalciuria!|hypercalciuria!diagnosis|hypercalciuria!epidemiology|hypercalciuria!epidemiology*|hyperoxaluria!|hyperoxaluria!diagnosis|hyperoxaluria!epidemiology|hyperoxaluria!epidemiology*|infant!|jordan!|jordan!epidemiology|male!|predictive value of tests!|prognosis!|retrospective studies!|risk assessment!|risk factors!|up-regulation!|uric acid!|uric acid!urine|uric acid!urine*|
adolescent!|adult!|aged!|aged, 80 and over!|appendix!|appendix!diagnostic imaging|appendix!diagnostic imaging*|female!|humans!|male!|middle aged!|prospective studies!|reference values!|ultrasonography!|

abdomen!|coagulants!|coagulants!therapeutic use|coagulants!therapeutic use*|factor viia!|factor viia!genetics|factor viia!therapeutic use|factor viia!therapeutic use*|fatal outcome!|humans!|male!|middle aged!|postoperative hemorrhage!|postoperative hemorrhage!diagnostic imaging|postoperative he