## Getting the Tags

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET

In [2]:
unique_tags = []
directory = "C:/Users/Rahul Sai/Desktop/Capstone/medical-captioning/dataset-reports"

In [3]:
for filename in tqdm(os.listdir(directory)):
    if filename.endswith(".xml"):
        f = directory + '/' + filename
        tree = ET.parse(f)
        root = tree.getroot()

        for child in root:
          if child.tag == 'MeSH':
            for attr in child:
              if attr.tag == "automatic":
                text = attr.text.split(",")
                unique_tags.append(text[0].strip().lower())

100%|████████████████████████████████████████████████████████████████████████████| 3955/3955 [00:01<00:00, 3073.03it/s]


In [4]:
len(unique_tags)

7047

In [5]:
unique_tags_set = list(set(unique_tags))
unique_tags_set.sort()
unique_tags_set.insert(0,"normal")

## Cleaning dataset

In [6]:
rows = []

In [7]:
def getRow(filename):
    row = []
    f = directory + '/' + filename
    tree = ET.parse(f)
    root = tree.getroot()
    index_dict = dict((value, idx) for idx,value in enumerate(unique_tags_set))

    for p_image in root.findall('parentImage'):
        row.append(p_image.get('id'))

    if len(row) != 2:
        row.append("")
    
    if len(row) > 2:
        row = row[0:2]

    for child in root:
        if child.tag == 'MeSH':
            tags = []
            enc = []
            for attr in child:
                if attr.tag == "major":
                    if attr.text.lower() == "normal":
                        tags.append("normal")
                if attr.tag == "automatic":
                    text = attr.text.split(",")
                    tags.append(text[0].strip().lower())
            tags_int = [index_dict[x] for x in tags]
            for i in range(len(unique_tags_set)):
                if i in tags_int:
                    enc.append(1)
                else:
                    enc.append(0)
    row.extend(enc)
    return row

In [8]:
for filename in tqdm(os.listdir(directory)):
    if filename.endswith(".xml"):
        rows.append(getRow(filename))

100%|████████████████████████████████████████████████████████████████████████████| 3955/3955 [00:02<00:00, 1641.62it/s]


In [9]:
unique_tags_set.insert(0,"img_1")
unique_tags_set.insert(1,"img_2")

In [10]:
df = pd.DataFrame(rows, columns=unique_tags_set)

In [11]:
df

Unnamed: 0,img_1,img_2,normal,abdomen,abdominal surgery,absence of right pulmonary artery,absorptiometry,acromioclavicular separation,acute pneumonia,adenopathy,...,venous congestion,venous engorgement,venous hypertension,ventricular hypertrophy,vertebral fracture,vertebroplasty,viral bronchiolitis,viral pneumonias,volume overload,wounds
0,CXR1_1_IM-0001-3001,CXR1_1_IM-0001-4001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,CXR10_IM-0002-1001,CXR10_IM-0002-2001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,CXR100_IM-0002-1001,CXR100_IM-0002-2001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,CXR1000_IM-0003-1001,CXR1000_IM-0003-2001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,CXR1001_IM-0004-1001,CXR1001_IM-0004-1002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3950,CXR995_IM-2478-1001,CXR995_IM-2478-1002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3951,CXR996_IM-2479-1001,CXR996_IM-2479-2001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3952,CXR997_IM-2479-1001,CXR997_IM-2479-2001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3953,CXR998_IM-2479-1001,CXR998_IM-2479-2001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [12]:
df.to_csv("labels.csv")