# Automatic Diagnosis Generation from Chest X-rays: Data Preparation

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span><ul class="toc-item"><li><span><a href="#Load-Text-Data" data-toc-modified-id="Load-Text-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Text Data</a></span></li><li><span><a href="#Load-Image-Data" data-toc-modified-id="Load-Image-Data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load Image Data</a></span></li></ul></li><li><span><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preprocessing</a></span></li><li><span><a href="#Conclusions" data-toc-modified-id="Conclusions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Conclusions</a></span></li><li><span><a href="#References" data-toc-modified-id="References-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>References</a></span></li></ul></div>

In [None]:
# mount Google Drive so that the dataset folder is reachable under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')
# root folder of the dataset on the mounted drive
drive_path = '/content/gdrive/My Drive/Assignments_Drive/Case_Study_2/Medical_Data'
# txt_path: folder listed by get_reports() and read by get_dataframe() (the xml reports)
txt_path = f'{drive_path}/ecgen'
# img_path: presumably the folder holding the x-ray image files (not read in this part of the notebook)
img_path = f'{drive_path}/images'

Mounted at /content/gdrive


In [None]:
# settings
# `display` is imported explicitly instead of relying on the notebook
# front-end injecting it as a global
from IPython.display import Javascript, display
# ask the notebook front-end to run every cell below this one
display(Javascript('IPython.notebook.execute_cells_below()'))
# silence all python warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

<IPython.core.display.Javascript object>

In [None]:
#!unzip '/content/gdrive/My Drive/Assignments_Drive/Case_Study_2/Medical_Data/data2_orient.csv.zip' -d '/content/gdrive/My Drive/Assignments_Drive/Case_Study_2/Medical_Data/'

In [None]:
# imports
#!pip install tensorflow-gpu==2.3
#!pip install scikit-learn==0.20.4
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.txt.gz"

# from tensorflow.python.framework import ops
# ops.disable_eager_execution()

import os
from os import listdir
import io
import time
import re
import random
import pandas as pd
import numpy as np
from numpy import zeros
from numpy import array
from numpy import asarray
from numpy import save
from bs4 import BeautifulSoup
from tqdm import tqdm
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from PIL import Image
from pickle import dump
from pickle import load 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
print(sklearn.__version__)

import tensorflow 
print(tensorflow.__version__)
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.densenet import DenseNet121
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1
from tensorflow.keras.backend import categorical_crossentropy

# setting the random seeds
SEED = 4
# NOTE(review): hash randomisation is fixed at interpreter start-up, so this
# presumably only reaches processes started after this point -- confirm
os.environ['PYTHONHASHSEED']=str(SEED)
# os.environ['TF_CUDNN_DETERMINISTIC'] = '4'  # new flag present in tf 2.0+
# python's own RNG has to be seeded too: get_reports() samples with random.sample
random.seed(SEED)
np.random.seed(SEED)
tensorflow.random.set_seed(SEED)
# categorical cross-entropy loss object
cce = tensorflow.keras.losses.CategoricalCrossentropy()


0.22.2.post1
2.3.0


## Load Data

### Load Text Data

In [None]:
# list the report files, optionally only a random sample of them
def get_reports(k=0):
    """Return the file names found in ``txt_path``.

    With ``k == 0`` (the default) every file name is returned as listed by
    ``os.listdir``; otherwise ``k`` names are drawn with ``random.sample``.
    """
    all_files = os.listdir(txt_path)
    return random.sample(all_files, k) if k != 0 else all_files

In [None]:
# count the sentences of a report section
def _count_sentences(text):
    """Return the number of non-blank, '.'-separated fragments in *text*.

    Fragments made only of whitespace are ignored, so a missing or a
    doubled full stop (e.g. ``"No acute disease. ."``) does not inflate
    the count. An empty text has 0 sentences.
    """
    return sum(1 for fragment in text.split('.') if fragment.strip())


# get the dataframe
def get_dataframe(xml_files):
    """Read the given xml report files into one dataframe (one row per file).

    Parameters
    ----------
    xml_files : iterable of str
        File names relative to the global ``txt_path`` folder.

    Returns
    -------
    pandas.DataFrame
        Columns (each present only when the matching tag was found):
        1. UID : id attribute of the first ``uid`` tag of the file
        2. FINDINGS : text of the ``abstracttext`` tag labelled FINDINGS
        3. NS_FINDINGS : number of sentences in FINDINGS
        4. IMPRESSION : text of the ``abstracttext`` tag labelled IMPRESSION
        5. NS_IMPRESSION : number of sentences in IMPRESSION
        6. IMAGES : list with the id of every ``parentimage`` tag
        7. NO_IMAGES : number of entries in IMAGES
    """
    # one dict {column: value} per xml file
    rows_list = []
    for xmlfile in tqdm(xml_files):
        filepath = txt_path + '/' + xmlfile
        with open(filepath, "r") as f:
            contents = f.read()
        # the file is parsed with the 'lxml' parser; tags are looked up by
        # their lower-case names below
        soup = BeautifulSoup(contents, 'lxml')
        dict_row = {}

        # the first uid tag identifies the report; more than one is only reported
        uid_tags = soup.findAll('uid')
        if uid_tags:
            dict_row['UID'] = uid_tags[0].get('id')
        if len(uid_tags) > 1:
            print('more than one ids')

        # the report text lives in abstracttext tags, told apart by their label
        for t in soup.findAll(['abstracttext']):
            label = t.get("label")
            if label == 'FINDINGS':
                dict_row['FINDINGS'] = t.text
                dict_row['NS_FINDINGS'] = _count_sentences(t.text)
            elif label == 'IMPRESSION':
                dict_row['IMPRESSION'] = t.text
                dict_row['NS_IMPRESSION'] = _count_sentences(t.text)

        # a report can reference several images: collect the id of each one
        x_ray_images = [element.get('id') for element in soup.findAll(['parentimage'])]
        dict_row['IMAGES'] = x_ray_images
        dict_row['NO_IMAGES'] = len(x_ray_images)

        rows_list.append(dict_row)

    # finally create a dataframe from the rows_list
    df = pd.DataFrame(rows_list)
    return df
        

In [None]:
# this function keeps only the reports that can be used further on
def remove_reports(df):
    """Return the rows of *df* that pass all four filters and print their shape.

    A report is kept when its FINDINGS and its IMPRESSION are both non-empty
    strings, it has more than one image and its impression has fewer than
    3 sentences.
    """
    has_findings = df['FINDINGS'] != ''
    has_impression = df['IMPRESSION'] != ''
    several_images = df['NO_IMAGES'] > 1
    short_impression = df['NS_IMPRESSION'] < 3
    keep = has_findings & has_impression & several_images & short_impression
    df_final = df[keep]
    print(df_final.shape)
    return df_final

In [None]:
# get the reports 
# k defaults to 0, so every file of the report folder is used
reports = get_reports()
# parse every xml file into one dataframe row
reports_df = get_dataframe(reports)
# apply the filters (remove_reports already prints the resulting shape once)
final_reports_df = remove_reports(reports_df)
print(final_reports_df.shape)

100%|██████████| 3955/3955 [15:01<00:00,  4.39it/s]


(2467, 7)
(2467, 7)


In [None]:
# preview the first rows of the filtered reports
final_reports_df.head()

Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES
0,CXR3691,The heart is normal in size. The mediastinum i...,5,No acute disease.,1,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]",2
2,CXR3682,The lungs are hypoventilated. There is no foca...,4,No acute cardiopulmonary abnormality.,1,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]",2
4,CXR3685,Calcified thoracic aorta. Mild rightward devia...,6,No acute cardiopulmonary findings.,1,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]",2
5,CXR37,The heart is normal in size. The mediastinum i...,4,No acute disease.,1,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...",2
6,CXR3703,The XXXX examination consists of frontal and l...,6,No evidence of acute cardiopulmonary process.,1,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]",2


In [None]:
# keep only the reports with more than one x-ray associated to them
# (remove_reports already applied NO_IMAGES > 1, so no row is lost here)
data1 = final_reports_df[final_reports_df['NO_IMAGES'] > 1]

In [None]:
# preview the first rows of data1
data1.head()

Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES
0,CXR3691,The heart is normal in size. The mediastinum i...,5,No acute disease.,1,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]",2
2,CXR3682,The lungs are hypoventilated. There is no foca...,4,No acute cardiopulmonary abnormality.,1,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]",2
4,CXR3685,Calcified thoracic aorta. Mild rightward devia...,6,No acute cardiopulmonary findings.,1,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]",2
5,CXR37,The heart is normal in size. The mediastinum i...,4,No acute disease.,1,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...",2
6,CXR3703,The XXXX examination consists of frontal and l...,6,No evidence of acute cardiopulmonary process.,1,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]",2


In [None]:
# how many reports have 2, 3, 4, ... images
data1.NO_IMAGES.value_counts()

2    2330
3     127
4       9
5       1
Name: NO_IMAGES, dtype: int64

In [None]:
# get the reports that have exactly 2 images
data2 = final_reports_df[final_reports_df['NO_IMAGES'] == 2]

In [None]:
# check: only the value 2 should be left in NO_IMAGES
data2.NO_IMAGES.value_counts()

2    2330
Name: NO_IMAGES, dtype: int64

In [None]:
# get the reports that have exactly 3 images
data3 = final_reports_df[final_reports_df['NO_IMAGES'] == 3]

In [None]:
# list the UIDs of the 3-image reports
print(data3.UID.values)

['CXR3716' 'CXR372' 'CXR3809' 'CXR3797' 'CXR3819' 'CXR3858' 'CXR3930'
 'CXR3925' 'CXR3971' 'CXR3986' 'CXR42' 'CXR510' 'CXR519' 'CXR573' 'CXR587'
 'CXR62' 'CXR626' 'CXR663' 'CXR714' 'CXR712' 'CXR725' 'CXR751' 'CXR756'
 'CXR735' 'CXR771' 'CXR80' 'CXR863' 'CXR86' 'CXR850' 'CXR901' 'CXR896'
 'CXR932' 'CXR930' 'CXR999' 'CXR984' 'CXR974' 'CXR2840' 'CXR2829'
 'CXR2835' 'CXR282' 'CXR2796' 'CXR2858' 'CXR2890' 'CXR2930' 'CXR2936'
 'CXR2933' 'CXR2898' 'CXR2922' 'CXR2986' 'CXR304' 'CXR3076' 'CXR3108'
 'CXR3142' 'CXR3218' 'CXR3262' 'CXR3263' 'CXR3281' 'CXR3250' 'CXR3282'
 'CXR3275' 'CXR3283' 'CXR3324' 'CXR3332' 'CXR3362' 'CXR335' 'CXR3375'
 'CXR3390' 'CXR3495' 'CXR346' 'CXR3521' 'CXR3551' 'CXR3532' 'CXR3549'
 'CXR3637' 'CXR1919' 'CXR2041' 'CXR210' 'CXR2145' 'CXR2146' 'CXR220'
 'CXR2231' 'CXR2256' 'CXR227' 'CXR2321' 'CXR2336' 'CXR2338' 'CXR2398'
 'CXR2397' 'CXR2425' 'CXR2470' 'CXR2433' 'CXR2539' 'CXR251' 'CXR2594'
 'CXR2569' 'CXR261' 'CXR2611' 'CXR2680' 'CXR2727' 'CXR2745' 'CXR1007'
 'CXR1056' 'CXR1

In [None]:
# check: only the value 3 should be left in NO_IMAGES
data3.NO_IMAGES.value_counts()

3    127
Name: NO_IMAGES, dtype: int64

In [None]:
# get the reports that have exactly 4 images
data4 = final_reports_df[final_reports_df['NO_IMAGES'] == 4]

In [None]:
# list the UIDs of the 4-image reports
print(data4.UID.values)

['CXR3932' 'CXR3965' 'CXR846' 'CXR3307' 'CXR3359' 'CXR2097' 'CXR2243'
 'CXR2560' 'CXR1015']


In [None]:
# check: only the value 4 should be left in NO_IMAGES
data4.NO_IMAGES.value_counts()

4    9
Name: NO_IMAGES, dtype: int64

In [None]:
# get the reports that have exactly 5 images
data5 = final_reports_df[final_reports_df['NO_IMAGES'] == 5]

In [None]:
# check: only the value 5 should be left in NO_IMAGES
data5.NO_IMAGES.value_counts()

5    1
Name: NO_IMAGES, dtype: int64

__Now we have__
- 2330 reports with 2 images
- 127 reports with 3 images
- 9 reports with 4 images
- 1 report with 5 images

__We will use 2 images for each report. For reports with more than 2 images we will use the orientation csv file and create every pair made of one front and one lateral image.__


In [None]:
# column names of the paired dataframes: the report columns followed by
# the orientation list and the image pair
df_cols = data1.columns.tolist()
print(df_cols)
df_cols += ['ORIENT', 'IMAGE_PAIR']
print(df_cols)

['UID', 'FINDINGS', 'NS_FINDINGS', 'IMPRESSION', 'NS_IMPRESSION', 'IMAGES', 'NO_IMAGES']
['UID', 'FINDINGS', 'NS_FINDINGS', 'IMPRESSION', 'NS_IMPRESSION', 'IMAGES', 'NO_IMAGES', 'ORIENT', 'IMAGE_PAIR']


In [None]:
# load the csv with the image orientations of the 2-image reports
data2_orient = pd.read_csv(f'{drive_path}/csv_orientation/data2_orient.csv')
data2_orient_list = data2_orient.Image_Orientation.values.tolist()
data2_uid_list = data2_orient.UID.values.tolist()

# clean every raw entry: keep only its 'f' and 'l' characters, in order
orient2 = [[ch for ch in dat if ch in ('f', 'l')] for dat in data2_orient_list]

# new orientation dataframe: one row per UID with its cleaned orientation list
data2_orient_new = pd.DataFrame({'UID': data2_uid_list, 'ORIENT': orient2})

# attach the orientations to the reports by joining on UID
data2_with_orient = data2.merge(data2_orient_new, on='UID')

print(data2_with_orient.shape)
data2_with_orient.head()

(2330, 8)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,ORIENT
0,CXR3691,The heart is normal in size. The mediastinum i...,5,No acute disease.,1,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]",2,"[f, l]"
1,CXR3682,The lungs are hypoventilated. There is no foca...,4,No acute cardiopulmonary abnormality.,1,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]",2,"[f, l]"
2,CXR3685,Calcified thoracic aorta. Mild rightward devia...,6,No acute cardiopulmonary findings.,1,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]",2,"[f, l]"
3,CXR37,The heart is normal in size. The mediastinum i...,4,No acute disease.,1,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...",2,"[f, l]"
4,CXR3703,The XXXX examination consists of frontal and l...,6,No evidence of acute cardiopulmonary process.,1,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]",2,"[f, l]"


In [None]:
# Expand every report into one row per (front, lateral) image pair; the pair
# is stored in a new IMAGE_PAIR column.
data2 = data2_with_orient
# here we have 2 images for each report
data2_val = data2.values
print(data2_val.shape)

# look the columns up by name instead of relying on hard-coded positions
uid_col = data2.columns.get_loc('UID')
images_col = data2.columns.get_loc('IMAGES')
orient_col = data2.columns.get_loc('ORIENT')

# this will store the new rows: original row + [front, lateral] pair
return_arr = list()
# UIDs of the reports that yield no pair (they are left out of the result)
skipped_uids = list()
for val in data2_val:
    val = list(val)
    xrays = val[images_col]
    orientation = val[orient_col]
    pairs = list()
    # sanity check: one orientation label per image, otherwise the labels
    # cannot be matched to the images
    if len(xrays) == len(orientation):
        front_xrays = [x for x, o in zip(xrays, orientation) if o == 'f']
        lateral_xrays = [x for x, o in zip(xrays, orientation) if o == 'l']
        # every front image combined with every lateral image
        pairs = [[fxray, lxray] for fxray in front_xrays for lxray in lateral_xrays]
    if not pairs:
        skipped_uids.append(val[uid_col])
    for p in pairs:
        # a fresh list per pair: the original row with the pair appended
        return_arr.append(val + [p])

# make the dropped reports visible instead of losing them silently
print('reports dropped (no usable front/lateral pair):', skipped_uids)

# create the dataframe 
data2_new = pd.DataFrame(return_arr, columns=df_cols)
print('shape of new dataframe is', data2_new.shape)
data2_new.drop(columns=['ORIENT'], inplace=True)
data2_new.head()

(2330, 8)
shape of new dataframe is (2326, 9)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,IMAGE_PAIR
0,CXR3691,The heart is normal in size. The mediastinum i...,5,No acute disease.,1,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]",2,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]"
1,CXR3682,The lungs are hypoventilated. There is no foca...,4,No acute cardiopulmonary abnormality.,1,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]",2,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]"
2,CXR3685,Calcified thoracic aorta. Mild rightward devia...,6,No acute cardiopulmonary findings.,1,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]",2,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]"
3,CXR37,The heart is normal in size. The mediastinum i...,4,No acute disease.,1,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...",2,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0..."
4,CXR3703,The XXXX examination consists of frontal and l...,6,No evidence of acute cardiopulmonary process.,1,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]",2,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]"


In [None]:
# column names of the paired dataframes: the report columns followed by
# the orientation list and the image pair
df_cols = data1.columns.tolist()
print(df_cols)
df_cols += ['ORIENT', 'IMAGE_PAIR']
print(df_cols)

['UID', 'FINDINGS', 'NS_FINDINGS', 'IMPRESSION', 'NS_IMPRESSION', 'IMAGES', 'NO_IMAGES']
['UID', 'FINDINGS', 'NS_FINDINGS', 'IMPRESSION', 'NS_IMPRESSION', 'IMAGES', 'NO_IMAGES', 'ORIENT', 'IMAGE_PAIR']


In [None]:
# load the csv with the image orientations of the 3-image reports
data3_orient = pd.read_csv(f'{drive_path}/csv_orientation/data3_orient.csv')
data3_orient_list = data3_orient.Image_Orientation.values.tolist()
data3_uid_list = data3_orient.UID.values.tolist()

# clean every raw entry: keep only its 'f' and 'l' characters, in order
orient3 = [[ch for ch in dat if ch in ('f', 'l')] for dat in data3_orient_list]

# new orientation dataframe: one row per UID with its cleaned orientation list
data3_orient_new = pd.DataFrame({'UID': data3_uid_list, 'ORIENT': orient3})

# attach the orientations to the reports by joining on UID
data3_with_orient = data3.merge(data3_orient_new, on='UID')

print(data3_with_orient.shape)
data3_with_orient.head()

(127, 8)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,ORIENT
0,CXR3716,The cardiomediastinal silhouette is normal in ...,4,Negative for acute abnormality.,1,"[CXR3716_IM-1856-1001, CXR3716_IM-1856-2001, C...",3,"[f, l, f]"
1,CXR372,Left retrocardiac airspace disease with blunte...,3,Left retrocardiac airspace disease could refle...,2,"[CXR372_IM-1858-0001-0001, CXR372_IM-1858-0001...",3,"[l, l, f]"
2,CXR3809,Cardiac and mediastinal contours are within no...,3,Negative chest x-XXXX.,1,"[CXR3809_IM-1919-1002001, CXR3809_IM-1919-1003...",3,"[f, l]"
3,CXR3797,XXXX XXXX and lateral chest examination was ob...,4,1. Left lower lobe air space opacities without...,2,"[CXR3797_IM-1910-0001-0001, CXR3797_IM-1910-00...",3,"[l, f, l]"
4,CXR3819,Frontal (on two cassettes) and lateral views o...,4,Continued severe cardiomegaly and/or pericardi...,2,"[CXR3819_IM-1926-1001, CXR3819_IM-1926-2001, C...",3,"[f, l, f]"


In [None]:
# Expand every report into one row per (front, lateral) image pair; the pair
# is stored in a new IMAGE_PAIR column.
data3 = data3_with_orient
# here we have 3 images for each report
data3_val = data3.values
print(data3_val.shape)

# look the columns up by name instead of relying on hard-coded positions
uid_col = data3.columns.get_loc('UID')
images_col = data3.columns.get_loc('IMAGES')
orient_col = data3.columns.get_loc('ORIENT')

# this will store the new rows: original row + [front, lateral] pair
return_arr = list()
# UIDs of the reports that yield no pair (they are left out of the result)
skipped_uids = list()
for val in data3_val:
    val = list(val)
    xrays = val[images_col]
    orientation = val[orient_col]
    pairs = list()
    # sanity check: one orientation label per image, otherwise the labels
    # cannot be matched to the images
    if len(xrays) == len(orientation):
        front_xrays = [x for x, o in zip(xrays, orientation) if o == 'f']
        lateral_xrays = [x for x, o in zip(xrays, orientation) if o == 'l']
        # every front image combined with every lateral image
        pairs = [[fxray, lxray] for fxray in front_xrays for lxray in lateral_xrays]
    if not pairs:
        skipped_uids.append(val[uid_col])
    for p in pairs:
        # a fresh list per pair: the original row with the pair appended
        return_arr.append(val + [p])

# make the dropped reports visible instead of losing them silently
print('reports dropped (no usable front/lateral pair):', skipped_uids)

# create the dataframe 
data3_new = pd.DataFrame(return_arr, columns=df_cols)
print('shape of new dataframe is', data3_new.shape)
data3_new.drop(columns=['ORIENT'], inplace=True)
data3_new.head()

(127, 8)
shape of new dataframe is (252, 9)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,IMAGE_PAIR
0,CXR3716,The cardiomediastinal silhouette is normal in ...,4,Negative for acute abnormality.,1,"[CXR3716_IM-1856-1001, CXR3716_IM-1856-2001, C...",3,"[CXR3716_IM-1856-1001, CXR3716_IM-1856-2001]"
1,CXR3716,The cardiomediastinal silhouette is normal in ...,4,Negative for acute abnormality.,1,"[CXR3716_IM-1856-1001, CXR3716_IM-1856-2001, C...",3,"[CXR3716_IM-1856-3001, CXR3716_IM-1856-2001]"
2,CXR372,Left retrocardiac airspace disease with blunte...,3,Left retrocardiac airspace disease could refle...,2,"[CXR372_IM-1858-0001-0001, CXR372_IM-1858-0001...",3,"[CXR372_IM-1858-4004, CXR372_IM-1858-0001-0001]"
3,CXR372,Left retrocardiac airspace disease with blunte...,3,Left retrocardiac airspace disease could refle...,2,"[CXR372_IM-1858-0001-0001, CXR372_IM-1858-0001...",3,"[CXR372_IM-1858-4004, CXR372_IM-1858-0001-0002]"
4,CXR3797,XXXX XXXX and lateral chest examination was ob...,4,1. Left lower lobe air space opacities without...,2,"[CXR3797_IM-1910-0001-0001, CXR3797_IM-1910-00...",3,"[CXR3797_IM-1910-0001-0002, CXR3797_IM-1910-00..."


In [None]:
# load the csv with the image orientations of the 4-image reports
data4_orient = pd.read_csv(f'{drive_path}/csv_orientation/data4_orient.csv')
data4_orient_list = data4_orient.Image_Orientation.values.tolist()
data4_uid_list = data4_orient.UID.values.tolist()

# clean every raw entry: keep only its 'f' and 'l' characters, in order
orient4 = [[ch for ch in dat if ch in ('f', 'l')] for dat in data4_orient_list]

# new orientation dataframe: one row per UID with its cleaned orientation list
data4_orient_new = pd.DataFrame({'UID': data4_uid_list, 'ORIENT': orient4})

# attach the orientations to the reports by joining on UID
data4_with_orient = data4.merge(data4_orient_new, on='UID')

print(data4_with_orient.shape)
data4_with_orient.head()

(9, 8)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,ORIENT
0,CXR3932,The cardiac silhouette mediastinal contours ar...,4,No acute cardiopulmonary disease.,1,"[CXR3932_IM-2004-1002, CXR3932_IM-2004-1003, C...",4,"[f, f, l, f]"
1,CXR3965,The heart and lungs have XXXX XXXX in the inte...,3,No active disease.,1,"[CXR3965_IM-2028-1001-0001, CXR3965_IM-2028-10...",4,"[f, f, l, f]"
2,CXR846,Heart size and pulmonary vascularity appears n...,5,No evidence of active disease.,1,"[CXR846_IM-2368-0001-0001, CXR846_IM-2368-0001...",4,"[f, f, l, l]"
3,CXR3307,The cardiomediastinal silhouette is normal siz...,4,No acute cardiopulmonary disease. .,2,"[CXR3307_IM-1582-1002001, CXR3307_IM-1582-1003...",4,"[f, f, f, l]"
4,CXR3359,Heart size normal. No focal airspace disease. ...,3,No acute cardiopulmonary findings.,1,"[CXR3359_IM-1612-2001, CXR3359_IM-1612-3001, C...",4,"[l, f, f, l]"


In [None]:
# Expand every report into one row per (front, lateral) image pair; the pair
# is stored in a new IMAGE_PAIR column.
data4 = data4_with_orient
# here we have 4 images for each report
data4_val = data4.values
print(data4_val.shape)

# look the columns up by name instead of relying on hard-coded positions
uid_col = data4.columns.get_loc('UID')
images_col = data4.columns.get_loc('IMAGES')
orient_col = data4.columns.get_loc('ORIENT')

# this will store the new rows: original row + [front, lateral] pair
return_arr = list()
# UIDs of the reports that yield no pair (they are left out of the result)
skipped_uids = list()
for val in data4_val:
    val = list(val)
    xrays = val[images_col]
    orientation = val[orient_col]
    pairs = list()
    # sanity check: one orientation label per image, otherwise the labels
    # cannot be matched to the images
    if len(xrays) == len(orientation):
        front_xrays = [x for x, o in zip(xrays, orientation) if o == 'f']
        lateral_xrays = [x for x, o in zip(xrays, orientation) if o == 'l']
        # every front image combined with every lateral image
        pairs = [[fxray, lxray] for fxray in front_xrays for lxray in lateral_xrays]
    if not pairs:
        skipped_uids.append(val[uid_col])
    for p in pairs:
        # a fresh list per pair: the original row with the pair appended
        return_arr.append(val + [p])

# make the dropped reports visible instead of losing them silently
print('reports dropped (no usable front/lateral pair):', skipped_uids)

# create the dataframe 
data4_new = pd.DataFrame(return_arr, columns=df_cols)
print('shape of new dataframe is', data4_new.shape)
data4_new.drop(columns=['ORIENT'], inplace=True)
data4_new.head()

(9, 8)
shape of new dataframe is (28, 9)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,IMAGE_PAIR
0,CXR3932,The cardiac silhouette mediastinal contours ar...,4,No acute cardiopulmonary disease.,1,"[CXR3932_IM-2004-1002, CXR3932_IM-2004-1003, C...",4,"[CXR3932_IM-2004-1002, CXR3932_IM-2004-1004]"
1,CXR3932,The cardiac silhouette mediastinal contours ar...,4,No acute cardiopulmonary disease.,1,"[CXR3932_IM-2004-1002, CXR3932_IM-2004-1003, C...",4,"[CXR3932_IM-2004-1003, CXR3932_IM-2004-1004]"
2,CXR3932,The cardiac silhouette mediastinal contours ar...,4,No acute cardiopulmonary disease.,1,"[CXR3932_IM-2004-1002, CXR3932_IM-2004-1003, C...",4,"[CXR3932_IM-2004-1005, CXR3932_IM-2004-1004]"
3,CXR3965,The heart and lungs have XXXX XXXX in the inte...,3,No active disease.,1,"[CXR3965_IM-2028-1001-0001, CXR3965_IM-2028-10...",4,"[CXR3965_IM-2028-1001-0001, CXR3965_IM-2028-2001]"
4,CXR3965,The heart and lungs have XXXX XXXX in the inte...,3,No active disease.,1,"[CXR3965_IM-2028-1001-0001, CXR3965_IM-2028-10...",4,"[CXR3965_IM-2028-1001-0002, CXR3965_IM-2028-2001]"


In [None]:
# load the csv with the image orientations of the 5-image reports
data5_orient = pd.read_csv(f'{drive_path}/csv_orientation/data5_orient.csv')
data5_orient_list = data5_orient.Image_Orientation.values.tolist()
data5_uid_list = data5_orient.UID.values.tolist()

# clean every raw entry: keep only its 'f' and 'l' characters, in order
orient5 = [[ch for ch in dat if ch in ('f', 'l')] for dat in data5_orient_list]

# new orientation dataframe: one row per UID with its cleaned orientation list
data5_orient_new = pd.DataFrame({'UID': data5_uid_list, 'ORIENT': orient5})

# attach the orientations to the reports by joining on UID
data5_with_orient = data5.merge(data5_orient_new, on='UID')

print(data5_with_orient.shape)
data5_with_orient.head()

(1, 8)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,ORIENT
0,CXR1303,"In the interval, a 3 cm uncalcified mass has d...",7,"XXXX right upper lobe mass, suspicious for neo...",2,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-10...",5,"[f, f, f, l, f]"


In [None]:
# Expand every report into one row per (front, lateral) image pair; the pair
# is stored in a new IMAGE_PAIR column.
data5 = data5_with_orient
# here we have 5 images for each report
data5_val = data5.values
print(data5_val.shape)

# look the columns up by name instead of relying on hard-coded positions
uid_col = data5.columns.get_loc('UID')
images_col = data5.columns.get_loc('IMAGES')
orient_col = data5.columns.get_loc('ORIENT')

# this will store the new rows: original row + [front, lateral] pair
return_arr = list()
# UIDs of the reports that yield no pair (they are left out of the result)
skipped_uids = list()
for val in data5_val:
    val = list(val)
    xrays = val[images_col]
    orientation = val[orient_col]
    pairs = list()
    # sanity check: one orientation label per image, otherwise the labels
    # cannot be matched to the images
    if len(xrays) == len(orientation):
        front_xrays = [x for x, o in zip(xrays, orientation) if o == 'f']
        lateral_xrays = [x for x, o in zip(xrays, orientation) if o == 'l']
        # every front image combined with every lateral image
        pairs = [[fxray, lxray] for fxray in front_xrays for lxray in lateral_xrays]
    if not pairs:
        skipped_uids.append(val[uid_col])
    for p in pairs:
        # a fresh list per pair: the original row with the pair appended
        return_arr.append(val + [p])

# make the dropped reports visible instead of losing them silently
print('reports dropped (no usable front/lateral pair):', skipped_uids)

# create the dataframe 
data5_new = pd.DataFrame(return_arr, columns=df_cols)
print('shape of new dataframe is', data5_new.shape)
data5_new.drop(columns=['ORIENT'], inplace=True)
data5_new.head()

(1, 8)
shape of new dataframe is (4, 9)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,IMAGE_PAIR
0,CXR1303,"In the interval, a 3 cm uncalcified mass has d...",7,"XXXX right upper lobe mass, suspicious for neo...",2,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-10...",5,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-20..."
1,CXR1303,"In the interval, a 3 cm uncalcified mass has d...",7,"XXXX right upper lobe mass, suspicious for neo...",2,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-10...",5,"[CXR1303_IM-0199-1001-0002, CXR1303_IM-0199-20..."
2,CXR1303,"In the interval, a 3 cm uncalcified mass has d...",7,"XXXX right upper lobe mass, suspicious for neo...",2,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-10...",5,"[CXR1303_IM-0199-2001-0001, CXR1303_IM-0199-20..."
3,CXR1303,"In the interval, a 3 cm uncalcified mass has d...",7,"XXXX right upper lobe mass, suspicious for neo...",2,"[CXR1303_IM-0199-1001-0001, CXR1303_IM-0199-10...",5,"[CXR1303_IM-0199-2001-0003, CXR1303_IM-0199-20..."


In [None]:
# Stack the per-group dataframes (data2 .. data5) into a single dataframe.
# NOTE: pd.concat is called without ignore_index, so each frame keeps its own
# row labels and labels may repeat in data_new.

frames = [data2_new, data3_new, data4_new, data5_new]
data_new = pd.concat(frames)
print(data_new.shape)
data_new.head()

(2610, 8)


Unnamed: 0,UID,FINDINGS,NS_FINDINGS,IMPRESSION,NS_IMPRESSION,IMAGES,NO_IMAGES,IMAGE_PAIR
0,CXR3691,The heart is normal in size. The mediastinum i...,5,No acute disease.,1,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]",2,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]"
1,CXR3682,The lungs are hypoventilated. There is no foca...,4,No acute cardiopulmonary abnormality.,1,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]",2,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]"
2,CXR3685,Calcified thoracic aorta. Mild rightward devia...,6,No acute cardiopulmonary findings.,1,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]",2,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]"
3,CXR37,The heart is normal in size. The mediastinum i...,4,No acute disease.,1,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...",2,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0..."
4,CXR3703,The XXXX examination consists of frontal and l...,6,No evidence of acute cardiopulmonary process.,1,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]",2,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]"


In [None]:
# Drop the columns that are no longer needed ('NO_IMAGES', the full 'IMAGES'
# list, 'NS_FINDINGS' and 'NS_IMPRESSION'), then expose the image pair under
# the name 'IMAGES'.
data_pair = data_new.drop(
    columns=['NO_IMAGES', 'IMAGES', 'NS_FINDINGS', 'NS_IMPRESSION'])

# 'IMAGE_PAIR' becomes the new 'IMAGES' column
data = data_pair.rename(columns={'IMAGE_PAIR': 'IMAGES'})
print(data.shape)
data.head()

(2610, 4)


Unnamed: 0,UID,FINDINGS,IMPRESSION,IMAGES
0,CXR3691,The heart is normal in size. The mediastinum i...,No acute disease.,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]"
1,CXR3682,The lungs are hypoventilated. There is no foca...,No acute cardiopulmonary abnormality.,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]"
2,CXR3685,Calcified thoracic aorta. Mild rightward devia...,No acute cardiopulmonary findings.,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]"
3,CXR37,The heart is normal in size. The mediastinum i...,No acute disease.,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0..."
4,CXR3703,The XXXX examination consists of frontal and l...,No evidence of acute cardiopulmonary process.,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]"


In [None]:
# sanity check for image pairs: the value at position 3 of every row must
# hold exactly two entries; print the first offending row (if any) and stop
values = data.values
offending = next((row for row in values if len(row[3]) != 2), None)
if offending is not None:
    print(offending)

### Load Image Data

In [None]:
def extract_features(directory, arch):
    """Extract a CNN feature tensor for every PNG x-ray in ``directory``.

    Parameters
    ----------
    directory : str
        Folder that contains the x-ray image files.
    arch : str
        Feature extractor to use, either ``'CheXnet'`` or ``'VGG16'``.

        * ``'CheXnet'``: a DenseNet121 (average pooled, no top) with a
          14-unit sigmoid head is built, ``CheXNet_weights.h5`` is loaded
          from ``drive_path`` and the model is then cut at ``layers[-2]``.
          The recorded run of this notebook printed a feature shape of
          ``(1, 1024)`` for this architecture.
        * ``'VGG16'``: the stock ``VGG16()`` model cut at ``layers[-5]``.

    Returns
    -------
    dict
        Maps the image id (the file name up to its first ``'.'``) to the
        model output for that image. The values are whatever ``model(...)``
        returns; they are not converted to numpy arrays.

    Raises
    ------
    ValueError
        If ``arch`` is not one of the supported architectures. The check
        runs before any file or model is touched.
    """
    supported_archs = ('CheXnet', 'VGG16')
    # Fail fast: previously an unknown arch (including the half-implemented
    # 'InceptionV3' branch) only crashed later with an UnboundLocalError.
    if arch not in supported_archs:
        raise ValueError(
            'Unsupported arch %r; expected one of %r' % (arch, supported_archs))

    # get the files
    files = listdir(directory)

    # load the model
    if arch == 'CheXnet':
        from tensorflow.keras.applications.densenet import preprocess_input
        print('Model is ', arch)
        # rebuild the classifier so the weight file can be loaded, then
        # drop the prediction head again
        input_shape = (224, 224, 3)
        img_input = Input(shape=input_shape)
        base_model = DenseNet121(
            include_top=False,
            input_tensor=img_input,
            input_shape=input_shape,
            pooling="avg")
        x = base_model.output
        predictions = Dense(14, activation="sigmoid", name="predictions")(x)
        model = Model(inputs=img_input, outputs=predictions)
        model.load_weights(drive_path + '/CheXNet_weights.h5')
        # layers[-2] is the layer feeding the 14-unit prediction head
        model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    else:
        from tensorflow.keras.applications.vgg16 import preprocess_input
        print('Model is ', arch)
        model = VGG16()
        # NOTE(review): the original comment claimed a final shape of
        # (1, 4096), but layers[-5] of the stock VGG16 looks like the last
        # pooling layer rather than a dense layer - confirm the cut index.
        model = Model(inputs=model.inputs, outputs=model.layers[-5].output)

    # summarize the model (summary() prints by itself)
    model.summary()

    # extract features from each image
    features = dict()
    feature = None
    for name in tqdm(files):
        # split "<image_id>.<extension>" at the first dot; a name without a
        # dot yields an empty extension instead of raising IndexError
        image_id, _, file_type = name.partition('.')
        # only plain "<image_id>.png" files are processed
        if file_type != 'png':
            continue
        filename = directory + '/' + name
        # both supported models take 224 x 224 inputs
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add the leading batch dimension expected by the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # architecture-specific input scaling
        image = preprocess_input(image)
        # forward pass in inference mode
        feature = model(image, training=False)
        # store in the dict
        features[image_id] = feature

    # validation: report the shape of the last extracted feature, if any
    if feature is not None:
        print(feature.shape)
    return features


In [None]:
# extract CheXnet features from all images under img_path;
# the resulting dict (image id -> feature) is kept at notebook level and is
# looked up by image id in get_image_features
image_features = extract_features(img_path, 'CheXnet')

Model is  CheXnet
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5


  0%|          | 0/7472 [00:00<?, ?it/s]

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d[0][0]             
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
_______________________________________________________________________________________

100%|██████████| 7472/7472 [1:02:50<00:00,  1.98it/s]

(1, 1024)





## Data Preprocessing

In [None]:
def unicode_to_ascii(s):
    """Return ``s`` with accents removed.

    The string is NFD-normalised and every character whose Unicode category
    is 'Mn' (combining mark) is dropped. Characters that carry no combining
    mark are returned unchanged, even when they are not ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def preprocess_sentence(w):
    """Normalise one report sentence and wrap it in start/end tokens.

    Steps, in order: lower-case and strip, remove accents, expand English
    contractions, drop escaped control sequences, remove the anonymisation
    placeholder (runs of two or more ``x`` that stand alone, e.g. ``XXXX``),
    remove ``_`` and ``-``, and finally replace every run of characters
    other than letters, ``?``, ``!``, ``,`` and ``¿`` with a single space.
    Note that ``.`` and digits are therefore removed.

    Parameters
    ----------
    w : str
        Raw sentence.

    Returns
    -------
    str
        ``'<start> ' + cleaned + ' <end>'``.
    """
    w = unicode_to_ascii(w.lower().strip())

    # expand contractions; the specific forms must run before the general
    # ones, so the order of this table matters
    contractions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in contractions:
        w = re.sub(pattern, replacement, w)

    # collapse runs of double quotes / spaces into one space
    w = re.sub(r'[" "]+', " ", w)
    # literal backslash sequences (not real control characters)
    w = w.replace('\\r', ' ')
    w = w.replace('\\"', ' ')
    w = w.replace('\\n', ' ')
    # Remove only the anonymisation placeholder (e.g. "xxxx" after
    # lower-casing). The previous blanket removal of every "x" corrupted
    # real words ("examination" -> "eamination", "pneumothorax" ->
    # "pneumothora").
    w = re.sub(r"(?<![a-z])x{2,}(?![a-z])", "", w)
    w = w.replace('_', '')
    w = w.replace('-', '')
    w = w.replace('<br>', ' ')
    # replace everything except letters, "?", "!", "," and "¿" with a space
    w = re.sub(r"[^a-zA-Z?!,¿]+", " ", w)

    w = w.strip()

    # start / end tokens tell the model where a sequence begins and stops
    return '<start> ' + w + ' <end>'

In [None]:
def get_image_features(dataset, features=None):
    """Append the features of each row's first two images as new columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Report dataframe. The value at column position 3 of every row must
        be an iterable of image ids.
    features : mapping, optional
        Lookup ``image id -> feature``. Each looked-up value is iterated
        and its items are collected, so a value holding a single row
        contributes one feature. When omitted, the notebook-level
        ``image_features`` dict is used, which keeps existing
        single-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the original columns followed by
        ``IMAGE_FEATURE_1`` and ``IMAGE_FEATURE_2``.

    Raises
    ------
    KeyError
        If an image id is missing from ``features``.
    IndexError
        If fewer than two features were collected for a row.
    """
    if features is None:
        # fall back to the lookup defined at notebook level
        features = image_features

    # column names of the result: the originals plus the two feature columns
    df_cols = list(dataset.columns)
    print(df_cols)
    df_cols.extend(['IMAGE_FEATURE_1', 'IMAGE_FEATURE_2'])
    print(df_cols)

    new_list = []
    for val in dataset.values:
        # gather the features of every image referenced by this row
        imgs = []
        for img_id in val[3]:
            imgs.extend(features[img_id])
        # tolist() already returns a fresh list, so no extra copy is needed
        lst = val.tolist()
        lst.append(imgs[0])
        lst.append(imgs[1])
        new_list.append(lst)

    return pd.DataFrame(new_list, columns=df_cols)

In [None]:
def create_dataset(dataset):
    """Build the final dataframe from the report dataframe.

    The 'FINDINGS' and 'IMPRESSION' texts are cleaned with
    preprocess_sentence, 'UID' and 'IMAGES' are carried over unchanged, and
    the resulting four-column dataframe is passed to get_image_features,
    whose result is returned.
    """
    # clean the two text columns
    clean_findings = list(map(preprocess_sentence, dataset['FINDINGS'].values))
    clean_impression = list(map(preprocess_sentence, dataset['IMPRESSION'].values))

    # assemble the intermediate dataframe column by column
    cleaned = pd.DataFrame()
    cleaned['UID'] = dataset['UID']
    cleaned['FINDINGS'] = clean_findings
    cleaned['IMPRESSION'] = clean_impression
    cleaned['IMAGES'] = dataset['IMAGES']

    # attach the image features
    return get_image_features(cleaned)

In [None]:
# keep a copy of the dataframe as it was before preprocessing
data_original = data.copy()
# clean the report text and attach the two image-feature columns
data_final = create_dataset(data)
data_final.head()

['UID', 'FINDINGS', 'IMPRESSION', 'IMAGES']
['UID', 'FINDINGS', 'IMPRESSION', 'IMAGES', 'IMAGE_FEATURE_1', 'IMAGE_FEATURE_2']


Unnamed: 0,UID,FINDINGS,IMPRESSION,IMAGES,IMAGE_FEATURE_1,IMAGE_FEATURE_2
0,CXR3691,<start> the heart is normal in size the medias...,<start> no acute disease <end>,"[CXR3691_IM-1842-1001, CXR3691_IM-1842-3003]","(tf.Tensor(0.00026685063, shape=(), dtype=floa...","(tf.Tensor(4.9858256e-05, shape=(), dtype=floa..."
1,CXR3682,<start> the lungs are hypoventilated there is ...,<start> no acute cardiopulmonary abnormality <...,"[CXR3682_IM-1834-1001, CXR3682_IM-1834-2001]","(tf.Tensor(0.00033830438, shape=(), dtype=floa...","(tf.Tensor(6.356468e-05, shape=(), dtype=float..."
2,CXR3685,<start> calcified thoracic aorta mild rightwar...,<start> no acute cardiopulmonary findings <end>,"[CXR3685_IM-1836-1001, CXR3685_IM-1836-1002]","(tf.Tensor(0.00016475626, shape=(), dtype=floa...","(tf.Tensor(0.0002226108, shape=(), dtype=float..."
3,CXR37,<start> the heart is normal in size the medias...,<start> no acute disease <end>,"[CXR37_IM-1847-0001-0001, CXR37_IM-1847-0001-0...","(tf.Tensor(2.0698715e-05, shape=(), dtype=floa...","(tf.Tensor(0.00041303012, shape=(), dtype=floa..."
4,CXR3703,<start> the eamination consists of frontal and...,<start> no evidence of acute cardiopulmonary p...,"[CXR3703_IM-1850-1001, CXR3703_IM-1850-2001]","(tf.Tensor(0.0003913842, shape=(), dtype=float...","(tf.Tensor(5.5506534e-06, shape=(), dtype=floa..."


In [None]:
# number of rows in the final dataframe
print(data_final.shape[0])

# persist the final dataframe as a pickle file under drive_path
data_final.to_pickle(drive_path + '/data_final.pkl')

2610


## Conclusions

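- In this notebook we prepared the data for the model: each report was expanded into image pairs and the findings and impression texts were cleaned.
- We also used CheXNet to extract features from the X-ray images.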

## References

- https://github.com/nagapavan525/radiology-report-generation/blob/master/NewIntegrationWithIndication/1_Capstone-Radiology-PreProcessing.ipynb
- https://www.crummy.com/software/BeautifulSoup/bs4/doc/
- https://stackoverflow.com/questions/2612548/extracting-an-attribute-value-with-beautifulsoup
- https://stackoverflow.com/questions/24962673/beautiful-soup-getting-tag-id
- https://stackoverflow.com/a/47091490/4084039
- https://www.appservgrid.com/psam/Python_Samplifier--python1compute--Python_Program_to_Find_the_Size_(Resolution)_of_a_Image.html
- https://www.geeksforgeeks.org/working-images-python/
- https://gist.github.com/sebleier/554280
- https://stackoverflow.com/questions/27488446/how-do-i-get-word-frequency-in-a-corpus-using-scikit-learn-countvectorizer
- https://www.geeksforgeeks.org/python-remove-all-digits-from-a-list-of-strings/
- https://stackoverflow.com/questions/12851791/removing-numbers-from-string
- https://github.com/nagapavan525/radiology-report-generation/blob/master/radiology_report_generation_final/AutomatedRadiologyReportGenerationWithSentenceEmbeddings.ipynb
- https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/