
# Thoracic lung diseases :
### Thoracic disorders are conditions of the heart, lungs, mediastinum, esophagus, chest wall, diaphragm and great vessels and may include:
* Chronic obstructive pulmonary disease (COPD)
* Pulmonary embolism
* Lung cancer, etc.


## Importing the libraries

In [None]:
from glob import glob # to read files
from os.path import splitext
from random import choice

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from  matplotlib import colors
import seaborn as sns
import missingno as msno  #to visualize the missing values
import plotly.express as px

import pydicom
from pydicom import read_file

import skimage
from skimage.io import imread


# Getting deeper insight into our dataset

In [None]:
# Load the training annotations and preview the first rows.
train_csv = '../input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Keep the (rows, columns) shape of the annotation table and report it.
shape = df.shape
print(f'The shape of our datase: {shape}')

#### *Note:* The CSV file has 67,914 rows, whereas the overview mentions 15,000 independently labeled images (evaluated on a test set of 3,000 images). So there are certainly repeated image IDs in our data; we will handle this.

# Dealing with the duplicated records
### First, exploring them

In [None]:
# Count how many rows each image_id has in the annotation table.
df['image_id'].value_counts()

In [None]:
# Show every row (all columns) recorded for one specific image_id.
df.loc[df['image_id']=='ecf474d5d4f65d7a3e23370a68b8c6a0',:]

In [None]:
# Number of rows whose image_id already appeared earlier in the table,
# and the number of distinct image_ids that leaves.
duplication = df['image_id'].duplicated().sum()
print(f'The count of the duplication in our dataset: {duplication}')
print(f'Unique value :  {shape[0] - duplication}')

## Reading all the files in the train folder

In [None]:
# Collect the path of every entry inside the train folder and count them.
train_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/train'
pathes = glob(train_dir + '/*')
len(pathes)

In [None]:
# Build a lookup from image_id (file name without its extension) to file path.
keys = [splitext(path)[0].split('/')[-1] for path in pathes]
pathes_dict = dict(zip(keys, pathes))

In [None]:
# Attach a 'pathes' column by looking each row's image_id up in pathes_dict.
df = df.assign(pathes=df['image_id'].map(pathes_dict))
df.head()

# Exploring Our Class Label

* Note: there are 14 abnormality class labels, as mentioned in the overview, plus the 'No finding' label.

In [None]:
# Count how many annotation rows carry each class_name.
df['class_name'].value_counts()

## Visualizing the count of class_name

In [None]:
# Horizontal bar chart with one bar per class_name, sized by its row count.
count_fig, count_ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, y='class_name', ax=count_ax)
count_ax.set_title('Counts of the Classes', fontsize=20)

In [None]:
# Pie chart of the class distribution.
# Labels and counts are both taken from the same value_counts() result, so
# every slice is labelled with the class it actually counts. A hand-written
# label list only matches if its order happens to equal the count order.
class_counts = df['class_name'].value_counts()
list_ = class_counts.index.tolist()
fig = px.pie(values=class_counts.values, names=list_,
             color_discrete_sequence=px.colors.sequential.RdBu)

fig.show()

# Checking for missing values and handling them

In [None]:
# Per-column count of missing values.
df.isna().sum()

### Visualizing the missing values

In [None]:
# Bar chart of the non-missing count per column.
# msno.bar opens its own figure when no axes are supplied, so a preceding
# plt.figure(...) only leaves an empty extra figure; pass the size directly.
msno.bar(df, figsize=(5, 5))
plt.show()

In [None]:
# Visualize where the missing values are located (row by column).
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='Blues')

*Note:* All the missing values of (x_min, y_min, x_max, y_max) belong to the 'No finding' class.

### Dealing with the missing values
#### Filling the missing value with zero

In [None]:
# Replace every missing entry with 0, then preview the result.
df = df.fillna(value=0, axis=0)
df.head()

In [None]:
# Re-check: per-column count of missing values after filling.
count = df.isna().sum()
print('The count of the missing values :', count, sep='\n')


# Having fun with DICOM images

## **What is DICOM?**
##### Digital Imaging and Communications in Medicine (DICOM) is the standard for the communication and management of medical imaging information and related data. DICOM is most commonly used for storing and transmitting medical images, enabling the integration of medical imaging devices such as scanners, servers, workstations, printers, network hardware, and picture archiving and communication systems (PACS) from multiple manufacturers. It has been widely adopted by hospitals and is making inroads into smaller applications like dentists' and doctors' offices.
For more info: [https://en.wikipedia.org/wiki/DICOM](https://en.wikipedia.org/wiki/DICOM)

## Getting deeper insight into our .dicom images


In [None]:
# Pick the file path of one random annotation row. Converting to a list makes
# random.choice index by position rather than by the Series' index labels.
rand_img = choice(df['pathes'].tolist())
print('our random data :'+' '+str(rand_img))
# dcmread is pydicom's current reader; read_file is its deprecated alias.
img = pydicom.dcmread(rand_img)
# Print the metadata stored in the DICOM file.
print(img)

In [None]:
from skimage.transform import resize
import tqdm
def resize_img(img, output_shape=(512, 512)):
    """Return the pixel data of a DICOM dataset resized to ``output_shape``.

    Parameters
    ----------
    img : object exposing a ``pixel_array`` attribute
        The loaded DICOM dataset.
    output_shape : tuple of int, optional
        Target (rows, columns) passed to ``skimage.transform.resize``.
        Defaults to (512, 512), the size this notebook has always used.

    Returns
    -------
    The resized array produced by ``skimage.transform.resize``.
    """
    return resize(img.pixel_array, output_shape)


*Note: The DICOM images carry additional information that we can make use of later on.*

In [None]:

# plotting image with bounding box via matplotlib.patches 
def create_bbox(data, img, ax=None):
    """Show the resized image and overlay one rectangle per annotation row.

    Parameters
    ----------
    data : pandas.DataFrame
        Annotation rows for this image. Must contain the columns
        'class_name', 'x_min', 'y_min', 'x_max' and 'y_max', with the box
        corners expressed in pixels of the original (un-resized) image.
    img : object exposing a ``pixel_array`` attribute
        The loaded DICOM dataset the annotations belong to.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new figure with a single axes is
        created, as before.

    Returns
    -------
    The Axes that was drawn on.

    Notes
    -----
    ``data`` is not modified; colours are looked up per row instead of being
    written into a new column of a (possibly sliced) frame.
    """
    color_dict = {
        'No finding': 'w',
        'Aortic enlargement': 'xkcd:sky blue',
        'Cardiomegaly': 'xkcd:green',
        'Pulmonary fibrosis': 'xkcd:beige',
        'Pleural thickening': 'xkcd:purple',
        'Lung Opacity': 'xkcd:red',
        'Pleural effusion': 'xkcd:yellow',
        'Other lesion': 'xkcd:orange',
        'Nodule/Mass': 'xkcd:neon green',
        'Infiltration': 'xkcd:pale orange',
        'ILD': 'xkcd:blue',
        'Calcification': 'xkcd:white',
        'Consolidation': 'xkcd:murky green',
        'Atelectasis': 'xkcd:tomato',
        'Pneumothorax': 'xkcd:puke brown',
    }

    if ax is None:
        ax = plt.figure().add_subplot(111)

    resized = resize_img(img)
    ax.imshow(resized, cmap=plt.cm.bone)

    # The image is resized to a fixed size whatever its original resolution,
    # so the coordinate scale must be derived per image and per axis.
    orig_rows, orig_cols = img.pixel_array.shape[:2]
    x_scale = resized.shape[1] / orig_cols
    y_scale = resized.shape[0] / orig_rows

    boxes = data[['class_name', 'x_min', 'y_min', 'x_max', 'y_max']]
    for class_name, x_min, y_min, x_max, y_max in boxes.itertuples(index=False, name=None):
        # Rectangle expects (x, y), width, height -- not the opposite corner.
        width = (float(x_max) - float(x_min)) * x_scale
        height = (float(y_max) - float(y_min)) * y_scale
        # Skip rows without a real box (zero-filled or NaN coordinates).
        if not (width > 0 and height > 0):
            continue
        rect = patches.Rectangle((float(x_min) * x_scale, float(y_min) * y_scale),
                                 width, height,
                                 linewidth=1,
                                 edgecolor=color_dict.get(class_name, 'w'),
                                 facecolor='none')
        ax.add_patch(rect)
    return ax
    

In [None]:
# Gather every annotation row belonging to the randomly chosen image and draw it.
is_selected = df['pathes'] == rand_img
selected_img = df[is_selected]
create_bbox(selected_img, img)
plt.show()

In [None]:
# Visualize several random images together with their bounding boxes.
# df holds one row per annotation, so the same path occurs many times;
# sample the distinct paths without replacement so no image is repeated.
n_samples = 5
unique_pathes = df['pathes'].unique()
rand_list = np.random.choice(unique_pathes,
                             size=min(n_samples, len(unique_pathes)),
                             replace=False)

for path in rand_list:
    # dcmread is pydicom's current reader; read_file is its deprecated alias.
    img = pydicom.dcmread(path)
    bbox_info = df[df['pathes'] == path]
    # create_bbox opens a figure of its own when none is passed, so no shared
    # (and otherwise empty) figure is created here.
    create_bbox(bbox_info, img)

plt.show()


### Getting the additional information from the .dicom files into our dataframe