In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import pydicom 
import os
import ast
from tqdm import tqdm
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Base directory of the dataset (note the trailing slash); later cells build
# paths by plain string concatenation, e.g. ROOT + 'train'.
ROOT = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/'

In [None]:
import pandas as pd
import numpy as np

# Load the annotation table. The path is built from ROOT (which already ends
# with '/') so the dataset location is defined in exactly one place.
train_df = pd.read_csv(ROOT + 'train.csv')
train_df.head()

In [None]:
# Column dtypes and non-null counts of the raw annotation table.
train_df.info()

In [None]:
# (n_rows, n_columns) of the raw annotation table.
train_df.shape

In [None]:
# Number of distinct image ids that have at least one annotation row.
train_df['image_id'].nunique()  # nunique() skips missing values by default, whereas unique() would include NaN

In [None]:
# Number of distinct entries in the train image directory (de-duplicated via set()).
len(set(os.listdir(ROOT+'train')))

In [None]:
# Image ids present on disk: every entry of the train directory, cut at its first dot.
list_images = [file_name.split('.')[0] for file_name in os.listdir(ROOT+'train')]


In [None]:
# Collect, in row order and with repeats, the image_id of every annotation row
# whose image file is present on disk.
# Membership is tested against a set: looking each id up in the `list_images`
# list would scan the whole list once per annotation row.
available_ids = set(list_images)
act_images = [image_id for image_id in train_df['image_id'] if image_id in available_ids]

In [None]:
train_new = train_df[train_df['image_id'].isin(act_images)]

In [None]:
train_new.image_id.nunique()

In [None]:
train_new.head()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(y='class_name',data=train_new,palette='deep')
plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(y='class_name',data=train_new[train_new['class_name']!='No finding'])
plt.show()

In [None]:
# Number of annotation rows per image id.
train_new.image_id.value_counts()

In [None]:
# Every annotation row recorded for one specific image id.
train_new[train_new['image_id'] == 'ecf474d5d4f65d7a3e23370a68b8c6a0']

In [None]:
# Number of distinct rad_id values (presumably one id per annotating radiologist — confirm against the dataset description).
train_new.rad_id.nunique()

In [None]:
# Annotation rows contributed by each rad_id.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(y='rad_id', data=train_new, ax=ax)
plt.show()

In [None]:
# Missing values per column over all retained annotation rows.
train_new.isna().sum()

In [None]:
# Missing values per column once the 'No finding' rows are excluded.
train_new[train_new['class_name'] != 'No finding'].isna().sum(axis=0)

In [None]:
# Working frame for the rest of the notebook: only rows whose class is not 'No finding'.
df = train_new[train_new['class_name'] != 'No finding']

In [None]:
df.head()

In [None]:
# (n_rows, n_columns) of the filtered frame.
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Path of one hard-coded example DICOM file from the train directory.
path = ROOT + 'train/00dc70e84d141255f7fc6f8038bdd72e.dicom'

In [None]:
# Read the full file (header and pixel data) into a pydicom dataset.
dicom = pydicom.dcmread(path)

In [None]:
# Dump the dataset's elements as text.
print(dicom)

In [None]:
# The three header attributes that are collected for every image in the next cell.
dicom.Rows,dicom.Columns,dicom.PatientSex

In [None]:
# Gather Rows, Columns and PatientSex from the header of every distinct image in `df`.
# The three lists are filled in the same order as `ids`, so they stay aligned with it.
# NOTE: the loop rebinds the module-level names `path` and `dicom`; later cells that
# read `path` therefore see the last image visited here, not the earlier example file.
rows, columns, sex = [], [], []
ids = df['image_id'].unique()
for i in ids:
    path = ROOT + 'train/' + i + '.dicom'
    # stop_before_pixels=True: only the header attributes are needed here.
    dicom = pydicom.dcmread(path, stop_before_pixels = True)
    rows.append(dicom.Rows)
    columns.append(dicom.Columns)
    sex.append(dicom.PatientSex)

In [None]:
info = pd.DataFrame({'image_id':ids, 'rows':rows, 'columns':columns, 'sex':sex})
info.head()

In [None]:
info.shape

In [None]:
# Number of images per PatientSex value.
# The column is passed by keyword (x=..., data=...) like the other countplot cells:
# a positionally passed Series is read as `x` by older seaborn releases but as
# `data` by newer ones, where x/y are keyword-only.
plt.figure(figsize = (8,5))
sns.countplot(x='sex', data=info, palette ='dark')
plt.show()

In [None]:

train = pd.merge(df,info)

In [None]:
train.head()

In [None]:

# Sanity checks: every bounding box has positive extent and lies inside its image
# (x bounded by `columns`, y bounded by `rows`).
assert train['x_min'].lt(train['columns']).all()
assert train['x_min'].lt(train['x_max']).all()
assert train['y_min'].lt(train['y_max']).all()
assert train['x_max'].le(train['columns']).all()
assert train['y_min'].lt(train['rows']).all()
assert train['y_max'].le(train['rows']).all()

In [None]:
# Distribution of image area (rows * columns), one value per annotation row.
# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay (cut=3) is seaborn's documented equivalent of distplot's defaults.
plt.figure(figsize=(12,6))
sns.histplot(train['rows']*train['columns'], kde=True, stat="density", kde_kws=dict(cut=3));
plt.title("total pixels in images");

In [None]:
# Read the DICOM file currently referenced by `path`, dump its elements and show
# the raw (unprocessed) pixel array with the 'bone' colormap.
# dcmread is used instead of the deprecated read_file alias, matching the earlier cells.
myfile = pydicom.dcmread(path)
print(myfile)
plt.figure(figsize=(12,10))
plt.imshow(myfile.pixel_array, cmap=plt.cm.bone)
plt.show()

In [None]:
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path,voi_lut=True,fix_monochrome=True):
    """Read a DICOM file and return its image as a uint8 array scaled to 0..255.

    Args:
        path: Location of the DICOM file to read.
        voi_lut: If True, pass the pixel array through ``apply_voi_lut`` using
            the dataset's own attributes; otherwise use the raw pixel array.
        fix_monochrome: If True and the dataset's PhotometricInterpretation is
            'MONOCHROME1', invert the intensities (max - value).

    Returns:
        numpy.ndarray of dtype uint8 with the same shape as the pixel array.
        The minimum input value maps to 0 and the maximum to 255; a constant
        image (no intensity range) is returned as all zeros.
    """
    # dcmread replaces the deprecated read_file alias and matches the rest of the notebook.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array,dicom)
    else:
        data = dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == 'MONOCHROME1':
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then stretch to the full 0..255 range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by zero would yield NaN, which has no defined uint8 value.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
# Show the image at `path` after read_xray's processing, in grayscale.
_ = plt.figure(figsize=(10, 10))
xray_image = read_xray(path)
plt.imshow(xray_image, cmap='gray');

In [None]:
import random
import matplotlib.patches as patches

# Show a 4x4 grid of X-rays drawn at random from the rows of `train`
# (sampling is with replacement, so an image can appear more than once).
_,axes = plt.subplots(4,4,figsize=(20,20))
for ax in axes.flat:
    # randrange excludes its upper bound; random.randint(0, len(train)) could
    # return len(train) itself and make .iloc raise IndexError.
    sample_idx = random.randrange(len(train))
    path = ROOT + 'train/' + train.iloc[sample_idx]['image_id'] + '.dicom'
    ax.imshow(read_xray(path),cmap='gray')
plt.show()

In [None]:
# Distinct class labels present in the merged frame.
train['class_name'].unique()

In [None]:

def plot(name):
    """Show a 4x4 grid of randomly chosen annotated examples of one class.

    Each panel picks one random row of the module-level ``train`` frame whose
    ``class_name`` equals ``name``, renders that row's image with ``read_xray``
    and overlays the row's bounding box in blue. Rows are sampled with
    replacement, so the same image or box can appear in several panels.

    Args:
        name: Class label to match against ``train['class_name']``.

    Raises:
        ValueError: If no row of ``train`` has the requested class name.
    """
    ttrain = train[train['class_name'] == name]
    if ttrain.empty:
        raise ValueError("no annotations found for class {!r}".format(name))
    fig, axes = plt.subplots(4,4, figsize=(20, 20))
    fig.suptitle(name+" examples", fontsize=16)
    for axes_row in axes:
        for ax in axes_row:
            # randrange excludes its upper bound; random.randint(0, len(ttrain))
            # could return len(ttrain) itself and make .iloc raise IndexError.
            row = ttrain.iloc[random.randrange(len(ttrain))]
            path = ROOT + 'train/' + row['image_id'] + '.dicom'
            ax.imshow(read_xray(path), cmap='gray')
            # Rectangle takes the top-left corner, then width and height.
            ax.add_patch(patches.Rectangle(
                (row['x_min'], row['y_min']),
                row['x_max'] - row['x_min'],
                row['y_max'] - row['y_min'],
                edgecolor='blue',
                fill=False)
            )
    plt.show()

In [None]:
# Draw one 4x4 example grid for every class label that occurs in `train`.
class_names = train['class_name'].unique()
for name in class_names:
    plot(name)