# VinBigData Chest X-ray Abnormalities Detection
### Automatically localize and classify thoracic abnormalities from chest radiographs

![image](https://storage.googleapis.com/kaggle-competitions/kaggle/24800/logos/header.png)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Base directory of the competition data; the file paths in this notebook are built from it.
ROOT = "../input/vinbigdata-chest-xray-abnormalities-detection/"

In [None]:
# List the contents of the data directory (IPython shell escape, not plain Python).
!ls {ROOT}

In [None]:
# Load train.csv into `train` and preview its first rows.
train = pd.read_csv(ROOT+'train.csv')
train.head()

In [None]:
# Load sample_submission.csv into `sub` and preview its first rows.
sub = pd.read_csv(ROOT+'sample_submission.csv')
sub.head()

In [None]:
# (rows, columns) of the training table.
train.shape

In [None]:
# (rows, columns) of the sample submission table.
sub.shape

In [None]:
# Peek at the first four entries of the train directory.
import os
os.listdir(ROOT+'train/')[:4]

In [None]:
# Number of entries in the train directory.
len(os.listdir(ROOT+'train/'))

In [None]:
# Number of entries in the test directory.
len(os.listdir(ROOT+'test/'))

In [None]:
# Distinct image ids in the training table and in the sample submission.
len(train.image_id.unique()), len(sub.image_id.unique())

In [None]:
# Preview the first rows of the training table again.
train.head()

In [None]:
# Horizontal bar chart of how many rows of `train` carry each class_name.
import seaborn as sns
fig = plt.figure(figsize=(6,6))
sns.countplot(y='class_name', data=train);

In [None]:
# Same class count chart, with the "No finding" rows filtered out.
fig = plt.figure(figsize=(6,6))
sns.countplot(y ='class_name', data=train[train['class_name']!="No finding"]);

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Number of rows in `train` for each image_id.
train.image_id.value_counts()

In [None]:
# All rows of `train` that belong to one specific image id.
train[train['image_id'] == '7a1d72be9ef473df66d225c53e61f77e']

In [None]:
# Number of distinct rad_id values in the training table.
len(train.rad_id.unique())

In [None]:
# Bar chart of how many rows of `train` carry each rad_id.
fig = plt.figure(figsize=(12,6))
sns.countplot(x='rad_id', data=train);

In [None]:
# Count of missing values in each column of `train`.
train.isna().sum(axis=0)

In [None]:
# Missing-value count per column, restricted to rows that are not "No finding".
train[train['class_name']!="No finding"].isna().sum(axis=0)

In [None]:
# Keep only the rows whose class_name is not "No finding".
new_train = train[train['class_name']!="No finding"]

In [None]:
# Preview the first rows of the filtered table.
new_train.head()

In [None]:
# (rows, columns) of the filtered table.
new_train.shape

In [None]:
# Summary statistics of the filtered table's columns, as produced by describe().
new_train.describe()

In [None]:
# Distribution of the x_min column of `new_train`.
fig = plt.figure(figsize=(12,6))
sns.distplot(new_train['x_min']);

In [None]:
# Distribution of the x_max column of `new_train`.
fig = plt.figure(figsize=(12,6))
sns.distplot(new_train['x_max']);

In [None]:
# Distribution of the y_min column of `new_train`.
fig = plt.figure(figsize=(12,6))
sns.distplot(new_train['y_min']);

In [None]:
# Distribution of the y_max column of `new_train`.
fig = plt.figure(figsize=(12,6))
sns.distplot(new_train['y_max']);

In [None]:
import pydicom

# Path of one training DICOM file, used by the following cells to inspect a single example.
path = ROOT + 'train/4d390e07733ba06e5ff07412f09c0a92.dicom'

In [None]:
# Read the DICOM file at `path` into `dicom`.
dicom = pydicom.dcmread(path)

In [None]:
# Print the loaded DICOM dataset.
print(dicom)

In [None]:
# List the attribute names available on the loaded dataset.
dir(dicom)

In [None]:
# Rows attribute of the example file (stored as 'rows' in the `info` table built later).
dicom.Rows

In [None]:
# Columns attribute of the example file (stored as 'columns' in the `info` table built later).
dicom.Columns

In [None]:
# PatientSex attribute of the example file.
dicom.PatientSex

In [None]:
from tqdm import tqdm

# Collect Rows, Columns and PatientSex from the DICOM header of every image
# that appears in `new_train`; the three lists stay aligned with `ids`.
rows, columns, sex = [], [], []
ids = new_train['image_id'].unique()
# tqdm wraps the iterable so this pass over one file per image id reports progress.
for i in tqdm(ids):
    path = ROOT+ 'train/' + i + '.dicom'
    # stop_before_pixels=True: only header attributes are needed here, so the
    # pixel data is not requested.
    dicom = pydicom.dcmread(path, stop_before_pixels=True)
    rows.append(dicom.Rows)
    columns.append(dicom.Columns)
    sex.append(dicom.PatientSex)

In [None]:
# One row per image id with the header values collected in rows/columns/sex.
info = pd.DataFrame({'image_id':ids, 'rows':rows, 'columns':columns, 'sex':sex})
info.head()

In [None]:
# Bar chart of how many images in `info` carry each PatientSex value.
# The column is passed through x=/data= keywords, like the other countplot
# cells in this notebook, so the call does not depend on what seaborn's first
# positional parameter means.
fig = plt.figure(figsize=(12,6))
sns.countplot(x='sex', data=info);

In [None]:
# Distribution of the 'rows' column of `info`.
fig = plt.figure(figsize=(12,6))
sns.distplot(info['rows']);

In [None]:
# Distribution of the 'columns' column of `info`.
fig = plt.figure(figsize=(12,6))
sns.distplot(info['columns']);

In [None]:
# Scatter of image 'rows' (x-axis) against 'columns' (y-axis), one point per image in `info`.
fig = plt.figure(figsize=(12,6))
ax = sns.scatterplot(x='rows', y='columns', data=info, alpha=0.3)
# The title names the axis each quantity is drawn on: columns is on y, not x.
plt.title("row(x) column(y) scatter plot")
plt.show()

In [None]:
# Scatter of x_min against y_min for the rows of `new_train`.
fig = plt.figure(figsize=(12,6))
ax = sns.scatterplot(x='x_min', y='y_min', data=new_train, alpha=0.3)
plt.title("min coordinate scatter plot")
plt.show()

In [None]:
# Scatter of x_max against y_max for the rows of `new_train`.
fig = plt.figure(figsize=(12,6))
ax = sns.scatterplot(x='x_max', y='y_max', data=new_train, alpha=0.3)
plt.title("max coordinate scatter plot")
plt.show()

In [None]:
# Attach the per-image header values (rows, columns, sex) to the training table.
# NOTE(review): `info` only covers image ids present in `new_train`, and pd.merge
# is called with its defaults, so rows of `train` whose image_id is not in `info`
# are presumably dropped here — confirm this is intended.
train = pd.merge(train, info)
train.head()

In [None]:
# Sanity-check every bounding box against itself and against its image size.
# Each entry pairs a readable rule with the comparison that enforces it, so a
# failing check reports which rule broke and on how many rows. The rules and
# their order are the same as the six original bare asserts.
bbox_checks = [
    ("x_min < columns", lambda df: df['x_min'] < df['columns']),
    ("x_min < x_max", lambda df: df['x_min'] < df['x_max']),
    ("y_min < y_max", lambda df: df['y_min'] < df['y_max']),
    ("x_max <= columns", lambda df: df['x_max'] <= df['columns']),
    ("y_min < rows", lambda df: df['y_min'] < df['rows']),
    ("y_max <= rows", lambda df: df['y_max'] <= df['rows']),
]
for rule, check in bbox_checks:
    passed = check(train)
    assert passed.all(), f"{(~passed).sum()} rows violate {rule}"

In [None]:
# Distribution of rows*columns over the rows of the merged table.
fig = plt.figure(figsize=(12,6))
sns.distplot(train['rows']*train['columns']);
plt.title("total pixels in images");

In [None]:
# Distribution of x_max - x_min over the rows of the merged table.
fig = plt.figure(figsize=(12,6))
sns.distplot(train['x_max'] - train['x_min']);
plt.title("width of bounding box");

In [None]:
# One panel per class on a 7x2 grid with a shared x-axis, filled column by
# column: the first seven classes go down the left column, the rest down the right.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(16, 40), sharex=True)
fig.suptitle("width of bounding box for different categories", fontsize=16)
for idx, label in enumerate(train.class_name.unique()):
    grid_col, grid_row = divmod(idx, 7)
    panel = axes[grid_row, grid_col]
    subset = train[train['class_name'] == label]
    sns.distplot(subset['x_max'] - subset['x_min'], ax=panel)
    panel.title.set_text(label)
plt.show()

In [None]:
# Distribution of y_max - y_min over the rows of the merged table.
fig = plt.figure(figsize=(12,6))
sns.distplot(train['y_max'] - train['y_min']);
plt.title("height of bounding box");

In [None]:
# One panel per class on a 7x2 grid with a shared x-axis, filled column by
# column: the first seven classes go down the left column, the rest down the right.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(16, 40), sharex=True)
fig.suptitle("height of bounding box for different categories", fontsize=16)
for idx, label in enumerate(train.class_name.unique()):
    grid_col, grid_row = divmod(idx, 7)
    panel = axes[grid_row, grid_col]
    subset = train[train['class_name'] == label]
    sns.distplot(subset['y_max'] - subset['y_min'], ax=panel)
    panel.title.set_text(label)
plt.show()

In [None]:
# Distribution of (y_max - y_min) * (x_max - x_min) over the rows of the merged table.
fig = plt.figure(figsize=(12,6))
sns.distplot((train['y_max'] - train['y_min']) * (train['x_max'] - train['x_min']));
plt.title("area of bounding box");

In [None]:
# One panel per class on a 7x2 grid with a shared x-axis, filled column by
# column: the first seven classes go down the left column, the rest down the right.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(16, 40), sharex=True)
fig.suptitle("area of bounding box for different categories", fontsize=16)
for idx, label in enumerate(train.class_name.unique()):
    grid_col, grid_row = divmod(idx, 7)
    panel = axes[grid_row, grid_col]
    subset = train[train['class_name'] == label]
    box_height = subset['y_max'] - subset['y_min']
    box_width = subset['x_max'] - subset['x_min']
    sns.distplot(box_height * box_width, ax=panel)
    panel.title.set_text(label)
plt.show()

In [None]:
# from here https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as a uint8 array scaled to 0..255.

    Args:
        path: Location of the DICOM file to read.
        voi_lut: When true, pass the raw pixel array through `apply_voi_lut`
            before scaling; otherwise use the raw pixel array as stored.
        fix_monochrome: When true, invert the intensities of images whose
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        The image min-max scaled to the 0..255 range as `np.uint8`. An image
        with a single constant value comes back as all zeros.
    """
    # dcmread is the reader used everywhere else in this notebook; read_file is
    # its deprecated alias.
    dicom = pydicom.dcmread(path)

    pixels = dicom.pixel_array
    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the minimum is 0, then scale the maximum to 1.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would be a division by zero.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
# Display the X-ray stored at the current value of `path` in grayscale.
# NOTE(review): when the cells run top to bottom, `path` was last assigned inside
# the header-reading loop, so this shows that loop's final image rather than the
# example file chosen earlier — confirm which one is meant.
_ = plt.figure(figsize=(10, 10))
plt.imshow(read_xray(path), cmap='gray');

In [None]:
import random
import matplotlib.patches as patches

# Show a 4x4 grid of X-rays picked at random from the rows of `train`
# (no bounding boxes are drawn here).
_, axes = plt.subplots(4,4, figsize=(20, 20))
for ax in axes.flat:
    # randrange(len(train)) yields 0..len(train)-1. random.randint(0, len(train))
    # is inclusive at both ends and could return len(train), which makes .iloc
    # raise IndexError.
    sample = train.iloc[random.randrange(len(train))]
    path = ROOT + 'train/' + sample['image_id'] + '.dicom'
    ax.imshow(read_xray(path), cmap='gray')
plt.show()

In [None]:
# Preview the first rows of the merged training table.
train.head()

In [None]:
# Distinct class names present in the merged training table.
train['class_name'].unique()

In [None]:
def plot(name):
    """Show a 4x4 grid of random X-rays, each with one `name` bounding box drawn.

    Every panel picks one random row of `train` whose class_name equals `name`,
    renders that row's image and outlines the row's box in blue. Rows are drawn
    independently, so the same image may appear more than once.

    Args:
        name: Value of the class_name column to show examples for.

    Raises:
        ValueError: If `train` has no rows with that class name.
    """
    ttrain = train[train['class_name'] == name]
    if ttrain.empty:
        # Fail before creating a figure, with a message naming the class.
        raise ValueError(f"no rows in train with class_name {name!r}")
    fig, axes = plt.subplots(4,4, figsize=(20, 20))
    fig.suptitle(name+" examples", fontsize=16)
    for ax in axes.flat:
        # randrange excludes len(ttrain). random.randint(0, len(ttrain)) is
        # inclusive and could index one past the last row.
        row = ttrain.iloc[random.randrange(len(ttrain))]
        path = ROOT + 'train/' + row['image_id'] + '.dicom'
        ax.imshow(read_xray(path), cmap='gray')
        ax.add_patch(patches.Rectangle(
            (row['x_min'], row['y_min']),
            row['x_max'] - row['x_min'],
            row['y_max'] - row['y_min'],
            edgecolor='blue',
            fill=False)
        )
    plt.show()

In [None]:
# Example grid for the "Cardiomegaly" class.
plot("Cardiomegaly")

In [None]:
# Example grid for the "Pleural effusion" class.
plot("Pleural effusion")

In [None]:
# Example grid for the "Pleural thickening" class.
plot("Pleural thickening")

This notebook is still a work in progress, but
### do upvote if it helped :)