In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
import glob

In [12]:
import os
import sys
sys.path.append("../")

# Read raw csv

In [13]:
# `sep=", "` is a multi-character separator, which the default C parser cannot
# handle.  Ask for the Python engine explicitly instead of relying on the
# implicit fallback, which emits a ParserWarning on every run.
df = pd.read_csv("../data/raw/train-rle.csv", dtype="str", sep=", ", engine="python")
# One row per ImageId; EncodedPixels becomes the list of that image's entries.
df = df.groupby("ImageId")["EncodedPixels"].apply(list).reset_index()
df.iloc[0]["EncodedPixels"][0]

  """Entry point for launching an IPython kernel.


'-1'

In [14]:
# Show the first rows of the grouped frame.
df.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.1000.151787516...,[-1]
1,1.2.276.0.7230010.3.1.4.8323329.10000.15178752...,[-1]
2,1.2.276.0.7230010.3.1.4.8323329.10001.15178752...,[-1]
3,1.2.276.0.7230010.3.1.4.8323329.10002.15178752...,[-1]
4,1.2.276.0.7230010.3.1.4.8323329.10003.15178752...,[-1]


In [15]:
def _iterate_dcm_dir(in_dir="../data/raw/dicom-images-train/"):
    if not in_dir.endswith("/"):
        in_dir += "/"
    for fp in glob.glob(in_dir + '*/*/*.dcm'):
        yield fp
def id_from_filename(filename):
    """Return ``filename`` with its last extension stripped.

    :param filename: str: file name or path
    :return: str: everything before the final extension
    """
    stem, _extension = os.path.splitext(filename)
    return stem

In [None]:
# Build an id -> file path lookup in one pass over the DICOM directory and
# attach it with a single vectorised map, instead of running a full-frame
# boolean scan for every file.  For a repeated id the later path wins, and ids
# without a file end up as NaN, matching the row-by-row assignment.
fp_by_image_id = {
    id_from_filename(os.path.basename(fp)): fp
    for fp in _iterate_dcm_dir("../data/raw/dicom-images-train/")
}
df["fp"] = df.ImageId.map(fp_by_image_id)


In [None]:
# Show the first rows again to check the "fp" column.
df.head()

In [None]:
# Flag an image as empty when the first entry of its EncodedPixels list is "-1".
df["empty"] = df["EncodedPixels"].apply(lambda encoded: encoded[0] == "-1")

# Processed csv

In [None]:
# Show the first rows again to check the "empty" column.
df.head()

# Work with photos

In [None]:
import pydicom
def read_dcm(fp, v=0):
    """
    Read a DICOM file.

    :param fp: str: DICOM file path
    :param v: truthy to print the parsed dataset
    :return: tuple: (``pixel_array`` of the dataset, its ``ViewPosition`` attribute)
    """
    # `read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3.0).
    ds = pydicom.dcmread(fp)  # read dicom image
    img = ds.pixel_array  # get image array
    if v:
        print(ds)
    return img, ds.ViewPosition

In [None]:
# Load the dataset stored under index label 0 and keep its pixels as a sample.
fp = df.fp[0]
# `read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3.0).
ds = pydicom.dcmread(fp)
sample_img = ds.pixel_array

In [None]:
# ViewPosition attribute of the sample dataset.
ds.ViewPosition

In [None]:
# Print the whole parsed sample dataset.
print(ds)

In [None]:
# %matplotlib auto
# Summary statistics of the sample's pixel values, followed by the image itself.
print(pd.Series(sample_img.flatten()).describe())
# Separate statements: a trailing `imshow(...), show()` tuple would be echoed
# as `(<AxesImage>, None)` in the cell output.
plt.imshow(sample_img, cmap=plt.cm.bone)
plt.show()
# cv2.imshow("Chest",sample_img), cv2.waitKey(0), cv2.destroyAllWindows()

In [None]:
%matplotlib inline
plt.hist(sample_img.flatten(), bins=190), plt.show();

In [None]:
# Frequency of each pixel value, ordered by value.  Written as two statements
# so the cell does not echo a `([<Line2D>], None)` tuple.
plt.plot(pd.Series(sample_img.flatten()).value_counts().sort_index())
plt.show()

In [None]:
# Largest pixel value in the sample image.
sample_img.max()

# Sample img save and restore

In [None]:
# Write the image to PNG and read it back 20 times in a row, then report the
# fraction of pixels that still equal the original sample.
test_img = sample_img.copy()
for round_trip in range(20):
    cv2.imwrite("test_save.png", test_img)
    test_img = cv2.imread("test_save.png", cv2.IMREAD_GRAYSCALE)
unchanged = test_img == sample_img
unchanged.mean()

-- end sample img

In [None]:
for i, row in df[["empty", "fp"]].iterrows():
    img, vp = read_dcm(row["fp"])

    # Same report twice: first on the pixels as read, then again after every
    # zero-valued pixel has been overwritten with 255.
    for stage in ("Before", "After"):
        if stage == "After":
            img[img == 0] = 255
        print(stage, row["empty"], vp, np.mean(img))
        dist = pd.Series(img.flatten()).value_counts().sort_index()
        plt.figure(figsize=(15, 5))
        plt.subplot(121)
        plt.imshow(img)
        plt.subplot(122)
        plt.plot(dist)
        plt.show()

    # Stop after the first row whose index label is above 5.
    if i > 5:
        break


# Collect info

In [None]:
# Mean pixel value of each image, read from its DICOM file.
df["mean"] = df["fp"].apply(lambda dcm_path: read_dcm(dcm_path)[0].mean())

In [None]:
def standartize(img):
    """Shift ``img`` to zero mean and scale it by its standard deviation.

    :param img: np.ndarray: image pixels
    :return: np.ndarray: float array equal to ``(img - img.mean()) / img.std()``
    """
    # `np.float` was only an alias of the builtin `float` and was removed in
    # NumPy 1.24; the builtin yields the same float64 result on every version.
    return (img - img.mean()).astype(float) / img.std()
# Mean of each image after it has been passed through standartize().
df["new_mean"] = df["fp"].apply(
    lambda dcm_path: standartize(read_dcm(dcm_path)[0]).mean()
)

In [None]:
from collections import defaultdict

# Collect the mean pixel value of every image, keyed by the ViewPosition
# value that read_dcm returns alongside the pixels.
data = defaultdict(list)
for dcm_path in tqdm(df["fp"].values):
    pixels, view = read_dcm(dcm_path)
    data[view].append(np.mean(pixels))

In [None]:
# List the ViewPosition values that were encountered.
print(data.keys())

In [None]:
# Side-by-side histograms of the per-image means: "AP" on the left, "PA" on the right.
bins = 100
plt.figure(figsize=(20, 5))
for position, view in ((121, "AP"), (122, "PA")):
    plt.subplot(position)
    plt.hist(data[view], bins=bins)
plt.show()

In [None]:
# Histogram of the per-image mean pixel values.
df["mean"].hist(bins=100)

In [None]:
# Histogram of the per-image means after standartize().
df["new_mean"].hist(bins=100)

In [None]:
# Average of the per-image means for the "PA" group and for the "AP" group.
np.mean(data["PA"]), np.mean(data["AP"])

In [None]:
# For each side of the mean == 149 threshold, draw `plt_in_row` randomly
# sampled images (top row) and the same images after rescaling and
# standardising their non-zero pixels (bottom row).
plt_in_row = 5
for cls in range(2):
    plt.figure(figsize=(20, 8))

    if cls == 0:
        print("Lower 149")
    else:
        print("Upper 149")
    for plt_i in range(plt_in_row):
        # cls == 0 selects rows with mean <= 149, cls == 1 rows with mean > 149.
        row = df.loc[((df["mean"] > 149) == bool(cls))].sample(1)
        fp = row["fp"].values[0]
        img, _ = read_dcm(fp)
        mean = row["mean"].values[0]
        # All statistics below ignore zero-valued pixels.
        mask = img > 0
        img_min = img[mask].min()
        img_max = img[mask].max()

        plt.subplot(2, plt_in_row, plt_i + 1)
        plt.title("mean:{:.3f} min:{} max:{}".format(mean, img_min, img_max))
        plt.imshow(img)

        # `np.float` was an alias of the builtin `float` and was removed in
        # NumPy 1.24; the builtin gives the same float64 arrays.
        img = img.astype(float)
        # Stretch the non-zero pixels to [0, 255], then standardise them.
        img[mask] = (img[mask] - img[mask].min()) / (img[mask].max() - img[mask].min()) * 255.0
        img[mask] = (img[mask] - img[mask].mean()).astype(float) / img[mask].std()
        mean = img[mask].mean()

        plt.subplot(2, plt_in_row, plt_i + 1 + plt_in_row)
        plt.title(mean)
        plt.imshow(img)
    plt.show()


# All bad mask (union of zero-valued pixels across all images)

In [None]:
# Mark with 1 every position that holds a zero-valued pixel in at least one image.
all_bad_mask = np.zeros((1024, 1024))
for dcm_path in tqdm(df.fp.values):
    pixels = read_dcm(dcm_path)[0]
    zero_positions = pixels == 0
    all_bad_mask[zero_positions] = 1

In [None]:
# Number of zero-valued pixels in each image.
df["empty_area"] = df["fp"].apply(
    lambda dcm_path: (read_dcm(dcm_path)[0] == 0).sum()
)

In [None]:
def without_empty(img):
    """Return the non-zero elements of ``img`` as a flat array.

    :param img: np.ndarray: image pixels
    :return: np.ndarray: 1-D array of the elements of ``img`` that differ from 0
    """
    non_zero = img != 0
    return img[non_zero]
# Mean of each image computed over its non-zero pixels only.
df["mean_without_empty"] = df["fp"].apply(
    lambda dcm_path: without_empty(read_dcm(dcm_path)[0]).mean()
)

In [None]:
# Histogram of the means computed over non-zero pixels only.
df.mean_without_empty.hist(bins=100)

In [None]:
# The ten smallest distinct empty_area values with how often each occurs.
df.empty_area.value_counts().sort_index()[:10]

In [None]:
# Histogram of the zero-pixel counts.
df.empty_area.hist()

In [None]:
# Number of images whose zero-pixel count exceeds 10% of 1024 * 1024.
(df.empty_area > (1024 * 1024 * 0.10)).sum()

In [None]:
# Histogram of empty_area restricted to images above 25% of 1024 * 1024.
df.empty_area[df.empty_area > (1024 * 1024 * 0.25)].hist(bins=100)

In [None]:
# Show the zero-pixel mask of the first few images; stop once the counter passes 5.
for i, fp in tqdm(enumerate(df.fp.values)):
    pixels = read_dcm(fp)[0]
    plt.imshow(pixels == 0)
    plt.show()
    if i > 5:
        break

In [None]:
# Display the accumulated zero-pixel mask.
plt.imshow(all_bad_mask)
plt.show()

In [None]:
# Thresholds expressed as a share of a 1024 * 1024 image.
# NOTE(review): lower_bound is larger than upper_bound and is not referenced
# in this cell; only upper_bound takes part in the filter below.
lower_bound = 1024 * 1024 * 0.1
upper_bound = 1024 * 1024 * 0.001
i = 0
small_empty_area = (df.empty_area >= 0) & (df.empty_area < upper_bound)
# Preview zero-pixel mask (left) and image (right); stop once six were shown.
for i, fp in enumerate(df[small_empty_area]["fp"], start=1):
    img, _ = read_dcm(fp)
    plt.subplot(121)
    plt.imshow(img == 0)
    plt.subplot(122)
    plt.imshow(img)
    plt.show()
    if i > 5:
        break

# Cut the dataframe down to images with a small amount of black area

In [None]:
from utils.mask_functions import rle2mask

In [None]:
# Keep only the rows whose zero-pixel count lies in [0, upper_bound).
cut_df = df.loc[df["empty_area"].ge(0) & df["empty_area"].lt(upper_bound)]

In [None]:
# (rows, columns) of the filtered frame.
cut_df.shape

In [None]:
# Share of rows in the filtered frame that are flagged empty.
cut_df["empty"].mean()

In [None]:
def wrap_rle2mask(lst):
    """Merge several encoded masks into one binary 1024x1024 mask.

    :param lst: iterable of encodings, each decoded with ``rle2mask(rle, 1024, 1024)``
    :return: np.ndarray: integer array of shape (1024, 1024) holding 1 wherever
        any decoded mask is greater than 0, and 0 elsewhere
    """
    shape = (1024, 1024)
    # `np.int` was an alias of the builtin `int` and was removed in NumPy 1.24;
    # the builtin selects the same default integer dtype.
    mask = np.zeros(shape, dtype=int)
    for rle in lst:
        mask[rle2mask(rle, *shape) > 0] = 1
    return mask

In [None]:
# Show image (left) and merged mask (right) for non-empty rows of cut_df,
# printing each dataset and its mean pixel value; stop once six were shown.
i = 0
for i, (_, row) in enumerate(cut_df[~cut_df["empty"]].iterrows(), start=1):
    fp = row["fp"]
    mask = wrap_rle2mask(row["EncodedPixels"])
    img, _ = read_dcm(fp, v=1)
    print(img.mean())
    plt.subplot(121)
    plt.imshow(img)
    plt.subplot(122)
    plt.imshow(mask)
    plt.show()
    if i > 5:
        break

Example of fields:  
`(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.276.0.7230010.3.1.4.8323329.10107.1517875222.121330
(0008, 0020) Study Date                          DA: '19010101'
(0008, 0030) Study Time                          TM: '000000.00'
(0008, 0050) Accession Number                    SH: ''
(0008, 0060) Modality                            CS: 'CR'
(0008, 0064) Conversion Type                     CS: 'WSD'
(0008, 0090) Referring Physician's Name          PN: ''
(0008, 103e) Series Description                  LO: 'view: AP'
(0010, 0010) Patient's Name                      PN: 'eb0f4cc8-85d7-4089-b670-fc8c65f31dc1'
(0010, 0020) Patient ID                          LO: 'eb0f4cc8-85d7-4089-b670-fc8c65f31dc1'
(0010, 0030) Patient's Birth Date                DA: ''
(0010, 0040) Patient's Sex                       CS: 'M'
(0010, 1010) Patient's Age                       AS: '49'
(0018, 0015) Body Part Examined                  CS: 'CHEST'
(0018, 5101) View Position                       CS: 'AP'
(0020, 000d) Study Instance UID                  UI: 1.2.276.0.7230010.3.1.2.8323329.10107.1517875222.121329
(0020, 000e) Series Instance UID                 UI: 1.2.276.0.7230010.3.1.3.8323329.10107.1517875222.121328
(0020, 0010) Study ID                            SH: ''
(0020, 0011) Series Number                       IS: "1"
(0020, 0013) Instance Number                     IS: "1"
(0020, 0020) Patient Orientation                 CS: ''
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 1024
(0028, 0011) Columns                             US: 1024
(0028, 0030) Pixel Spacing                       DS: ['0.168', '0.168']
(0028, 0100) Bits Allocated                      US: 8
(0028, 0101) Bits Stored                         US: 8
(0028, 0102) High Bit                            US: 7
(0028, 0103) Pixel Representation                US: 0
(0028, 2110) Lossy Image Compression             CS: '01'
(0028, 2114) Lossy Image Compression Method      CS: 'ISO_10918_1'
(7fe0, 0010) Pixel Data                          OB: Array of 104072 elements`

In [None]:
# Dataset attribute names that are copied into the dataframe, one column each.
fields_of_interest = [
    "Modality",
    "ConversionType",
    "PatientName",
    "PatientID",
    "PatientSex",
    "PatientAge",
    "BodyPartExamined",
    "ViewPosition",
    "SeriesNumber",
    "InstanceNumber",
    "PatientOrientation",
    "SamplesPerPixel",
    "PhotometricInterpretation",
    "Rows",
    "Columns",
    "PixelSpacing",
    "BitsAllocated",
    "BitsStored",
    "HighBit",
    "PixelRepresentation",
    "LossyImageCompression",
    "LossyImageCompressionMethod",
]

In [None]:
# Copy the selected header attributes of every DICOM file into df, one string
# column per attribute.
df.index = range(len(df))
new_df = defaultdict(list)
# Wrap the array itself, not enumerate(...), so tqdm knows the total; the
# enumerate index was never used.
for fp in tqdm(df.fp.values):
    # Only header attributes are read below, so skip loading the pixel data.
    # `read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3.0).
    ds = pydicom.dcmread(fp, stop_before_pixels=True)
    for field in fields_of_interest:
        value = getattr(ds, field)
        # Multi-valued attributes are reduced to their first component.
        if isinstance(value, pydicom.multival.MultiValue):
            value = value[0]
        new_df[field].append(value)
for field, values in new_df.items():
    df[field] = np.array(values, dtype=str)


In [None]:
# df = pd.read_csv("data_with_metadata.csv", dtype=dtypes)

In [None]:
# `np.int` was an alias of the builtin `int` and was removed in NumPy 1.24;
# the builtin performs the same conversion.
df["PatientAge"] = df["PatientAge"].astype(int)

In [None]:
import pickle as pkl

In [None]:
# Number of mask pixels per image: 0 when the only entry is '-1', otherwise
# the sum of the merged binary mask.
sums = [
    0 if len(mask_list) == 1 and mask_list[0] == '-1' else wrap_rle2mask(mask_list).sum()
    for mask_list in tqdm(df.EncodedPixels.values)
]
df["mask_sum"] = sums

In [None]:
# Store the column -> dtype mapping next to the CSV.  The context manager
# closes the file once the pickle has been written; passing a bare `open(...)`
# to `pkl.dump` leaves the handle open.
with open("dtypes_data_with_metadata.pkl", "wb") as dtypes_file:
    pkl.dump(df.dtypes.to_frame('dtypes').to_dict()["dtypes"], dtypes_file)
df.to_csv("data_with_metadata.csv", index=False)