In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, so the available datasets are visible at the top of the notebook.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Pulmonary embolism**

Pulmonary embolism is a blockage in one of the pulmonary arteries in your lungs. In most cases, pulmonary embolism is caused by blood clots that travel to the lungs from deep veins in the legs or, rarely, from veins in other parts of the body (deep vein thrombosis).

Because the clots block blood flow to the lungs, pulmonary embolism can be life-threatening. However, prompt treatment greatly reduces the risk of death. Taking measures to prevent blood clots in your legs will help protect you against pulmonary embolism.

# **Symptoms**

Pulmonary embolism symptoms can vary greatly, depending on how much of your lung is involved, the size of the clots, and whether you have underlying lung or heart disease.


Common signs and symptoms include:

Shortness of breath. This symptom typically appears suddenly and always gets worse with exertion.
Chest pain. You may feel like you're having a heart attack. The pain is often sharp and felt when you breathe in deeply, often stopping you from being able to take a deep breath. It can also be felt when you cough, bend or stoop.
Cough. The cough may produce bloody or blood-streaked sputum.
Other signs and symptoms that can occur with pulmonary embolism include:

Rapid or irregular heartbeat
Lightheadedness or dizziness
Excessive sweating
Fever
Leg pain or swelling, or both, usually in the calf caused by a deep vein thrombosis
Clammy or discolored skin (cyanosis)

In [None]:
import numpy as np
import pydicom
import os
import matplotlib.pyplot as plt
from glob import glob
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import scipy.ndimage
from skimage import morphology
from skimage import measure
from skimage.transform import resize
from sklearn.cluster import KMeans
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from plotly.graph_objs import *
init_notebook_mode(connected=True)
import pandas as pd
from tqdm import tqdm
import seaborn as sns

In [None]:
# Root of the competition dataset. Every other location is derived from this
# single constant so the notebook can be re-pointed by editing one line.
PATH = "../input/rsna-str-pulmonary-embolism-detection/"
TRAIN_PATH = PATH + "train/"
TEST_PATH = PATH + "test/"

# Tabular metadata: labels for train, identifiers for test, and the submission
# template.
df_train = pd.read_csv(PATH + "train.csv")
df_test = pd.read_csv(PATH + "test.csv")
sub = pd.read_csv(PATH + "sample_submission.csv")

# DICOM files sit three directory levels below each split directory
# (presumably study / series / image -- confirm against the dataset layout).
# TRAIN_PATH and TEST_PATH already end with "/", so the pattern must not start
# with one. glob() returns matches in arbitrary order, so the lists are sorted
# to make later slices such as `train_image_file_paths[0:10]` reproducible.
train_image_file_paths = sorted(glob(TRAIN_PATH + '*/*/*.dcm'))
test_image_file_paths = sorted(glob(TEST_PATH + '*/*/*.dcm'))

In [None]:
# Preview the first five rows of the training metadata (5 is head()'s default).
df_train.head()

We see that there are 17 fields in total. The first three - StudyInstanceUID, SeriesInstanceUID and SOPInstanceUID - are string identifiers (IDs), and the remaining 14 are int64 columns that essentially hold boolean data. Now let's have a look at the actual data itself.

In [None]:
# Preview the first five rows of the test metadata.
df_test.head(5)

# Data fields


•	StudyInstanceUID - unique ID for each study (exam) in the data.

•	SeriesInstanceUID - unique ID for each series within the study.

•	SOPInstanceUID - unique ID for each image within the study (and data).

•	pe_present_on_image - image-level, notes whether any form of PE is present on the image.

•	negative_exam_for_pe - exam-level, indicates that none of the images in the study have PE present (1 = the exam is negative for PE).

•	qa_motion - informational, indicates whether radiologists noted an issue with motion in the study.

•	qa_contrast - informational, indicates whether radiologists noted an issue with contrast in the study.

•	flow_artifact - informational

•	rv_lv_ratio_gte_1 - exam-level, indicates whether the RV/LV ratio present in the study is >= 1

•	rv_lv_ratio_lt_1 - exam-level, indicates whether the RV/LV ratio present in the study is < 1

•	leftsided_pe - exam-level, indicates that there is PE present on the left side of the images in the study

•	chronic_pe - exam-level, indicates that the PE in the study is chronic

•	true_filling_defect_not_pe - informational, indicates a defect that is NOT PE

•	rightsided_pe - exam-level, indicates that there is PE present on the right side of the images in the study

•	acute_and_chronic_pe - exam-level, indicates that the PE present in the study is both acute AND chronic

•	central_pe - exam-level, indicates that there is PE present in the center of the images in the study

•	indeterminate - exam-level, indicates that while the study is not negative for PE, an ultimate set of exam-level labels could not be created, due to QA issues


The DICOM files contain a lot of information in addition to the raw pixel values. If you want an in-depth look at reading DICOM files, feel free to search on Google.

In [None]:
# (rows, columns) of the training metadata table.
df_train.shape


In [None]:
# (rows, columns) of the test metadata table.
df_test.shape

# File descriptions in the given dataset are as follows:

test - all test images directory

train - all train images directory (note that your submission kernels will NOT have access to this set of images, so you must build your models elsewhere and incorporate them into your submissions)

sample_submission.csv - contains rows for each UID+label combination that requires a prediction. Therefore it has a row for each image (for which you will be predicting the existence of a pulmonary embolism within the image) and row for each study+label that requires a study-level prediction.

train.csv - contains UIDs and all labels.

test.csv - contains UIDs

In [None]:
# Load the submission template and show its first five rows to see the
# expected output format.
sample_submission = pd.read_csv(
    "../input/rsna-str-pulmonary-embolism-detection/sample_submission.csv"
)
sample_submission.head(5)

# **Submission size calculation for correctness:**

Here among the labels given in the training data, pe_present_on_image is the image level feature that needs to be predicted for all the images.


The rest of the labels are predicted only once per exam. Each exam contains multiple images, but for the whole group we submit only one set of predictions for the following labels:

•	negative_exam_for_pe

•	rv_lv_ratio_gte_1

•	rv_lv_ratio_lt_1

•	leftsided_pe

•	chronic_pe

•	rightsided_pe

•	acute_and_chronic_pe

•	central_pe

•	indeterminate


For each image we have to predict the property pe_present_on_image, which indicates whether Pulmonary Embolism (PE) is present in the image.

In [None]:
# Class balance of the image-level target: number of images per value of
# `pe_present_on_image`, drawn as a horizontal bar chart.
pe_image_counts = df_train["pe_present_on_image"].value_counts()

ax = pe_image_counts.plot(kind="barh")
# Label the axes explicitly so the figure can be read on its own.
ax.set_xlabel("Number of images")
ax.set_ylabel("pe_present_on_image")
ax.set_title("Image count per pe_present_on_image value");

In [None]:
# Draw a pie chart about pe_present_on_image.
# value_counts() orders its result by frequency, not by class value, so the
# wedge labels are read from the result's index instead of being hard-coded.
# This keeps every wedge paired with the class it actually counts.
pe_image_share = df_train["pe_present_on_image"].value_counts()
plt.pie(
    pe_image_share,
    labels=[str(value) for value in pe_image_share.index],
    autopct="%.1f%%",
)
plt.title("Ratio of pe_present_on_image")
plt.show()

In [None]:
# Draw a pie chart about negative_exam_for_pe.
# value_counts() sorts by frequency, so the first wedge belongs to whichever
# class is most common. Hard-coding the labels as ["0", "1"] swaps them
# whenever class 1 is the majority, so the labels are taken from the index.
negative_exam_share = df_train["negative_exam_for_pe"].value_counts()
plt.pie(
    negative_exam_share,
    labels=[str(value) for value in negative_exam_share.index],
    autopct="%.1f%%",
)
plt.title("Ratio of negative_exam_for_pe")
plt.show()

In [None]:
# Count the rows per value of `negative_exam_for_pe`, print the raw counts,
# then show the same counts as a horizontal bar chart.
x = df_train["negative_exam_for_pe"].value_counts()
print(x)
x.plot.barh()

In [None]:
# Read one example DICOM file so its metadata can be displayed.
# `dcm_file` must be assigned here: nothing earlier in the notebook defines it,
# so displaying it directly fails on a fresh kernel.
# Fall back to the test images when no training images are available.
example_dcm_path = (train_image_file_paths or test_image_file_paths)[0]
dcm_file = pydicom.dcmread(example_dcm_path)
dcm_file

Among this large number of parameters, there are several that a radiologist must have a good understanding of. Some of those parameters are listed below.

Field Code	Variable Name	Value

(0020, 0013)	Instance Number	IS: "40"

(0028, 0030)	Pixel Spacing	DS: [0.871094,0.871094]

(0028, 1050)	Window Center	DS: "40.0"

(0028, 1051)	Window Width	DS: "400.0"

(0028, 1052)	Rescale Intercept	DS: "-1024.0"

(0028, 1053)	Rescale Slope	DS: "1.0"

These parameters are essential for preprocessing the CT scans. Without them it is very difficult to fully utilize the potential of the CT scans.

In [None]:
# Compare pixel-value distributions of the first 10 training images before
# (top) and after (bottom) applying each file's DICOM rescale slope/intercept.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
for file in train_image_file_paths[0:10]:
    # dcmread is the current pydicom entry point; read_file is a deprecated alias.
    dataset = pydicom.dcmread(file)
    # Flatten once: the distribution plot only needs a 1-D array of values.
    image = dataset.pixel_array.flatten()
    # Linear rescale stored in the file header (for CT this normally yields
    # Hounsfield units -- confirm for this dataset).
    rescaled_image = image * dataset.RescaleSlope + dataset.RescaleIntercept
    sns.distplot(image, ax=ax[0])
    sns.distplot(rescaled_image, ax=ax[1])
ax[0].set_title("Raw pixel array distributions for 10 examples")
ax[1].set_title("Rescaled pixel array distributions for 10 examples");

In [None]:
# View the correlation heat map
# Only the numeric columns are correlated: the UID columns are strings, and
# DataFrame.corr raises on non-numeric columns in recent pandas versions.
numeric_columns = df_train.select_dtypes(include="number")
corr_mat = numeric_columns.corr(method='pearson')

# A larger canvas keeps the per-cell annotations legible.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_mat,
            vmin=-1.0,
            vmax=1.0,
            center=0,
            annot=True, # True: display the value inside each cell
            fmt='.1f',
            xticklabels=corr_mat.columns.values,
            yticklabels=corr_mat.columns.values,
            ax=ax_corr
           )
ax_corr.set_title("Pearson correlation between label columns")
plt.show()