# Please do Vote up if you liked my work

My [Linkedin](https://www.linkedin.com/in/letian-dai-phd-physics-nanomaterial-nanoscience-nanotechnology-datascience-bigdata/) <br>
My [Git](https://github.com/daiwofei)

> This notebook explores the chain of relations: initial CT image (image processing) -> volume of the chest -> FVC value -> weeks, combined with the other features such as "Age", "Sex" and "SmokingStatus". The most important and challenging step is to analyze the CT images and estimate the volume of the chest by integrating over the slices.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import glob
import scipy.ndimage as ndimage
from skimage import measure, morphology, segmentation

#plotly
!pip install chart_studio
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

!pip install pydicom
#read the .dcm file
import pydicom


from scipy.stats import probplot, mode

#color
from colorama import Fore, Back, Style

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!conda install -c conda-forge gdcm -y
import gdcm

# 1. Input the data

In [None]:
# Check the list of files or folders in the data source
list(os.listdir("../input/osic-pulmonary-fibrosis-progression"))

In [None]:
# input the data
train_df = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test_df = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

print(Fore.RED + 'Training data shape: ',Style.RESET_ALL,train_df.shape)
print(Fore.BLUE + 'Test data shape: ',Style.RESET_ALL,test_df.shape)

In [None]:
# preview of the train dataframe
train_df.head(5)

In [None]:
# preview of the test dataframe
test_df.head(5)

In [None]:
# Collect the column labels of train_df into a plain list and print them
columns = [*train_df.keys()]
print(Fore.BLUE + "List of columns in the train_df", Fore.RED + "", columns)

In [None]:
# check if there is missing data in the dataframe
# check the null part in the whole data set, red part is missing data, blue is non-null
sns.heatmap(train_df.isnull(),yticklabels=False,cbar=False,cmap='coolwarm')
train_df.isnull().sum()

There is no missing value in train_df and test_df.

In [None]:
# check the null part in the whole data set, red part is missing data, blue is non-null
sns.heatmap(test_df.isnull(),yticklabels=False,cbar=False,cmap='coolwarm')
test_df.isnull().sum()

In [None]:
# check the type of dataframe
train_df.info()

# 2. Exploratory Data Analysis

In [None]:
# Number of rows versus number of distinct patient ids in the train dataframe
n_train_rows = train_df['Patient'].count()
n_train_unique = train_df['Patient'].value_counts().shape[0]

print(
    Fore.WHITE + "In the train_df,",
    Fore.RED + "the total patient ids are",
    Style.RESET_ALL,
    f"{n_train_rows},",
    Fore.BLUE + "from those the unique ids are",
    Style.RESET_ALL,
    f"{n_train_unique}.",
)

In [None]:
# Number of rows versus number of distinct patient ids in the test dataframe
n_test_rows = test_df['Patient'].count()
n_test_unique = test_df['Patient'].value_counts().shape[0]

print(
    Fore.WHITE + "In the test_df,",
    Fore.RED + "the total patient ids are",
    Style.RESET_ALL,
    f"{n_test_rows},",
    Fore.BLUE + "from those the unique ids are",
    Style.RESET_ALL,
    f"{n_test_unique}.",
)

All patients are unique in the test_df. 

In [None]:
# Build the sets of distinct patient ids of both tables
train_patient_ids = set(train_df['Patient'].unique())
test_patient_ids = set(test_df['Patient'].unique())

# Ids that occur in both the test and the train table (shown as the cell output)
test_patient_ids & train_patient_ids

All 5 patients in test_df are also present in train_df.

In [None]:
# Histogram of the number of samples (rows) recorded per patient
samples_per_patient = train_df['Patient'].value_counts()

plt.figure(figsize=(6, 4))
samples_per_patient.hist(alpha=0.5, color='green', label='Samples for the same patient')
plt.legend()

plt.xlabel('Samples for the same patient')
plt.ylabel('Patients')

## It is necessary to organize the data of the same patient and create individual patient dataframe

In [None]:
# Let's verify the features for the same patient, for example the patient with id = ID00419637202311204720264

train_df[train_df['Patient'] == 'ID00419637202311204720264']

In [None]:
# Let's verify the features for the same patient, for example the patient with id = ID00421637202311550012437

train_df[train_df['Patient'] == 'ID00421637202311550012437']

In [None]:
# Let's verify the features for the same patient, for example the patient with id = ID00422637202311677017371

train_df[train_df['Patient'] == 'ID00422637202311677017371']

In [None]:
# Let's verify the features for the same patient, for example the patient with id = ID00423637202312137826377

train_df[train_df['Patient'] == 'ID00423637202312137826377']

In [None]:
# Let's verify the features for the same patient, for example the patient with id = ID00426637202313170790466

train_df[train_df['Patient'] == 'ID00426637202313170790466']

# From the description of the data: "In the dataset, you are provided with a baseline chest CT scan and associated clinical information for a set of patients. A patient has an image acquired at time Week = 0 and has numerous follow up visits over the course of approximately 1-2 years, at which time their FVC is measured." *It should therefore be possible to estimate an initial FVC for each patient at Week = 0.*

In [None]:
# verify the relation between the 'FVC' and the 'Percent' for the same patient, for example the first patient with id = ID00123637202217151272140

sns.regplot(x="FVC", y="Percent", data=train_df[train_df['Patient'] == 'ID00123637202217151272140'])

In [None]:
# Estimate the initial FVC of patient ID00123637202217151272140 from the first
# row stored for that patient: FVC / Percent * 100, rounded to a whole number
patient_rows = train_df[train_df['Patient'] == 'ID00123637202217151272140']
first_fvc = patient_rows['FVC'].iloc[0]
first_percent = patient_rows['Percent'].iloc[0]
initial_FVC = (first_fvc / first_percent * 100).round()

print('The FVC of the patient with id = ID00123637202217151272140 is', Fore.BLUE + '', initial_FVC)

In [None]:
# Estimate the initial FVC of patient ID00009637202177434476278 from the first
# row stored for that patient: FVC / Percent * 100, rounded to a whole number
patient_rows = train_df[train_df['Patient'] == 'ID00009637202177434476278']
first_fvc = patient_rows['FVC'].iloc[0]
first_percent = patient_rows['Percent'].iloc[0]
initial_FVC = (first_fvc / first_percent * 100).round()

print('The FVC of the patient with id = ID00009637202177434476278 is', Fore.BLUE + '', initial_FVC)

## The features "Age", "Sex" and "SmokingStatus" are clearly constant for the same patient.

In [None]:
# Create individual patient dataframe
patient_df = train_df[['Patient', 'Age', 'Sex', 'SmokingStatus']].drop_duplicates()
patient_df.head()

In [None]:
# Build one derived FVC value per patient (later stored as a new column of patient_df).
# For every patient the first row found in train_df is used and rescaled to
# Percent = 100: FVC / Percent * 100, rounded to a whole number.
# NOTE(review): iloc[0] is the first row stored for the patient; the code does not
# check that this row is the Week = 0 measurement - confirm against the data.
Init_FVC = []
for i, ID in enumerate(patient_df['Patient']):
    # filter the patient's rows once instead of once per column
    patient_rows = train_df[train_df['Patient'] == ID]
    temp_FVC = (patient_rows['FVC'].iloc[0] / patient_rows['Percent'].iloc[0] * 100).round()
    Init_FVC.append(temp_FVC)
    # progress output: running index and patient id
    print(i, ID)


In [None]:
# add the initial FVC inside the patient_df 
patient_df['FVC'] = Init_FVC
patient_df.head()

In [None]:
# The corresponding Weeks is 0
patient_df['Weeks'] = 0
patient_df.head()

In [None]:
# The corresponding Percent is 100%
patient_df['Percent'] = 100
patient_df.head()

In [None]:
# check the sex elemnts in histogram
# The Histogram of sex
patient_df['Sex'].value_counts().iplot(kind='bar',yTitle='Counts',xTitle = 'Sex',linecolor='black',opacity=0.7,color='green',theme='pearl',bargap=0.5,
                                       gridcolor='white',title='Distribution of the Sex column in the Unique Patient Set')

In [None]:
# check the SmokingStatus elemnts in histogram
# The Histogram of SmokingStatus
patient_df['SmokingStatus'].value_counts().iplot(kind='bar',yTitle='Counts',xTitle = 'SmokingStatus',linecolor='black',opacity=0.7,color='red',theme='pearl',bargap=0.5,
                                       gridcolor='white',title='Distribution of the SmokingStatus column in the Unique Patient Set')

In [None]:
# Age distribution over the unique patients, drawn as a 30-bin histogram
plt.figure(figsize=(6, 4))
age_values = patient_df['Age']
age_values.hist(bins=30, alpha=0.5, color='blue', label='Age')
plt.legend()

plt.xlabel('Age')
plt.ylabel('Count')

In [None]:
# Distribution of the derived initial FVC over the unique patients (30-bin histogram)
plt.figure(figsize=(6, 4))
fvc_values = patient_df['FVC']
fvc_values.hist(bins=30, alpha=0.5, color='brown', label='FVC(Week=0)')
plt.legend()

plt.xlabel('FVC(Week=0)')
plt.ylabel('Count')

# The string values of "Sex" and "SmokingStatus" need to be converted to integers

In [None]:
# Encode "SmokingStatus" as integers: "Never smoked" = 0, "Ex-smoker" = 1 and
# every other value (e.g. "Currently smokes") = 2
smoking_codes = {'Ex-smoker': 1, 'Never smoked': 0}
Smoking_list = [smoking_codes.get(status, 2) for status in patient_df['SmokingStatus']]

patient_df['SmokingStatus'] = Smoking_list

In [None]:
patient_df.info()

In [None]:
# Encode "Sex" as integers: "Male" = 1, any other value (e.g. "Female") = 0
Sex_list = [int(gender == 'Male') for gender in patient_df['Sex']]

patient_df['Sex'] = Sex_list

## We have now obtained the initial FVC at Week = 0, the time at which the patient's chest CT scan was taken, so the next step is to find the relation between the CT images and the corresponding FVC.

# The test_df gives the initial state of each patient, with the CT scan images in the test folder. The big question that needs to be answered is: "Can we deduce the FVC by analyzing the CT images?"

In the train and test folders, there are multiple `.dcm` files corresponding to different slices of CT scan for each patient measured at Week = 0. CT scans produce 3D volumes consist of 2D slices for the chest which should have direct relation between the FVC at the week = 0. Each scan is a 2D slice which is a `.dcm` file. 

In [None]:
#check the information of file '1.dcm' for the patient with id = ID00228637202259965313869
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00228637202259965313869/1.dcm'
dicom_file = pydicom.dcmread(file_path)

print(f'Patient: ID00228637202259965313869 Image: 1.dcm Dataset\n{"." * 56}\n\n{dicom_file}')

In [None]:
#check the information of file '2.dcm' for the patient with id = ID00228637202259965313869
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00228637202259965313869/2.dcm'
dicom_file = pydicom.dcmread(file_path)

print(f'Patient: ID00228637202259965313869 Image: 2.dcm Dataset\n{"." * 56}\n\n{dicom_file}')

In [None]:
# Check the header of file '1.dcm' for the patient with id = ID00011637202177653955184
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00011637202177653955184/1.dcm'
dicom_file = pydicom.dcmread(file_path)

# The banner names the patient whose file was actually read above
# (the previous text showed a different patient id than the one in file_path)
print(f'Patient: ID00011637202177653955184 Image: 1.dcm Dataset\n{"." * 56}\n\n{dicom_file}')

In [None]:
#check the information of file '1.dcm' for the patient with id = ID00009637202177434476278
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00009637202177434476278/1.dcm'
dicom_file = pydicom.dcmread(file_path)
dicom_file.dir()

In [None]:
# List the slice files of one training patient, ordered by the number in the
# file name, and print how many slices there are
patient_name = 'ID00228637202259965313869'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
print(len(patient_directory))

In [None]:
# Print, slice by slice, the z component of ImagePositionPatient, the SliceLocation,
# the SliceThickness, the SingleCollimationWidth and the TableSpeed so that their
# relation can be compared by eye
patient_name = 'ID00228637202259965313869'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
for name in patient_directory:
    eachslice = pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{name}')
    print(
        eachslice.ImagePositionPatient[2], '  ',
        eachslice.SliceLocation, '   ',
        eachslice.SliceThickness, '   ',
        eachslice.SingleCollimationWidth, '    ',
        eachslice.TableSpeed,
    )

In [None]:
# Same per-slice listing for a second patient: z position, SliceLocation,
# SliceThickness, SingleCollimationWidth and TableSpeed
patient_name = 'ID00422637202311677017371'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
for name in patient_directory:
    eachslice = pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{name}')
    print(
        eachslice.ImagePositionPatient[2], '  ',
        eachslice.SliceLocation, '   ',
        eachslice.SliceThickness, '   ',
        eachslice.SingleCollimationWidth, '    ',
        eachslice.TableSpeed,
    )

It is clear to see in the case of patient with id = 'ID00422637202311677017371'. <br>
The 'ImagePositionPatient[2]' is the same as the 'SliceLocation'. <br>
The 'SliceThickness' is the same as the 'SingleCollimationWidth'.

In [None]:
# Per-slice z position, SliceLocation and SliceThickness for a third patient
patient_name = 'ID00007637202177411956430'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
for name in patient_directory:
    eachslice = pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{name}')
    print(
        eachslice.ImagePositionPatient[2], '  ',
        eachslice.SliceLocation, '   ',
        eachslice.SliceThickness, '   ',
    )

In [None]:
# Count the slice files of the first patient in the test folder
patient_name = 'ID00419637202311204720264'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/test/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
print(len(patient_directory))


In [None]:
# https://www.kaggle.com/schlerp/getting-to-know-dicom-and-the-data
def show_dcm_info(dataset):
    """Print a short summary of a DICOM dataset and display its image.

    Parameters: dataset (pydicom dataset, e.g. the result of pydicom.dcmread)

    Returns: None (the summary is printed; the image, if any, is shown with matplotlib)

    The file name is taken from the dataset's own ``filename`` attribute when it
    has one; otherwise the notebook-level ``file_path`` variable is used, which is
    what this function printed before. The dataset is not modified: the pixel
    spacing is only printed, never overwritten.
    """
    # Prefer the path stored on the dataset so the output cannot go stale when the
    # notebook-level file_path variable points at a different file
    source = getattr(dataset, 'filename', None)
    if source is None:
        source = globals().get('file_path')
    print(Fore.YELLOW + "Filename.........:", Style.RESET_ALL, source)
    print()

    pat_name = dataset.PatientName
    display_name = pat_name.family_name + ", " + pat_name.given_name
    print(Fore.BLUE + "Patient's name......:", Style.RESET_ALL, display_name)
    print(Fore.BLUE + "Patient id..........:", Style.RESET_ALL, dataset.PatientID)
    print(Fore.BLUE + "Patient's Sex.......:", Style.RESET_ALL, dataset.PatientSex)
    print(Fore.YELLOW + "Modality............:", Style.RESET_ALL, dataset.Modality)
    print(Fore.GREEN + "Body Part Examined..:", Style.RESET_ALL, dataset.BodyPartExamined)

    # Image details are only available when the dataset carries pixel data
    if 'PixelData' in dataset:
        rows = int(dataset.Rows)
        cols = int(dataset.Columns)
        print(Fore.BLUE + "Image size.......:", Style.RESET_ALL, " {rows:d} x {cols:d}, {size:d} bytes".format(
            rows=rows, cols=cols, size=len(dataset.PixelData)))
        if 'PixelSpacing' in dataset:
            # report only; PixelSpacing is needed unchanged for later area/volume estimates
            print(Fore.YELLOW + "Pixel spacing....:", Style.RESET_ALL, dataset.PixelSpacing)
        plt.figure(figsize=(10, 10))
        plt.imshow(dataset.pixel_array, cmap='gray')
        plt.show()

In [None]:
# Read slice 10.dcm of the same patient from the train and the test folder and
# show both images side by side
patient_name = 'ID00419637202311204720264'
train_file_path = (f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/10.dcm')
test_file_path = (f'../input/osic-pulmonary-fibrosis-progression/test/{patient_name}/10.dcm')

train_dataset = pydicom.dcmread(train_file_path)
test_dataset = pydicom.dcmread(test_file_path)

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(12, 12))

panels = (
    (ax1, train_dataset, "Train file"),
    (ax2, test_dataset, "Test file"),
)
for axis, dcm, title in panels:
    axis.imshow(dcm.pixel_array, cmap='gray')
    axis.set_title(title)
    axis.axis('off')

plt.show()

# The CT files of the same patient are identical in the train and test folders.

In [None]:
# Per-slice z position and SliceThickness for patient ID00007637202177411956430
patient_name = 'ID00007637202177411956430'
patient_directory = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
patient_directory = sorted(patient_directory, key=lambda f: int(f.split('.')[0]))
for name in patient_directory:
    eachslice = pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{name}')
    z_position = eachslice.ImagePositionPatient[2]
    print(z_position, '  ', eachslice.SliceThickness)


In [None]:
patient_df2 = patient_df.copy()

def get_metadata(patient_name):
    """
    Copies header fields of a patient's first slice into patient_df2.

    The first slice is the file with the smallest number in its name inside the
    patient's train folder. Every listed header field that the slice contains is
    written to the patient's row(s) of the notebook-level patient_df2, followed by
    the first PixelSpacing value and, when present, the three
    ImagePositionPatient components.

    Parameters: patient_name (Patient id, also the folder name)

    Returns: None (patient_df2 is modified in place)
    """
    slice_files = sorted(os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')
                         , key=(lambda f: int(f.split('.')[0])))
    first_slice = pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{slice_files[0]}')

    # rows of patient_df2 that belong to this patient
    patient_rows = patient_df2['Patient'] == patient_name
    # element keywords available in the first slice
    available = first_slice.dir()

    wanted = (
        'Manufacturer', 'TotalCollimationWidth', 'SingleCollimationWidth', 'TableSpeed', 'KVP',
        'Columns', 'Rows', 'DistanceSourceToDetector', 'DistanceSourceToPatient', 'GeneratorPower',
        'HighBit', 'PixelRepresentation', 'SliceLocation', 'SliceThickness', 'TableHeight',
        'RevolutionTime', 'PatientPosition', 'XRayTubeCurrent',
    )
    for feature in wanted:
        if feature not in available:
            continue
        patient_df2.loc[patient_rows, feature] = first_slice.get(feature)

    # only the first of the PixelSpacing values is stored
    patient_df2.loc[patient_rows, 'PixelSpacing'] = first_slice.PixelSpacing[0]

    if 'ImagePositionPatient' in available:
        for position, axis_label in enumerate('XYZ'):
            column = 'ImagePositionPatient_' + axis_label
            patient_df2.loc[patient_rows, column] = first_slice.ImagePositionPatient[position]

In [None]:
for patient in patient_df2['Patient']:
    get_metadata(patient)

patient_df2

In [None]:
# check the Manufacturer distribution in histogram
fig = px.histogram(patient_df2, x="Manufacturer")
fig.show()

In [None]:
# check the SingleCollimationWidth distribution in a scatter plot
fig = px.scatter(patient_df2, x="SingleCollimationWidth")
fig.show()

In [None]:
# check the TotalCollimationWidth distribution in a scatter plot
fig = px.scatter(patient_df2, x="TotalCollimationWidth")
fig.show()

In [None]:
# check the SliceThickness distribution in a scatter plot
fig = px.scatter(patient_df2, x="SliceThickness")
fig.show()

In [None]:
# check the RevolutionTime distribution in histogram
fig = px.histogram(patient_df2, x="RevolutionTime")
fig.show()


In [None]:
# check the TableSpeed distribution in scatter
fig = px.scatter(patient_df2, x="TableSpeed")
fig.show()

In [None]:
# check the PatientPosition distribution in scatter
fig = px.scatter(patient_df2, x="PatientPosition")
fig.show()


In [None]:
# check the SliceLocation distribution in scatter
fig = px.scatter(patient_df2, x="SliceLocation")
fig.show()

# It is necessary to get the area of the lung in each slice and integrate over the slices to obtain the volume of the whole chest.

One method of obtaining the area of the lung in each slice is to use the Marker-Controlled Watershed approach. https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed

In [None]:
# https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed
def load_scan(path):
    """
    Loads all scans from a folder into a list ordered by InstanceNumber.

    Parameters: path (Folder path that contains the slice files of one scan)

    Returns: slices (List of pydicom datasets, sorted by int(InstanceNumber))
    """

    # pydicom.dcmread replaces the deprecated pydicom.read_file alias and is the
    # reader used everywhere else in this notebook
    slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
    # os.listdir returns the files in arbitrary order, so sort them explicitly
    slices.sort(key=lambda x: int(x.InstanceNumber))

    return slices

In [None]:
# https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed
def get_pixels_hu(scans):
    """
    Stacks the pixel arrays of all scans and rescales them to Hounsfield Units (HU).

    Parameters: scans (Sequence of slices exposing pixel_array, RescaleSlope, RescaleIntercept)

    Returns: volume (NumPy int16 array, one entry per slice along the first axis)
    """

    volume = np.stack([scan.pixel_array for scan in scans]).astype(np.int16)

    # Pixels outside the circular scan area are stored as -2000; reset them to 0
    volume[volume == -2000] = 0

    # Linear rescale HU = slope * pixel + intercept, using the values of the first slice
    reference = scans[0]
    rescale_intercept = reference.RescaleIntercept
    rescale_slope = reference.RescaleSlope

    if rescale_slope != 1:
        scaled = rescale_slope * volume.astype(np.float64)
        volume = scaled.astype(np.int16)

    volume += np.int16(rescale_intercept)

    return np.array(volume, dtype=np.int16)

In [None]:
# Load the scan of the patient folder at position 24 of the alphabetically
# sorted train folder and convert its slices to Hounsfield Units
INPUT_FOLDER = '/kaggle/input/osic-pulmonary-fibrosis-progression/train/'

patients = sorted(os.listdir(INPUT_FOLDER))
test_patient_scans = load_scan(INPUT_FOLDER + patients[24])
test_patient_images = get_pixels_hu(test_patient_scans)

In [None]:
test_patient_scans[0]

In [None]:
plt.imshow(test_patient_images[12], cmap='gray')
plt.title("Original Slice")
plt.show()

In [None]:
# https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed
def generate_markers(image):
    """
    Generates the watershed markers for a single 2D slice.

    Parameters: image (2D array; pixels below -400 are treated as lung candidates)

    Returns: Internal Marker (bool array), External Marker (bool array),
             Watershed Marker (int array: 255 on the internal marker, 128 on the external one)

    All returned arrays have the shape of the input image.
    """

    # Creation of the internal Marker: threshold, drop regions touching the border,
    # then label the connected regions
    marker_internal = image < -400
    marker_internal = segmentation.clear_border(marker_internal)
    marker_internal_labels = measure.label(marker_internal)

    regions = measure.regionprops(marker_internal_labels)
    areas = sorted(r.area for r in regions)

    # Keep only the regions at least as large as the second largest one
    if len(areas) > 2:
        for region in regions:
            if region.area < areas[-2]:
                # clear all pixels of the region at once instead of one by one
                coords = region.coords
                marker_internal_labels[coords[:, 0], coords[:, 1]] = 0

    marker_internal = marker_internal_labels > 0

    # Creation of the External Marker: the ring between a small and a large dilation
    external_a = ndimage.binary_dilation(marker_internal, iterations=10)
    external_b = ndimage.binary_dilation(marker_internal, iterations=55)
    marker_external = external_b ^ external_a

    # Creation of the Watershed Marker. The array follows the shape of the input
    # slice (not every scan is 512 x 512) and uses the builtin int dtype because
    # the np.int alias no longer exists in current NumPy releases.
    marker_watershed = np.zeros(image.shape, dtype=int)
    marker_watershed += marker_internal * 255
    marker_watershed += marker_external * 128

    return marker_internal, marker_external, marker_watershed

In [None]:
# https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed
# Compute the three markers of slice 12 and show them side by side
test_patient_internal, test_patient_external, test_patient_watershed = generate_markers(test_patient_images[12])

f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15, 15))

panels = (
    (ax1, test_patient_internal, "Internal Marker"),
    (ax2, test_patient_external, "External Marker"),
    (ax3, test_patient_watershed, "Watershed Marker"),
)
for axis, marker, title in panels:
    axis.imshow(marker, cmap='gray')
    axis.set_title(title)
    axis.axis('off')

plt.show()

# I like the `Internal Marker`. The area can be calculated as the number of `white` pixels multiplied by the square of `PixelSpacing`

In [None]:
test_patient_internal

In [None]:
test_patient_internal.shape

In [None]:
test_patient_internal_array = np.ravel(test_patient_internal)

In [None]:
test_patient_internal_array.shape

In [None]:
# Count the True ("white") pixels of the flattened internal marker in one
# vectorised call instead of a Python loop over every pixel
number_white_pixel = int(np.count_nonzero(test_patient_internal_array))
print ('The number of white pixels is ', number_white_pixel)
# The share is taken relative to the real pixel count of the slice rather than a
# hard-coded 512 * 512 = 262144, so it stays correct for other image sizes
print ('The percentage of white pixels in the dark image is {:0.2f}'.format( number_white_pixel/test_patient_internal_array.size*100), '%')

In [None]:
# the pixel spacing of patient ID00062637202188654068490
print (Fore.BLUE + "The pixel spacing of patient ID00062637202188654068490 is", Style.RESET_ALL,test_patient_scans[0].PixelSpacing[0])

In [None]:
# The area of chest in this slice

print (Fore.RED + "The area of chest in the slice 12.dcm of patient ID00062637202188654068490 is", Style.RESET_ALL,(test_patient_scans[0].PixelSpacing[0])*(test_patient_scans[0].PixelSpacing[0])*number_white_pixel)

In [None]:
# only return the internal area
def only_internal(image):
    """
    Generates only the internal marker for a given 2D slice.

    Parameters: image (2D array; pixels below -400 are treated as lung candidates)

    Returns: Internal Marker (bool array with the shape of the input image)
    """

    # Threshold, drop regions touching the border, then label the connected regions
    marker_internal = image < -400
    marker_internal = segmentation.clear_border(marker_internal)
    marker_internal_labels = measure.label(marker_internal)

    regions = measure.regionprops(marker_internal_labels)
    areas = sorted(r.area for r in regions)

    # Keep only the regions at least as large as the second largest one
    if len(areas) > 2:
        for region in regions:
            if region.area < areas[-2]:
                # clear all pixels of the region at once instead of one by one
                coords = region.coords
                marker_internal_labels[coords[:, 0], coords[:, 1]] = 0

    marker_internal = marker_internal_labels > 0

    return marker_internal

In [None]:
# Approximate chest volume of the scan held in test_patient_scans / test_patient_images
# (the original comment names patient ID00062637202188654068490 - TODO confirm)

# total number of white (True) pixels of the internal marker over all slices
number_white_pixel = 0
for image in test_patient_images:
    number_white_pixel += int(np.count_nonzero(only_internal(image)))

# the thickness of chest for each slice (useful for the integration of the whole chest):
# distance between the first two slices, from ImagePositionPatient when available,
# otherwise from SliceLocation
try:
    slice_thickness = np.abs(test_patient_scans[0].ImagePositionPatient[2] - test_patient_scans[1].ImagePositionPatient[2])
except (AttributeError, TypeError):
    # only a missing / empty ImagePositionPatient triggers the fallback; other
    # errors are no longer silently swallowed
    slice_thickness = np.abs(test_patient_scans[0].SliceLocation - test_patient_scans[1].SliceLocation)

# the area of each white pixel
eachpixelarea = test_patient_scans[0].PixelSpacing[0] * test_patient_scans[0].PixelSpacing[0]
# approximate volume of the whole 3D chest from CT images
volume_chest = eachpixelarea*slice_thickness*number_white_pixel

print('The whole 3D chest volume is about', volume_chest)

In [None]:
# get the 3D volume of chest
patient_df3 = patient_df.copy()

In [None]:
def get_3D_volume(patient_name):
    """
    Estimates the chest volume of one training patient and stores it in patient_df3.

    The volume is approximated as
        (number of internal-marker pixels over all slices)
        * (area of one pixel) * (distance between two slices).

    Parameters: patient_name (Patient id, also the folder name inside the train folder)

    Returns: volume_chest (the value that is also written to the 'ChestVolume'
             column of the patient's row(s) in the notebook-level patient_df3)
    """
    #path
    INPUT_FOLDER = '/kaggle/input/osic-pulmonary-fibrosis-progression/train/'
    test_patient_scans = load_scan(INPUT_FOLDER + patient_name)
    test_patient_images = get_pixels_hu(test_patient_scans)

    # Count the white pixels of every slice with a vectorised call; looping over
    # each pixel in Python is what made this step too slow to run for all patients
    number_white_pixel = 0
    for image in test_patient_images:
        number_white_pixel += int(np.count_nonzero(only_internal(image)))

    # the thickness of chest for each slice (useful for the integration of the whole chest)
    first_scan = test_patient_scans[0]
    tags = first_scan.dir()
    # a distance between slices needs a second slice; single-slice scans fall
    # through to the SliceThickness tag instead of raising IndexError
    has_second_slice = len(test_patient_scans) > 1

    if has_second_slice and 'ImagePositionPatient' in tags:
        slice_thickness = np.abs(first_scan.ImagePositionPatient[2] - test_patient_scans[1].ImagePositionPatient[2])

    elif has_second_slice and 'SliceLocation' in tags:
        slice_thickness = np.abs(first_scan.SliceLocation - test_patient_scans[1].SliceLocation)

    elif 'SliceThickness' in tags:
        slice_thickness = first_scan.SliceThickness

    else:
        slice_thickness = 1

    # the area of each white pixel
    eachpixelarea = first_scan.PixelSpacing[0] * first_scan.PixelSpacing[0]
    # approximate volume of the whole 3D chest from CT images
    volume_chest = eachpixelarea*slice_thickness*number_white_pixel

    #create a new feature 'ChestVolume'
    # the row mask is built from patient_df3 itself, the frame that is written to
    patient_df3.loc[patient_df3['Patient'] == patient_name, 'ChestVolume'] = volume_chest

    return volume_chest

In [None]:
patient_df3.to_csv('patient_df3.csv', index=False)

In [None]:
patient_df4 = pd.read_csv('patient_df3.csv')
patient_df4

In [None]:
# There are too many patients to process in one go, so the first 70 patient ids
# are split into seven consecutive chunks of 10
patient_ids = patient_df4['Patient']
(
    patient_df_1,
    patient_df_2,
    patient_df_3,
    patient_df_4,
    patient_df_5,
    patient_df_6,
    patient_df_7,
) = (patient_ids[start:start + 10] for start in range(0, 70, 10))

In [None]:
for patient in patient_df_7:
    get_3D_volume(patient)

patient_df3

In [None]:
patient_df3[60:70]

In [None]:
# verify the relation between the 'FVC' at the week 0 and the 'ChestVolume' across the patients in patient_df3

sns.regplot(x="FVC", y="ChestVolume", data=patient_df3)

In [None]:

#check the information of file '1.dcm' for the patient with id = ID00052637202186188008618
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00052637202186188008618/1.dcm'
dicom_file = pydicom.dcmread(file_path)
dicom_file.dir()
# the thickness of chest for each slice (useful for the integration of the whole chest)
if 'SliceThickness' in dicom_file.dir():
    slice_thickness = dicom_file.SliceThickness

In [None]:
    # the thickness of chest for each slice (useful for the integration of the whole chest)
    # Priority in this cell: the SliceThickness tag first, then the SliceLocation
    # difference of the first two slices, then the z difference of their
    # ImagePositionPatient, and 1 as the last resort.
    # NOTE(review): get_3D_volume checks the same three tags in the opposite order
    # (ImagePositionPatient first) - confirm which priority is intended.
    # NOTE(review): the cell is indented like a function body and reads the
    # notebook-level test_patient_scans list.
    if 'SliceThickness' in test_patient_scans[0].dir():
        slice_thickness = test_patient_scans[0].SliceThickness
        
    elif 'SliceLocation' in test_patient_scans[0].dir():
        slice_thickness = np.abs(test_patient_scans[0].SliceLocation - test_patient_scans[1].SliceLocation)
        
    elif 'ImagePositionPatient' in test_patient_scans[0].dir():
        slice_thickness = np.abs(test_patient_scans[0].ImagePositionPatient[2] - test_patient_scans[1].ImagePositionPatient[2])
       
    else:
        slice_thickness = 1

In [None]:
# https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed
def get_pixels_hu(scans):
    """
    Builds an int16 volume in Hounsfield Units (HU) from the raw slices.

    Parameters: scans (Sequence of slices exposing pixel_array, RescaleSlope, RescaleIntercept)

    Returns: hu_volume (NumPy int16 array, one entry per slice along the first axis)
    """

    raw_slices = [scan.pixel_array for scan in scans]
    hu_volume = np.stack(raw_slices).astype(np.int16)

    # The value -2000 marks pixels outside the circular scan area; treat them as 0
    outside_scan = hu_volume == -2000
    hu_volume[outside_scan] = 0

    # HU = slope * pixel + intercept, with both values read from the first slice
    first = scans[0]
    intercept = first.RescaleIntercept
    slope = first.RescaleSlope

    if slope != 1:
        hu_volume = (slope * hu_volume.astype(np.float64)).astype(np.int16)

    hu_volume += np.int16(intercept)

    return np.array(hu_volume, dtype=np.int16)

In [None]:
!pip install virtualenv virtualenvwrapper

In [None]:
!conda install -c conda-forge gdcm -y

In [None]:
patient_name = 'ID00052637202186188008618'
INPUT_FOLDER = '/kaggle/input/osic-pulmonary-fibrosis-progression/train/'
test_patient_scans = load_scan(INPUT_FOLDER + patient_name)
import gdcm
test_patient_scans[-1].pixel_array

In [None]:
test_patient_images = get_pixels_hu(test_patient_scans)
plt.imshow(test_patient_images[12], cmap='gray')
plt.title("Original Slice")
plt.show()