# EDA (Exploratory Data Analysis)
## OSIC Pulmonary Fibrosis Progression Competition

In [None]:
!pip install git+https://github.com/fastai/fastai2

### Packages

In [None]:
# Numerics, tabular data, directory listing and plotting.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

# NOTE(review): `Path`, `Path.ls()` and `pd.DataFrame.from_dicoms` are used in later cells without an
# explicit import, so they presumably come from these star imports - confirm before narrowing them.
from fastai2.basics           import *
from fastai2.medical.imaging  import *

import pydicom
# NOTE(review): second alias for matplotlib.pyplot; `plot` is not referenced in the cells visible
# here (they all use `plt`).
import matplotlib.pyplot as plot

# Signals that the import cell ran to completion.
print('Done!')

In [None]:
# Root of the competition data. Kept both as a plain string (later cells build paths by string
# concatenation) and as a Path (for `/` joins and `.ls()`), derived from one literal so the two
# can never drift apart.
base_dir = '../input/osic-pulmonary-fibrosis-progression'
base_path = Path(base_dir)

# TODO: use the DICOM metadata explored below (per patient) to look at the average, max and min
# deviation of pixel lightness/darkness, and find outliers that may or may not be useful.
train_sample_path = base_path/'train'

# `.ls()` is not part of pathlib; it is presumably patched onto Path by the fastai2/fastcore star
# import and returns the entries of a directory.
# List the train directory itself (not the dataset root, which would also yield non-patient
# entries): later cells treat every item as one patient's folder of DICOM files.
# Sorting makes the sample below reproducible, since directory listing order is arbitrary.
fns_trn = sorted(train_sample_path.ls())

# Small development sample so the metadata cell runs quickly; use `fns_trn` for the full dataset.
trn_short = fns_trn[:5]

# TODO: work on printing the DICOM images using pyplot

Are certain patients over- or under-represented in the dataset? How many DICOM files does each patient have?

In [None]:
# Collect DICOM metadata for every sampled patient folder.
#   dcm_per_patient : number of files found in each folder, in folder order
#   df_trn          : one row per DICOM file, all folders stacked together
dcm_per_patient = []
folder_frames = []

for folder in trn_short:
    # Skip stray files: only a directory can hold a patient's DICOM series.
    if not folder.is_dir():
        continue
    folder_contents = folder.ls()
    dcm_per_patient.append(len(folder_contents))
    # px_summ=False is passed through unchanged from the original call.
    folder_frames.append(pd.DataFrame.from_dicoms(folder_contents, px_summ=False))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# inside the loop re-copies all accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read.
df_trn = pd.concat(folder_frames, ignore_index=True) if folder_frames else pd.DataFrame()

In [None]:
# Summary statistics for the number of DICOM files per patient.
n_patients = len(dcm_per_patient)

# total number of patients
print('There are ' + str(n_patients) + ' total patients.')
# mean
print(sum(dcm_per_patient) / n_patients)
# median - np.median sorts internally; indexing the unsorted list at its midpoint is not a median
print(np.median(dcm_per_patient))

# Left: full distribution. Right: zoom on patients with few files, in bins of width 10.
dcm_plt, (dcm_subplt1, dcm_subplt2) = plt.subplots(1, 2, figsize=(16, 8))
dcm_subplt1.set(title='Distribution of DICOM files per patient',
                xlabel='Number of Files', ylabel='Number of Patients')
dcm_subplt2.set(title='Patients with Less than 100 DICOM files',
                xlabel='Number of Files', ylabel='Number of Patients')
dcm_subplt1.hist(dcm_per_patient)
dcm_subplt2.hist(dcm_per_patient, bins=list(range(0, 101, 10)))
dcm_subplt2.set_xlim([0, 100])

# Report real counts instead of the literal "x" placeholder.
n_under_100 = sum(count < 100 for count in dcm_per_patient)
n_over_500 = sum(count > 500 for count in dcm_per_patient)
print(f"There are {n_under_100} patients with less than 100 DICOM files.")
print(f"There are {n_over_500} patients with greater than 500 DICOM files.")

In [None]:
# Tabular training data: read train.csv from the dataset root defined in the configuration cell,
# rather than repeating the path literal.
train_path = base_path/'train.csv'
train_csv = pd.read_csv(train_path)

# The former `train_csv.pivot_table(index='Patient')` call was dropped: its result was neither
# stored nor displayed (it was not the last expression of the cell), and aggregating the
# non-numeric columns with the default mean can raise on recent pandas versions.

# Distinct patient identifiers, in order of first appearance in the CSV.
patient_ids = train_csv['Patient'].unique()

In [None]:
# One row per patient: keep the first CSV row recorded for each Patient id.
# This replaces a loop that filtered the frame once per patient and grew the result with
# DataFrame.append (removed in pandas 2.0). drop_duplicates selects the same rows in a single
# pass and keeps the original row labels, column order and dtypes.
categorical_df = train_csv.drop_duplicates(subset='Patient', keep='first')

In [None]:
# Two-level sunburst: inner ring is Sex, outer ring is SmokingStatus within each sex.
fig = px.sunburst(
    data_frame=categorical_df,
    path=['Sex', 'SmokingStatus'],
    title='Percentage of Patients by Sex and Smoking Status',
    color_discrete_sequence=["#247BA0", "#70C1B3"],
)
# Label each wedge with its name and its share of the parent wedge.
fig.update_traces(textinfo='label+percent parent')
fig.show()


In [None]:
def load_scans(dcm_path):
    """Read every file in a folder with pydicom and return the slices ordered along z.

    Parameters
    ----------
    dcm_path : str or os.PathLike
        Directory whose entries are all DICOM files (one patient's scan).

    Returns
    -------
    list
        The objects returned by ``pydicom.dcmread``, sorted in ascending order of the third
        component of ``ImagePositionPatient``. An empty folder yields an empty list.

    Notes
    -----
    Errors from ``pydicom.dcmread`` or from a slice lacking ``ImagePositionPatient`` are not
    caught and propagate to the caller.
    """
    import os  # local import: the notebook itself only imports `listdir` from os

    # Accept Path objects as well as strings (string concatenation failed for Path inputs).
    folder = os.fspath(dcm_path)
    # listdir order is arbitrary; sorting the names first makes the (stable) z-sort below
    # deterministic when two slices share the same z position.
    slices = [pydicom.dcmread(os.path.join(folder, name)) for name in sorted(listdir(folder))]
    slices.sort(key=lambda ds: float(ds.ImagePositionPatient[2]))
    return slices

# Folder of the first patient listed in train.csv, built as <base_dir>/train/<patient id>.
sample = '/'.join([base_dir, 'train', patient_ids[0]])
# All slices of that patient's scan, ordered by load_scans.
dcms = load_scans(sample)

In [None]:
# Show the first slice returned by load_scans next to a histogram of its raw pixel values.
# NOTE(review): four axes are created but only ax[0] and ax[1] are filled in the lines visible
# here - the remaining panels may be populated further down in this cell.
# NOTE(review): `fig` rebinds the name used for the plotly sunburst figure in an earlier cell.
fig, ax = plt.subplots(1,4,figsize=(20,3))
ax[0].set_title("Original CT-scan")
ax[0].imshow(dcms[0].pixel_array, cmap="bone")
ax[1].set_title("Pixelarray distribution")
# flatten() collapses the pixel array to 1-D so hist() bins every pixel as a single series.
ax[1].hist(dcms[0].pixel_array.flatten())