In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) loads the competition's train and test CSVs from there.

# Load the tabular train/test splits from the competition's read-only input directory.
train_df, test_df = (
    pd.read_csv(f'../input/osic-pulmonary-fibrosis-progression/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
import os
from os import listdir

import matplotlib.pyplot as plt
%matplotlib inline

#plotly
!pip install chart_studio
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

import seaborn as sns
sns.set(style="whitegrid")

#pydicom
import pydicom

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# Settings for pretty nice plots
plt.style.use('fivethirtyeight')
plt.show()

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# (number of rows, number of columns) of the training table.
train_df.shape

In [None]:
# (number of rows, number of columns) of the test table.
test_df.shape

In [None]:
# Count of non-null `Sex` entries per smoking-status group, shown as a one-column frame.
train_df.groupby('SmokingStatus')[['Sex']].count()

In [None]:
# Null values and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
print('Train Set !!')
train_df.info()
print('-------------')
print('Test Set !!')
test_df.info()

In [None]:
# Total number of distinct patients in the dataset (train + test).
# `count()` returns the number of non-null rows in the column, which over-counts whenever
# a patient appears on more than one row; `nunique()` matches what the label claims.
print("Total Patient in Train set: ",train_df['Patient'].nunique())
print("Total Patient in Test set: ",test_df['Patient'].nunique())

In [None]:
# Column names of the training table as a plain Python list.
columns = train_df.columns.tolist()
print(columns)

In [None]:
# Number of training rows per smoking-status category, most frequent first.
train_df['SmokingStatus'].value_counts()

In [None]:
# Interactive bar chart of the smoking-status distribution.
# normalize=True yields fractions in [0, 1] rather than raw counts.
smoking_status_share = train_df['SmokingStatus'].value_counts(normalize=True)
smoking_status_share.iplot(
    kind='bar',
    yTitle='Percentage',
    linecolor='black',
    opacity=0.7,
    color='red',
    theme='pearl',
    bargap=0.8,
    gridcolor='white',
    title='Distribution of the SmokingStatus column in the training set',
)

In [None]:
# Horizontal bar chart of how many training rows fall on each `Weeks` value,
# ordered from the rarest week to the most common one.
rows_per_week = train_df['Weeks'].value_counts().sort_values()
rows_per_week.iplot(
    kind='barh',
    xTitle='Counts(Weeks)',
    linecolor='black',
    opacity=0.7,
    color='#FB8072',
    theme='pearl',
    bargap=0.2,
    gridcolor='white',
    title='Distribution of the Weeks in the training set',
)

In [None]:
# First five (SmokingStatus, Weeks) groups with their FVC measurement counts,
# displayed with a red background gradient.
z = (
    train_df
    .groupby(['SmokingStatus', 'Weeks'])['FVC']
    .count()
    .to_frame()
    .reset_index()
    .head()
)
z.style.background_gradient(cmap='Reds')

In [None]:
# Horizontal bar chart of how often each distinct FVC value occurs in the training table.
fvc_value_counts = train_df['FVC'].value_counts()
fvc_value_counts.iplot(
    kind='barh',
    xTitle='Lung Capacity(ml)',
    linecolor='black',
    opacity=0.8,
    color='#FB8072',
    bargap=0.5,
    gridcolor='white',
    title='Distribution of the FVC in the training set',
)

In [None]:
# Frequency of each distinct `Percent` value in the training table.
train_df['Percent'].value_counts()

In [None]:
# 30-bin interactive histogram of the `Percent` column.
train_df['Percent'].iplot(
    kind='hist',
    bins=30,
    color='blue',
    xTitle='Percent distribution',
    yTitle='Count',
)

In [None]:
# 30-bin interactive histogram of the `Age` column.
train_df['Age'].iplot(
    kind='hist',
    bins=30,
    color='red',
    xTitle='Age distribution',
    yTitle='Count',
)

In [None]:
plt.figure(figsize=(16, 6))

# One shaded age-density curve per smoking status, drawn in a fixed order.
for status in ('Ex-smoker', 'Never smoked', 'Currently smokes'):
    ages = train_df.loc[train_df['SmokingStatus'] == status, 'Age']
    sns.kdeplot(ages, label=status, shade=True)

# Labeling of plot
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages');


In [None]:
plt.figure(figsize=(16, 6))

# One shaded age-density curve per value of `Sex`.
for sex in ('Male', 'Female'):
    sns.kdeplot(train_df.loc[train_df['Sex'] == sex, 'Age'], label=sex, shade=True)

plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages');

In [None]:
# Number of training rows per value of `Sex`.
train_df['Sex'].value_counts()

In [None]:
plt.figure(figsize=(16, 6))
a = sns.countplot(data=train_df, x='SmokingStatus', hue='Sex')

# Write each bar's height (with thousands separators) slightly above its top edge.
for p in a.patches:
    bar_height = p.get_height()
    bar_center_x = p.get_x() + p.get_width() / 2.
    a.annotate(
        format(bar_height, ','),
        (bar_center_x, bar_height),
        ha='center',
        va='center',
        xytext=(0, 4),
        textcoords='offset points',
    )

plt.title('Gender split by SmokingStatus', fontsize=16)
sns.despine(left=True, bottom=True);

In [None]:
def plot_pixel_array(dataset, figsize=(5,5)):
    """Show ``dataset.pixel_array`` as a grayscale image in a new figure.

    Args:
        dataset: object exposing a ``pixel_array`` attribute that matplotlib's
            ``imshow`` can draw.
        figsize: ``(width, height)`` forwarded to matplotlib; defaults to ``(5, 5)``.
    """
    _, axes = plt.subplots(figsize=figsize)
    # The active style may enable grid lines; they are switched off for the image.
    axes.grid(False)
    axes.imshow(dataset.pixel_array, cmap='gray')
    plt.show()

In [None]:
# os.listdir() on train/ and test/ returns one entry per patient folder (the .dcm files
# live inside those folders), so the numbers below are patient-folder counts, not image counts.
n_train_folders = len(os.listdir('../input/osic-pulmonary-fibrosis-progression/train'))
n_test_folders = len(os.listdir('../input/osic-pulmonary-fibrosis-progression/test'))

print('Train patient folders:', n_train_folders)
print('Test patient folders:', n_test_folders)
print('--------------------------------')

# Check the agreement with the CSV tables instead of asserting it in a hard-coded message.
folders_match_csv = (
    n_train_folders == train_df['Patient'].nunique()
    and n_test_folders == test_df['Patient'].nunique()
)
print('Folder counts match the number of unique patients in train/ test .csv datasets:',
      folders_match_csv)

In [None]:
# https://www.kaggle.com/schlerp/getting-to-know-dicom-and-the-data
def show_dcm_info(dataset, file_path=None):
    """Print a short, human-readable summary of a DICOM dataset.

    Args:
        dataset: dataset object exposing ``PatientName`` (with ``family_name`` and
            ``given_name``), ``data_element(name)`` and membership tests such as
            ``'PixelData' in dataset``.
        file_path: path shown on the "Filename" line. When omitted, the dataset's own
            ``filename`` attribute is used (pydicom sets it when reading from a path),
            so the function no longer depends on a global variable being defined.
    """
    if file_path is None:
        file_path = getattr(dataset, "filename", "<unknown>")
    print("Filename.........:", file_path)

    pat_name = dataset.PatientName
    display_name = pat_name.family_name + ", " + pat_name.given_name
    print("Patient's name......:", display_name)

    # Raw data elements, printed in pydicom's own representation.
    for element_name in ("ImageOrientationPatient", "ImagePositionPatient",
                         "PatientID", "PatientName", "PatientSex"):
        print(dataset.data_element(element_name))

    # Image geometry is only reported when the dataset actually carries pixel data.
    if 'PixelData' in dataset:
        rows = int(dataset.Rows)
        cols = int(dataset.Columns)
        print("Image size.......: {rows:d} x {cols:d}, {size:d} bytes".format(
            rows=rows, cols=cols, size=len(dataset.PixelData)))
        if 'PixelSpacing' in dataset:
            print("Pixel spacing....:", dataset.PixelSpacing)

In [None]:
num_to_plot = 2
train_dir = '../input/osic-pulmonary-fibrosis-progression/train/'

# Only the first patient folder returned by os.listdir() is inspected.
patient_folders = os.listdir(train_dir)
if patient_folders:
    patient_path = os.path.join(train_dir, patient_folders[0])

    # Slices are looked up by name as 1.dcm, 2.dcm, ... inside the patient folder.
    for slice_number in range(1, num_to_plot + 1):
        # `file_path` is deliberately assigned at module level under this exact name,
        # since code in other cells may read it.
        file_path = os.path.join(patient_path, str(slice_number) + '.dcm')

        dataset = pydicom.dcmread(file_path)
        show_dcm_info(dataset)
        plot_pixel_array(dataset)

In [None]:
# https://www.kaggle.com/yeayates21/osic-simple-image-eda

imdir = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/ID00123637202217151272140"
# List the folder once and reuse the result for both the count and the plotting cap.
imglist = os.listdir(imdir)
print("total images for patient ID00123637202217151272140: ", len(imglist))

# view first (n_cols*n_rows) images in order
# Dedicated names are used for the grid size so the `columns` list of DataFrame column
# names defined earlier in the notebook is not overwritten.
n_cols = 4
n_rows = 5
# Never request more slices than the folder holds (files are addressed as 1.dcm, 2.dcm, ...).
n_images = min(n_cols * n_rows, len(imglist))

fig = plt.figure(figsize=(12, 12))
for i in range(1, n_images + 1):
    filename = imdir + "/" + str(i) + ".dcm"
    ds = pydicom.dcmread(filename)
    # Draw on the subplot that was just created rather than on pyplot's implicit current axes.
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.imshow(ds.pixel_array, cmap='gray')
plt.show()

In [None]:
# source: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/154658
folder='train'
PATH='../input/osic-pulmonary-fibrosis-progression/'

# extract_DICOM_attributes stops after it has processed this many patient folders.
last_index = 2

# Names of the per-image attribute columns.
# A comma was missing after 'dcm_PatientSex', which made Python concatenate it with the
# next literal into the single bogus name 'dcm_PatientSexdcm_rows'.
column_names = ['image_name', 'dcm_ImageOrientationPatient',
                'dcm_ImagePositionPatient', 'dcm_PatientID',
                'dcm_PatientName', 'dcm_PatientSex',
                'dcm_rows', 'dcm_columns']

def extract_DICOM_attributes(folder):
    """Collect selected DICOM header attributes for the images under ``PATH/folder``.

    Walks the patient sub-folders of ``os.path.join(PATH, folder)``, reads every file in
    each of them with pydicom and records one row per file. Processing stops once
    ``last_index`` patient folders have been handled (module-level setting).

    Args:
        folder: name of the sub-directory of ``PATH`` to scan (e.g. ``'train'``).

    Returns:
        pandas.DataFrame with one row per image and the columns ``image_name``,
        ``dcm_ImageOrientationPatient``, ``dcm_PatientID``, ``dcm_PatientName``,
        ``dcm_PatientSex``, ``dcm_rows`` and ``dcm_columns``. Rows are indexed 0..n-1.
        The frame is empty when no image was found.
    """
    patients_folder = os.listdir(os.path.join(PATH, folder))

    # Rows are accumulated in a plain list and turned into a DataFrame once at the end:
    # growing a DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []

    for patient_count, patient_id in enumerate(patients_folder, start=1):
        img_path = os.path.join(PATH, folder, patient_id)

        print(img_path)

        for image in os.listdir(img_path):
            # dcmread is the supported reader (read_file is a deprecated alias) and is
            # what the rest of the notebook already uses.
            dicom_file_dataset = pydicom.dcmread(os.path.join(img_path, image))

            # NOTE(review): ImagePositionPatient is intentionally not collected here;
            # the reason is not documented - confirm before adding it.
            records.append({
                'image_name': image.split(".")[0],
                'dcm_ImageOrientationPatient': dicom_file_dataset.ImageOrientationPatient,
                'dcm_PatientID': dicom_file_dataset.PatientID,
                'dcm_PatientName': dicom_file_dataset.PatientName,
                'dcm_PatientSex': dicom_file_dataset.PatientSex,
                'dcm_rows': dicom_file_dataset.Rows,
                'dcm_columns': dicom_file_dataset.Columns,
            })

        # Stop after `last_index` patient folders to keep the preview small.
        if patient_count == last_index:
            break

    return pd.DataFrame(records)

In [None]:
# Build and display the DICOM attribute table for the training images
# (limited to the first `last_index` patient folders).
extract_DICOM_attributes('train')

In [None]:
import pandas_profiling as pdp

In [None]:
# Re-read the train/test CSVs (the same files that are loaded at the top of the notebook).
train_df, test_df = (
    pd.read_csv(f'../input/osic-pulmonary-fibrosis-progression/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# Build the pandas-profiling report object for the training table.
profile_train_df = pdp.ProfileReport(train_df)

In [None]:
# Display the training-table profiling report as the cell output.
profile_train_df

In [None]:
# Build the pandas-profiling report object for the test table.
profile_test_df = pdp.ProfileReport(test_df)

In [None]:
# Display the test-table profiling report as the cell output.
profile_test_df

In [None]:
# Shell command: list the contents of the competition input directory.
!ls /kaggle/input/osic-pulmonary-fibrosis-progression/

In [None]:
# Load the sample submission; its rows are filled in and written back out below.
sub   = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Give every submission row of a patient that patient's FVC value from the test table.
# Iterating over the column values avoids label-based lookups (`test_df.FVC[i]`), which
# only work while test_df still carries its default 0..n-1 index. regex=False makes the
# patient ID match as a literal substring instead of being interpreted as a regular expression.
for patient_id, fvc in zip(test_df['Patient'], test_df['FVC']):
    is_patient_row = sub['Patient_Week'].str.contains(patient_id, regex=False)
    sub.loc[is_patient_row, 'FVC'] = fvc

sub.to_csv('submission.csv', index=False)