In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
from skimage import io
import scipy.stats

In [None]:
## Load the two metadata tables used throughout this EDA.
# NIH data: read from an absolute path, so the CSV must exist under /data/.
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
# NOTE(review): this is not the last expression of the cell, so the notebook
# will not render this preview unless it is wrapped in display().
all_xray_df.head(5)

# 'sample_labels.csv' (resolved relative to the working directory) is loaded
# for the pixel-level assessments.
sample_df = pd.read_csv('sample_labels.csv')
# Last expression of the cell: the first five rows are shown as its output.
sample_df.head(5)

EDA is open-ended, and it is up to you to decide how to look at different ways to slice and dice your data. A good starting point is to look at the requirements for the FDA documentation in the final part of this project to guide (some) of the analyses you do. 

This EDA should also help to inform you of how pneumonia looks in the wild. E.g. what other types of diseases it's commonly found with, how often it is found, what ages it affects, etc. 

Note that this NIH dataset was not specifically acquired for pneumonia. So, while this is a representation of 'pneumonia in the wild,' the prevalence of pneumonia may be different if you were to take only chest x-rays that were acquired in an ER setting with suspicion of pneumonia. 

Also, **describe your findings and how you will set up the model training based on them.**

In [None]:
# Gender demographics: bar chart plus the underlying counts.
def gender(df):
    """Plot how many rows fall under each 'Patient Gender' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Patient Gender' column.

    Returns
    -------
    pandas.Series
        Row count per gender value, as produced by ``value_counts()``.
    """
    gender_counts = df['Patient Gender'].value_counts()

    # Draw the bars first; the pyplot calls below then label the current axes.
    gender_counts.plot.bar()
    plt.title('Gender Distribution in Dataset')
    plt.ylabel('Number of People')
    plt.xlabel('Gender')

    return gender_counts

In [None]:
# Plot the gender distribution and keep the per-gender counts returned by gender().
gender_distribution = gender(all_xray_df)
# Bare expression so the counts are rendered as the cell output.
gender_distribution


In [None]:
#There is a slight gender imbalance: 63340 male (~56.5%) vs 48780 female (~43.5%)

In [None]:
# Some recorded patient ages are implausible (above 100 years); list them.
all_xray_df.loc[all_xray_df['Patient Age'] > 100, 'Patient Age']

In [None]:
# Set implausible ages (> 100 years) to NaN in the 'Patient Age' column ONLY.
# DataFrame.replace(values, nan) would substitute those numbers wherever they
# occur in the whole frame, silently wiping unrelated cells in any other
# numeric column that happens to hold the same value. Series.mask() limits the
# change to this one column and upcasts it to float so it can hold NaN.
all_xray_df['Patient Age'] = all_xray_df['Patient Age'].mask(all_xray_df['Patient Age'] > 100)

In [None]:
# Sanity check: after cleaning, no age above 100 should be left (expect an empty result).
all_xray_df.loc[all_xray_df['Patient Age'] > 100, 'Patient Age']

In [None]:
#defining age distribution fnc
def age(df):
    """Plot a 10-bin histogram of the 'Patient Age' column.

    Missing ages (NaN) are dropped before plotting, so the histogram does not
    depend on how the installed Matplotlib/NumPy versions treat non-finite
    input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Patient Age' column; NaN entries are allowed.

    Returns
    -------
    None
        The figure is drawn on the current pyplot axes.
    """
    # Only real ages take part in the binning.
    known_ages = df['Patient Age'].dropna()
    plt.hist(known_ages, bins=10)
    plt.xlabel('age')
    plt.ylabel('Number of People')
    plt.title('Age Distribution')

In [None]:
# Draw the age histogram for the full dataset (age() returns nothing, so only the figure is shown).
age(all_xray_df)

In [None]:
#Dataset contains samples with ages ranging from 2 up to almost 100, with the majority around 50-60 years old

In [None]:
# View-position distribution: bar chart plus the underlying counts.
def image_pos(df):
    """Plot how many rows fall under each 'View Position' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'View Position' column.

    Returns
    -------
    pandas.Series
        Row count per view position, as produced by ``value_counts()``.
    """
    position_counts = df['View Position'].value_counts()

    # Draw the bars first; the pyplot calls below then label the current axes.
    position_counts.plot.bar()
    plt.title('Image Position Distribution')
    plt.ylabel('Number of People')
    plt.xlabel('Image Position')

    return position_counts

In [None]:
# Plot the view-position distribution and keep the counts returned by image_pos().
image_position_distribution = image_pos(all_xray_df)
# Bare expression so the counts are rendered as the cell output.
image_position_distribution

In [None]:
#Dataset contains both PA and AP view positions

In [None]:
#split the finding labels into individual columns
def split_labels(df):
    """Add one 0/1 indicator column per distinct finding label, in place.

    The 'Finding Labels' column holds '|'-separated label strings. Each
    distinct label becomes a new float column on ``df`` that is 1.0 where the
    row lists that label and 0.0 otherwise.

    Membership is checked against the split tokens, not the raw string, so a
    label that is a substring of another label (e.g. 'Mass' inside 'Massive')
    is not flagged by mistake.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Finding Labels' column. Modified in place.

    Returns
    -------
    None
    """
    # Split once and reuse the token lists for both label discovery and matching.
    findings = df['Finding Labels'].map(lambda x: x.split('|'))
    labels = np.unique(list(chain.from_iterable(findings)))
    for label in labels:
        df[label] = findings.map(
            lambda tokens, label=label: 1.0 if label in tokens else 0.0
        )

In [None]:
# Add one indicator column per finding label; split_labels() mutates all_xray_df in place.
split_labels(all_xray_df)

In [None]:
# (rows, columns) of the frame after the per-label indicator columns were added.
all_xray_df.shape

In [None]:
#function plots distribution of +ve/-ve pneumonia cases. 1 is positive, 0 is negative
def pneumonia_cases(df):
    """Plot and return the number of pneumonia-negative and -positive rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Pneumonia' indicator column (1 = positive, 0 = negative).

    Returns
    -------
    tuple
        ``(negative_count, positive_count)``. A class with no rows is reported
        as 0 instead of raising ``KeyError``.
    """
    # Count once and reuse for both the chart and the return value.
    case_counts = df['Pneumonia'].value_counts()
    case_counts.plot(kind='bar')
    plt.xlabel('Presence of Pneumonia(0 is negative 1 is postive)')
    plt.ylabel('Number of People')
    plt.title('Presence of Pneumonia in the Dataset')

    # .get() falls back to 0 when one of the two classes is absent.
    return (case_counts.get(0, 0), case_counts.get(1, 0))

In [None]:
# Plot the class balance and unpack the (negative, positive) counts returned by pneumonia_cases().
negative_pneumonia, positive_pneumonia = pneumonia_cases(all_xray_df)

In [None]:
# Number of pneumonia-positive rows (1431 in this dataset).
positive_pneumonia

In [None]:
# Positive-to-negative case ratio, expressed as a percentage (about 1.3%).
# Note the denominator is the negative count, not the total number of rows,
# so this is a ratio between the classes rather than the prevalence.
ratio_pneumonia_cases = positive_pneumonia/negative_pneumonia*100
ratio_pneumonia_cases

In [None]:
# Sorted list of every distinct label found in the '|'-separated 'Finding Labels' column.
labels = np.unique(
    list(
        chain.from_iterable(
            findings.split('|') for findings in all_xray_df['Finding Labels']
        )
    )
).tolist()

labels

In [None]:
# Summing each indicator column gives the number of rows carrying that label.
all_xray_df[labels].sum().plot.bar()
plt.title('Distribution of Diseases in the Dataset')
plt.ylabel('Number of People')
plt.xlabel('Diseases or No finding')
plt.show()

In [None]:
# No Finding is by far the most common diagnosis, followed by infiltration and effusion

In [None]:
# Findings that co-occur with Pneumonia: count every distinct 'Finding Labels'
# combination among pneumonia-positive rows, then chart the 30 most frequent.
disease_conjunction_pneumonia = all_xray_df.loc[
    all_xray_df['Pneumonia'] == 1, 'Finding Labels'
].value_counts()
disease_conjunction_pneumonia.head(30).plot.bar()
plt.title('Distribution of Diseases comorbid with Pneumonia')
plt.ylabel('Number of People')
# Trailing semicolon keeps the cell from echoing the returned Text object.
plt.xlabel('Diseases Comorbid with Pneumonia');

In [None]:
#The most common comorbidities are 'infiltration', 'edema', 'atelectasis', 'effusion' and 'consolidation'