In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from keras.layers import Input
from keras.models import Model
from keras.layers import Dense
from keras.optimizers import Adam
#from generator import DataGenerator
import keras

In [None]:
# Load the per-image metadata table and index every PNG scan found on disk by file name.
all_xray_df = pd.read_csv('../input/data/Data_Entry_2017.csv')
image_glob = os.path.join('..', 'input', 'data', 'images*', '*', '*.png')
all_image_paths = {os.path.basename(x): x for x in glob(image_glob)}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])

# Attach the on-disk path to each metadata row, looked up by its 'Image Index' file name.
# (The lookup used to be executed twice; once is enough.)
# NOTE: astype(str) turns a failed lookup (None) into the literal string 'None'.
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get).astype(str)

#all_xray_df['Patient Age'] = all_xray_df['Patient Age'].map(lambda x: int(x[:-1]))
all_xray_df.sample(3)

In [None]:
# Bar chart of the 15 most frequent raw 'Finding Labels' strings.
label_counts = all_xray_df['Finding Labels'].value_counts()[:15]
bar_positions = np.arange(len(label_counts)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(bar_positions, label_counts)
ax1.set_xticks(bar_positions)
_ = ax1.set_xticklabels(label_counts.index, rotation=90)

In [None]:
from itertools import chain

# Blank out the 'No Finding' marker so scans without a finding carry an empty label string.
# (Other labels could be blanked the same way to exclude them from the label set.)
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

# Split every '|'-separated label string once and reuse the token lists below.
label_tokens = all_xray_df['Finding Labels'].map(lambda x: x.split('|'))

# Unique, non-empty label names (np.unique also sorts them).
all_labels = np.unique(list(chain(*label_tokens.tolist())))
all_labels = [x for x in all_labels if len(x)>0]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))

# One 0/1 indicator column per label. Membership is tested against the split tokens, not by
# substring search on the raw string, so a label that happens to be contained in another label's
# name cannot produce a false positive. Every entry of all_labels gets a column (empty names were
# already filtered out above), so later cells can index all_xray_df[all_labels] safely.
for c_label in all_labels:
    all_xray_df[c_label] = label_tokens.map(lambda tokens: 1.0 if c_label in tokens else 0.0)
all_xray_df.sample(3)

In [None]:
# Keep only the labels that have more than MIN_CASES positive scans (strict '>').
MIN_CASES = 10
all_labels = [c_label for c_label in all_labels if all_xray_df[c_label].sum()>MIN_CASES]
# Report each surviving label together with its positive count.
print('Clean Labels ({})'.format(len(all_labels)), 
      [(c_label,int(all_xray_df[c_label].sum())) for c_label in all_labels])

In [None]:
# The label distribution is heavily imbalanced, so draw a weighted sub-sample in which scans
# with more findings are more likely to be kept.
# weight = number of findings + 0.04 (the offset keeps scans without findings selectable)
sample_weights = all_xray_df['Finding Labels'].map(lambda x: len(x.split('|')) if len(x)>0 else 0).values + 4e-2
sample_weights /= sample_weights.sum()
# Seeded so that Restart & Run All selects the same 40000 scans every time (same seed as the
# train/validation split). NOTE: this overwrites all_xray_df, so re-running only this cell
# re-samples the already reduced frame.
all_xray_df = all_xray_df.sample(40000, weights=sample_weights, random_state=2018)

# Bar chart of the 15 most frequent label strings after resampling.
label_counts = all_xray_df['Finding Labels'].value_counts()[:15]
fig, ax1 = plt.subplots(1,1,figsize = (12, 8))
ax1.bar(np.arange(len(label_counts))+0.5, label_counts)
ax1.set_xticks(np.arange(len(label_counts))+0.5)
_ = ax1.set_xticklabels(label_counts.index, rotation = 90)

In [None]:
# Share (in percent) of the resampled scans that are positive for each retained label.
label_counts = 100 * np.mean(all_xray_df[all_labels].values, axis=0)
bar_positions = np.arange(len(label_counts)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(bar_positions, label_counts)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(all_labels, rotation=90)
ax1.set_title('Adjusted Frequency of Diseases in Patient Group')
_ = ax1.set_ylabel('Frequency (%)')

In [None]:
# Store, per row, the array of that row's indicator values for the labels in all_labels.
# The array is wrapped in a one-element list inside apply() and unwrapped again by map()
# -- presumably to keep DataFrame.apply from expanding the array into columns (TODO confirm).
all_xray_df['disease_vec'] = all_xray_df.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the first four characters of each label string so that both
# parts see a similar mix of (leading) findings.
# NOTE(review): the split is per image; whether scans of one patient can land on both sides
# is not controlled here -- confirm this is acceptable.
stratify_key = all_xray_df['Finding Labels'].map(lambda x: x[:4])
train_df, valid_df = train_test_split(
    all_xray_df,
    test_size=0.20,
    random_state=2018,
    stratify=stratify_key,
)
print('train', train_df.shape[0], 'validation', valid_df.shape[0])

In [None]:
# Add 'newLabel': the '|'-separated label string split into a list of label names.
# assign() returns a fresh frame instead of writing into the slices produced by
# train_test_split (chained assignment), and mapping over the single column avoids a slow
# row-wise DataFrame.apply.
valid_df = valid_df.assign(newLabel=valid_df['Finding Labels'].map(lambda labels: labels.split('|')))
train_df = train_df.assign(newLabel=train_df['Finding Labels'].map(lambda labels: labels.split('|')))

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Size passed as target_size to every image iterator created from core_idg.
IMG_SIZE = (224, 224)

# Plain generator: constructed with defaults only, i.e. none of the options below is enabled.
# An earlier configuration (kept here for reference) used samplewise_center=True,
# samplewise_std_normalization=True, horizontal_flip=True, vertical_flip=False,
# height_shift_range=0.05, width_shift_range=0.1, rotation_range=5, shear_range=0.1
# and zoom_range=0.15.
core_idg = ImageDataGenerator()

In [None]:
def _flow_from(frame, batch_size):
    """Create an image iterator over `frame` with the settings shared by all splits.

    Images are read from the 'path' column, labels from the 'newLabel' list column, and the
    output label order is fixed to `all_labels`.
    NOTE(review): rows whose 'newLabel' contains none of `all_labels` (e.g. the [''] lists of
    scans without findings) appear to be dropped by flow_from_dataframe -- confirm against the
    installed Keras version.
    """
    return core_idg.flow_from_dataframe(
        dataframe=frame,
        directory=None,
        x_col='path',
        y_col='newLabel',
        classes=all_labels,
        target_size=IMG_SIZE,
        color_mode='rgb',
        batch_size=batch_size,
    )


train_gen = _flow_from(train_df, 16)

# we can use much larger batches for evaluation
valid_gen = _flow_from(valid_df, 32)

# One large batch (up to 8000 images) drawn from the validation frame, held in memory as the
# fixed evaluation set used by the metric cells.
test_X, test_Y = next(_flow_from(valid_df, 8000))

In [None]:
# Draw one training batch and show its images (first channel only) in a 4x4 grid, titled
# with the labels whose target value exceeds 0.5.
t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize=(16, 16))
for c_x, c_y, c_ax in zip(t_x, t_y, m_axs.flatten()):
    positive_labels = [n_class for n_class, n_score in zip(all_labels, c_y) if n_score > 0.5]
    c_ax.imshow(c_x[:, :, 0])
    c_ax.set_title(', '.join(positive_labels))
    c_ax.axis('off')

In [None]:
from keras.applications.densenet import DenseNet121, preprocess_input
#from keras.applications.nasnet  import NASNetMobile, preprocess_input
#from keras.applications.densenet import DenseNet201, preprocess_input

In [None]:
# dense net model
img_in = Input(t_x.shape[1:])              #input of model 
model = DenseNet121(include_top= False , # remove  the 3 fully-connected layers at the top of the network
                weights=None,      # pre train weight 
                input_tensor= img_in, 
                input_shape= t_x.shape[1:],
                pooling ='avg') 

x = model.output  
predictions = Dense(len(all_labels), activation="sigmoid", name="predictions")(x)    # fuly connected layer for predict class 
model = Model(inputs=img_in, outputs=predictions)

In [None]:
# Adam optimiser with a learning rate of 0.001.
optimizer = Adam(lr=0.001)
# Binary cross-entropy with binary accuracy as the reported metric (sigmoid output per label).
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=[keras.metrics.binary_accuracy])
# Initialise the network from the weights file shipped with the input dataset.
model.load_weights('../input/chestxray8-dataframe/pretrained_model.h5')

In [None]:
# Checkpoint callback for model training (see https://keras.io/callbacks/).
from keras.callbacks import ModelCheckpoint

# The file name embeds the epoch number and the validation loss; with save_best_only a file
# is written only when the monitored value improves.
checkpointer = ModelCheckpoint(
    filepath='weights.best.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1,
    save_best_only=True,
)
callbacks_list = [checkpointer]

In [None]:
# Single short epoch: 100 training batches, validated on one batch from valid_gen.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=100,
    epochs=1,
    validation_data=valid_gen,
    validation_steps=1,
    callbacks=callbacks_list,
)

In [None]:
# Ten further epochs of 100 training batches each, validated on one batch per epoch.
# `history` from the previous fit is replaced by this run's history.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=100,
    epochs=10,
    validation_data=valid_gen,
    validation_steps=1,
    callbacks=callbacks_list,
)

In [None]:
#########################################
# Per-label sigmoid scores for the held-out evaluation batch (one row per image in test_X).
y_pred = model.predict(test_X) 

In [None]:
#########################################
# Compare, per label, how often it occurs in the evaluation batch with the mean score the
# model assigns to it (both expressed in percent).
predicted_pct = 100 * np.mean(y_pred, 0)
actual_pct = 100 * np.mean(test_Y, 0)
for c_label, p_count, t_count in zip(all_labels, predicted_pct, actual_pct):
    print('%s: actual: %2.2f%%, predicted: %2.2f%%' % (c_label, t_count, p_count))

In [None]:
#########################################
from sklearn.metrics import roc_curve, auc

# One ROC curve per label on a shared axis, saved to 'trained_net.png'.
fig, c_ax = plt.subplots(1, 1, figsize=(9, 9))
for idx, c_label in enumerate(all_labels):
    fpr, tpr, thresholds = roc_curve(test_Y[:, idx].astype(int), y_pred[:, idx])
    curve_label = '%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label=curve_label)
c_ax.legend()
# Chance diagonal, drawn after the legend so it does not get a legend entry.
c_ax.plot([0, 1], [0, 1], 'k--')
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('trained_net.png')

In [None]:
#########################################
from sklearn.metrics import roc_auc_score
# Overall ROC AUC across all label columns (scikit-learn's default averaging is used).
roc_auc_score(test_Y.astype(int), y_pred)

In [None]:
########################################
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

# Reduce the multi-label problem to one class per scan: the highest-scoring label for the
# prediction and the first positive label for the ground truth.
# y_pred is replaced by class indices because the accuracy cell below expects that form; the
# ndim guard makes this cell safe to re-run after y_pred has already been reduced.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis=1)
true_classes = np.argmax(test_Y, axis=1)

# Passing labels= guarantees a len(all_labels) x len(all_labels) matrix even when some class
# never appears in either vector (otherwise the matrix shrinks and the frame below breaks).
class_ids = np.arange(len(all_labels))
cm = confusion_matrix(true_classes, y_pred, labels=class_ids)
#########################################

plt.figure(figsize=(14, 14))
ax= plt.subplot()

# Row/column names come from all_labels, the same order the generators use for label columns.
df_cm = pd.DataFrame(cm, index=all_labels, columns=all_labels)
sn.set(font_scale=1.2) # for label size
# fmt='d' prints the integer counts in full instead of the default 2-significant-digit format.
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 15}, cmap='Blues', ax=ax,
           xticklabels=True, yticklabels=True)
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
plt.setp(ax.get_xticklabels(), rotation=30, ha="right", rotation_mode="anchor");
plt.setp(ax.get_yticklabels(), rotation=30, ha="right", rotation_mode="anchor");


plt.show()

In [None]:
#########################################
from sklearn.metrics import accuracy_score
# Single-class accuracy: y_pred must already hold class indices here (it is reduced with
# argmax in the confusion-matrix cell), compared against the arg-max of the true labels.
accuracy_score(np.argmax(test_Y, axis=1), y_pred)

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from operator import itemgetter
from collections import OrderedDict

from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from torch import optim,nn
import torch.nn.functional as F
from torchvision import transforms as T,models
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.utils import make_grid

# Route pandas' .plot() calls to the plotly backend for the rest of the session.
pd.options.plotting.backend = "plotly"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Names of the 14 pathology classes, in the column order used for the accuracy report.
pathology_list = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Effusion',
    'Emphysema',
    'Fibrosis',
    'Hernia',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pleural_Thickening',
    'Pneumonia',
    'Pneumothorax',
]

#trainset, validset, testset = random_split(train_gen, [5000,303,303])

def class_accuracy(dataloader, model):
    """Return the per-class accuracy (in percent) of a multi-label torch model.

    Args:
        dataloader: finite iterable yielding ``(images, labels)`` batches, e.g. a
            ``torch.utils.data.DataLoader``. ``labels`` is expected to be a 2-D 0/1 tensor
            with one column per class.
        model: callable mapping ``images`` to a 2-D tensor of per-class scores with the
            same shape as ``labels``; scores ``>= 0.5`` count as a positive prediction.

    Returns:
        A list with one float per output column of ``model``: the percentage of samples
        whose thresholded prediction equals the label for that class. An empty list is
        returned when the dataloader yields no samples.
    """
    correct_per_class = None   # running count of matches per class, created on first batch
    n_seen = 0                 # samples actually iterated (robust to samplers / drop_last)

    with torch.no_grad():
        for images, labels in dataloader:
            predicted_positive = model(images) >= 0.5
            # Column-wise number of samples where prediction and label agree.
            hits = (predicted_positive == labels).sum(dim=0)
            correct_per_class = hits if correct_per_class is None else correct_per_class + hits
            n_seen += int(labels.shape[0])

    if correct_per_class is None or n_seen == 0:
        return []

    return [(int(count) / n_seen) * 100.0 for count in correct_per_class]


def get_acc_data(class_names,acc_list):
    """Pair each class name with its accuracy in a two-column DataFrame.

    Args:
        class_names: iterable of label names.
        acc_list: iterable of accuracy values, in the same order as ``class_names``.

    Returns:
        DataFrame with columns 'Labels' and 'Acc'; the inputs are zipped, so the result has
        as many rows as the shorter of the two.
    """
    rows = [(name, accuracy) for name, accuracy in zip(class_names, acc_list)]
    return pd.DataFrame(rows, columns=['Labels', 'Acc'])

print("Train Dataset Accuracy Report")
# class_accuracy() is written for a finite PyTorch DataLoader and a torch model, whereas
# `train_gen` is a Keras iterator (it yields batches endlessly and has no `.dataset`) and
# `model` is a Keras model. The per-class accuracy is therefore accumulated with NumPy here.
# Indexing the iterator visits each batch of one epoch exactly once; note that this runs the
# model over the whole training set and can take a while.
correct_per_class = np.zeros(len(all_labels))
n_seen = 0
for batch_idx in range(len(train_gen)):
    images, labels = train_gen[batch_idx]
    hits = (model.predict_on_batch(images) >= 0.5) == (labels >= 0.5)
    correct_per_class += hits.sum(axis=0)
    n_seen += len(labels)
acc_list = [100.0 * count / max(n_seen, 1) for count in correct_per_class]
# Label columns follow `all_labels`, the `classes` order given to the generator.
get_acc_data(all_labels, acc_list)

# Three more epochs of 100 training batches each, validated on one batch per epoch.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=100,
    epochs=3,
    validation_data=valid_gen,
    validation_steps=1,
    callbacks=callbacks_list,
)

#########################################
# Recompute the per-label scores for the evaluation batch after the additional training;
# this replaces the class-index y_pred left over from the earlier confusion-matrix step.
y_pred = model.predict(test_X) 

#########################################
from sklearn.metrics import roc_curve, auc

# One ROC curve per label on a shared axis; overwrites 'trained_net.png'.
fig, c_ax = plt.subplots(1, 1, figsize=(9, 9))
for idx, c_label in enumerate(all_labels):
    fpr, tpr, thresholds = roc_curve(test_Y[:, idx].astype(int), y_pred[:, idx])
    curve_label = '%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label=curve_label)
c_ax.legend()
# Chance diagonal, drawn after the legend so it does not get a legend entry.
c_ax.plot([0, 1], [0, 1], 'k--')
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('trained_net.png')

#########################################
from sklearn.metrics import roc_auc_score
# Overall ROC AUC across all label columns (scikit-learn's default averaging is used).
roc_auc_score(test_Y.astype(int), y_pred)

########################################
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

# Reduce the multi-label problem to one class per scan: the highest-scoring label for the
# prediction and the first positive label for the ground truth.
# y_pred is replaced by class indices because the accuracy step below expects that form; the
# ndim guard makes this step safe to re-run after y_pred has already been reduced.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis=1)
true_classes = np.argmax(test_Y, axis=1)

# Passing labels= guarantees a len(all_labels) x len(all_labels) matrix even when some class
# never appears in either vector (otherwise the matrix shrinks and the frame below breaks).
class_ids = np.arange(len(all_labels))
cm = confusion_matrix(true_classes, y_pred, labels=class_ids)
#########################################

plt.figure(figsize=(14, 14))
ax= plt.subplot()

# Row/column names come from all_labels, the same order the generators use for label columns.
df_cm = pd.DataFrame(cm, index=all_labels, columns=all_labels)
sn.set(font_scale=1.2) # for label size
# fmt='d' prints the integer counts in full instead of the default 2-significant-digit format.
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 15}, cmap='Blues', ax=ax,
           xticklabels=True, yticklabels=True)
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
plt.setp(ax.get_xticklabels(), rotation=30, ha="right", rotation_mode="anchor");
plt.setp(ax.get_yticklabels(), rotation=30, ha="right", rotation_mode="anchor");


plt.show()

#########################################
from sklearn.metrics import accuracy_score
# Single-class accuracy: y_pred must already hold class indices here (it is reduced with
# argmax in the confusion-matrix step above), compared against the arg-max of the true labels.
accuracy_score(np.argmax(test_Y, axis=1), y_pred)