In [None]:
import random
import cv2
import pydicom

import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import seaborn as sns
from tqdm import tqdm


In [None]:
import matplotlib.pyplot as plt
from utilities_x_ray import read_xray,showXray


In [None]:
import numpy as np


In [None]:
# def seedAll(seed=355):
#     os.environ["PYTHONHASHSEED"] = str(seed)
#     np.random.seed(seed)
#     tf.random.set_seed(seed)
#     random.seed(seed)
# seedAll()

<h1 style="display:inline"> <a id="first"> First Look at the data</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

## 1. DataFrames

In [None]:
train = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
ss = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/sample_submission.csv')

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.image_id.describe()

In [None]:
train[train.image_id == '03e6ecfa6f6fb33dfeac6ca4f9b459c9']

In [None]:
train.class_name.value_counts()

### augment this dataset to reduce class imbalance

In [None]:
train.head(2)

In [None]:
train.rad_id.value_counts()

### R9, R8, R10 seem most hardworking

In [None]:
train_none = train[train.class_name == 'No finding']

In [None]:
train_none.shape

In [None]:
train_none.rad_id.value_counts()

In [None]:
# reliable_annotators = ['R8', 'R9', 'R10']
reliable_annotators = ['R9']


In [None]:
# train_none_reliable = train_none[train_none.rad_id.isin(reliable_annotators)]

In [None]:
# train_none_reliable.shape

In [None]:
train_reliable = train[train.rad_id.isin(reliable_annotators)]

In [None]:
train_reliable.shape

In [None]:
train.shape

In [None]:
train_reliable.class_name.value_counts()

In [None]:
train.class_name.value_counts()

### now we will only focus on train reliable

In [None]:
train = train_reliable

In [None]:
# docs = train[train.image_id == '03e6ecfa6f6fb33dfeac6ca4f9b459c9']

<ul>
<li><code>image_id</code> - unique image identifier</li>
<li><code>class_name</code>&nbsp;- the name of the class of detected object (or "No finding")</li>
<li><code>class_id</code>&nbsp;- the ID of the class of detected object</li>
<li><code>rad_id</code>&nbsp;- the ID of the radiologist that made the observation</li>
<li><code>x_min</code>&nbsp;- minimum X coordinate of the object's bounding box</li>
<li><code>y_min</code>&nbsp;- minimum Y coordinate of the object's bounding box</li>
<li><code>x_max</code>&nbsp;- maximum X coordinate of the object's bounding box</li>
<li><code>y_max</code>&nbsp;- maximum Y coordinate of the object's bounding box</li>
</ul>

In [None]:
ss.head()

The submission file must contain the image id and the prediction string in the format "a b (c,d,e,f)"<br>where
<ul>
    <li>a = predicted class ; 14 for no abnormality</li>
    <li>b= confidence</li>
    <li>(c,d,e,f) = (x_min,y_min,x_max,y_max)</li>
</ul>

## 2. Images

In [None]:
plt.figure(figsize=(8,10))
plt.imshow(read_xray('../input/vinbigdata-chest-xray-abnormalities-detection/train/03e6ecfa6f6fb33dfeac6ca4f9b459c9.dicom'),cmap=plt.cm.bone)

In [None]:
showXray('../input/vinbigdata-chest-xray-abnormalities-detection/train/03e6ecfa6f6fb33dfeac6ca4f9b459c9.dicom',train,with_boxes=True)

In [None]:
# Take an explicit copy: the following cells add columns ('x', 'y', 'area') to
# `docs`, and assigning into a filtered slice of `train` is ambiguous in pandas.
docs = train[train.image_id == '03e6ecfa6f6fb33dfeac6ca4f9b459c9'].copy()

In [None]:
docs['x'] = docs['x_max'] - docs['x_min']

In [None]:
docs['y'] = docs['y_max'] - docs['y_min']

In [None]:
docs['area'] = docs['y'] * docs['x']

In [None]:
docs2 = docs.sort_values(by=['area'], ascending=False)

In [None]:
docs2 = docs2.head(10)

In [None]:
docs2

In [None]:
# train2 = docs.head(3)

In [None]:
# train2

In [None]:
# docs.head()

In [None]:
def complete_overlap(row1, row2):
    """
    Is box1 (from row1) completely inside box2 (from row2)?

    Args:
        row1: mapping/Series with 'x_min', 'y_min', 'x_max', 'y_max' of the
            candidate inner box.
        row2: mapping/Series with the same keys for the candidate outer box.
    Returns:
        bool: True only when box1 lies strictly inside box2 on BOTH axes.
    """
    # Containment must hold on both axes; checking x alone would report boxes
    # that merely share a horizontal extent as "inside".
    inside_x = (row1['x_min'] > row2['x_min']) and (row1['x_max'] < row2['x_max'])
    inside_y = (row1['y_min'] > row2['y_min']) and (row1['y_max'] < row2['y_max'])

    return bool(inside_x and inside_y)
    

In [None]:
complete_overlap(docs.iloc[1], docs.iloc[2])

In [None]:
complete_overlap(docs.iloc[3], docs.iloc[2])

In [None]:
showXray('../input/vinbigdata-chest-xray-abnormalities-detection/train/03e6ecfa6f6fb33dfeac6ca4f9b459c9.dicom',docs2,with_boxes=True)

<h1 style="display:inline"><a id="second">EDA</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
print("Number of rows in train dataframe: {}".format(train.shape[0]))
print("Number of Unique images in train set: {}".format(train.image_id.nunique()))
print("Number of Classes: {}\n".format(train.class_name.nunique()))
print("Class Names: {}".format(list(train.class_name.unique())))

In [None]:
print("Null Values:")
train.isna().sum().to_frame().rename(columns={0:'Null Value count'}).style.background_gradient('viridis')

The number of null values is the same as the number of samples that do not have any abnormality

### The Distribution of Classes
We can see there is a huge class imbalance. The number of negative examples is very high, and a few abnormalities have very few examples.

In [None]:
# Pass the column as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
fig, ax = plt.subplots(figsize=(9,6))
sns.countplot(x=train["class_id"], ax=ax)
ax.set_title("Class Distributions");

### Distribution of Radiologists

In [None]:
# Pass the column as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
fig, ax = plt.subplots(figsize=(9,6))
sns.countplot(x=train["rad_id"], ax=ax)
ax.set_title("rad_id Distributions");

<h1 style="display:inline"><a id="third"> An Intuition of the Data</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a><br><br>
<h5>Before proceeding further let us try and get an intuition of the data and what exactly we need to do.</h5>
<h5> In this competition we have been given 15000 images for training. Alongside them we have a dataframe containing the ground truths for various abnormalities. Every sample in the dataframe contains:</h5>
  <ul>
      <li>the image id</li><li>the id of the radiologist who annotated it</li><li>the name of the corresponding class</li><li>the class id</li><li>the bounding box coordinates</li>
  </ul>
<b style="font-weight:700">Important points to be noted here are:</b>
<ul>
    <li>Each image may have multiple corresponding abnormalities. Therefore this is a multilabel prediction</li>
    <li>Bounding boxes for each image have been annotated by multiple radiologists. Therefore for every sample we have multiple ground truths. A naive way to deal with this is to take the mean of the bounding box coordinates from all radiologists for a particular abnormality</li>
    <li>There is a significant class imbalance which is likely to affect the performance of models a lot.</li>
</ul>
<h4 style="font-weight:700">Information about dicom can be found: <a href="https://en.wikipedia.org/wiki/DICOM" style="font-size:1em">Here</a></h4>
<h4 style="font-weight:700">Procedure to extract DICOM metadata can be found in: <a href="https://www.kaggle.com/mrutyunjaybiswal/vbd-chest-x-ray-abnormalities-detection-eda" style="font-size:1em">this notebook</a></h4>

<h1 style="display:inline"><a id="fourth">Data Preparation</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
# Alphabetically sorted class names, with 'No finding' moved to the end.
class_names = sorted(train.class_name.unique())
class_names.remove('No finding')
class_names.append('No finding')
# Map positions 0..14 to the names in that order.
classes = {idx: name for idx, name in zip(range(15), class_names)}

In [None]:
classes

In [None]:
def prepareDataFrame(train_df= train, n_classes=15):
    """
    Collapse the per-annotation frame into one row per image.

    Args:
        train_df (pd.DataFrame): annotation rows with columns 'image_id',
            'class_id', 'x_min', 'y_min', 'x_max', 'y_max'.
        n_classes (int): length of the multi-hot label vector. The last class
            index gets no bounding-box columns, so the output has
            4 * (n_classes - 1) box columns.
    Returns:
        pd.DataFrame: columns 'image_id', 'label' (np.ndarray of length
        n_classes with 1 at every class_id present for the image) and integer
        columns 0 .. 4*(n_classes-1)-1 holding, per class, the rounded mean of
        x_min, y_min, x_max, y_max over that image's rows (0 where absent).
    """
    train_df = train_df.fillna(0)

    box_cols = ['x_min','y_min','x_max','y_max']
    n_box_values = 4 * (n_classes - 1)
    cols = ['image_id','label'] + list(range(n_box_values))

    # Collect one record per image and build the frame once at the end;
    # appending rows to a DataFrame inside the loop is quadratic.
    records = []
    # sort=False keeps images in order of first appearance.
    grouped = train_df.groupby('image_id', sort=False)
    for image, df in tqdm(grouped, total=grouped.ngroups):
        label = np.zeros(n_classes)
        label[df.class_id.unique().astype(int)] = 1

        # Average the boxes drawn for the same class on this image.
        bboxes_df = df.groupby('class_id')[box_cols].mean().round()

        bboxes = np.zeros(4 * n_classes)
        for ind, box in bboxes_df.iterrows():
            start = 4 * int(ind)
            bboxes[start:start + 4] = box.values

        record = {'image_id': image, 'label': label}
        # Drop the final class's 4 slots, leaving n_box_values entries.
        record.update(enumerate(bboxes[:n_box_values]))
        records.append(record)

    return pd.DataFrame(records, columns=cols)
train_df = prepareDataFrame()

In [None]:
train_df.head(2)

In [None]:
train_df.shape

### filter just on class-name for now

In [None]:
docs = train_df[train_df.image_id == '03e6ecfa6f6fb33dfeac6ca4f9b459c9']

In [None]:
docs.iloc[0]

In [None]:
docs.iloc[0].label       

In [None]:
classes

In [None]:
train_df.columns

In [None]:
my_cols = ['image_id',    'label']
train_df = train_df[my_cols]


In [None]:
docs = train_df[train_df.image_id == '03e6ecfa6f6fb33dfeac6ca4f9b459c9']

In [None]:
docs

In [None]:
docs.iloc[0].label       

### now split for model

In [None]:
from sklearn.model_selection import KFold


In [None]:
# help(KFold)

In [None]:
def generateFolds(n_splits = None):
    """
    Assign a validation fold number to every row of the module-level `train_df`.

    Writes an integer 'kfold' column into `train_df` in place: rows that land in
    the validation part of split k receive kfold == k.

    Args:
        n_splits (int | None): number of KFold splits; None falls back to 5.
    """
    if n_splits is None:
        n_splits = 5

    kf = KFold(n_splits= n_splits)
    for fold_id, (_, val_idx) in enumerate(kf.split(train_df["image_id"], train_df["label"])):
        # kf.split yields positional indices; translate them to index labels so
        # the assignment is correct even if the index is not 0..n-1.
        train_df.loc[train_df.index[val_idx], 'kfold'] = fold_id
    # astype returns a new Series, so the result must be assigned back.
    train_df["kfold"] = train_df["kfold"].astype(int)

generateFolds(n_splits=5)

In [None]:
train_df.kfold.value_counts()

In [None]:
# train_df.head(2000)

In [None]:
class DataLoader:
    """
    Feeds pre-converted `.npy` images and their class labels to a model.

    Training images are served in endless batches by `flow`; validation images
    are loaded all at once by `getVal`. File names are `<image_id>.npy` and are
    read from `path` by plain string concatenation.
    """
    def __init__(self,path = None,train_df=train_df,val_df=None):
        """
        Args:
            path (str): directory prefix that is prepended to each file name.
            train_df (pd.DataFrame): training rows; needs an 'image_id' column,
                the label in column position 1, and any remaining columns from
                position 2 onwards convertible to float.
            val_df (pd.DataFrame | None): validation rows with the same layout;
                None means no validation set is available.
        """
        self.path = path
        self.df = train_df
        self.val_df = val_df
        self.train_list = [f'{img}.npy' for img in train_df["image_id"].unique()]
        np.random.shuffle(self.train_list)
        if val_df is None:
            # No validation frame supplied: keep an empty list instead of failing here.
            self.test_list = []
        else:
            self.test_list = [f'{img}.npy' for img in val_df["image_id"].unique()]
            np.random.shuffle(self.test_list)

    def _load_sample(self, df, file_name):
        """Load one image file and look up its label and trailing columns in `df`."""
        im_name = file_name.split('.npy')[0]
        image = np.load(self.path+file_name)
        row = df[df.image_id==im_name]
        c_label,bb = row.iloc[0,1],row.iloc[0,2:].values.astype('float')
        return image,c_label,bb

    def read_image(self):
        """Yield (image, class_label, trailing_columns) for every training file."""
        for img in self.train_list:
            yield self._load_sample(self.df, img)

    def batch_generator(self,items,batch_size):
        """Group `items` into lists of `batch_size`; the final list may be shorter."""
        batch = []
        for count, item in enumerate(items, start=1):
            batch.append(item)
            if count % batch_size == 0:
                yield batch
                batch = []
        # Emit the leftover partial batch, if any.
        if batch:
            yield batch

    def flow(self,batch_size):
        """
        flow from given directory in batches
        ==========================================
        batch_size: size of the batch

        Loops over the training files forever, yielding
        (stacked_images, stacked_class_labels) per batch.
        """
        while True:
            for bat in self.batch_generator(self.read_image(),batch_size):
                batch_images = [im for im, _, _ in bat]
                batch_c_labels = [im_c_label for _, im_c_label, _ in bat]
                yield np.stack(batch_images,axis=0),np.stack(batch_c_labels,axis=0)

    def getVal(self):
        """
        Load the whole validation set into memory.

        Returns:
            (np.ndarray, np.ndarray): stacked images and stacked class labels,
            in the order of `self.test_list`.
        Raises:
            ValueError: if there are no validation images to load.
        """
        if not self.test_list:
            raise ValueError("DataLoader has no validation images; pass a non-empty val_df")

        images = []
        c_labels = []
        for img in self.test_list:
            image,c_label,_ = self._load_sample(self.val_df, img)
            images.append(image)
            c_labels.append(c_label)

        return np.stack(images,axis=0),np.stack(c_labels,axis=0)
    

In [None]:
# help(np.stack)

<h1 style="display:inline"><a id="fifth">Model Building and Training</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
import tensorflow as tf
# import tensorflow.keras.layers as L
import tensorflow.keras.backend as K


In [None]:
tf.__version__

In [None]:
from tensorflow.keras import regularizers

In [None]:
# help(regularizers.l2)

In [None]:
from tensorflow.keras.metrics import Recall, Precision

In [None]:
def build_v1():
    """
    Build and compile the first CNN baseline.

    Architecture: two (Conv2D 32x3x3 'same' + ReLU -> 2x2 max-pool) stages on a
    256x256x1 input, flattened into Dense(30) -> Dense(30) -> Dense(15, sigmoid).

    Returns:
        tf.keras.Model: compiled model whose 'class_out' layer emits 15
        independent sigmoid scores.
    """
    in1 = tf.keras.layers.Input(shape=(256,256,1))

    out1 = tf.keras.layers.Conv2D(32,(3,3),
                                  activation="relu",
                                  padding='same')(in1)
    out1 = tf.keras.layers.MaxPooling2D((2,2))(out1)

    out1 = tf.keras.layers.Conv2D(32,(3,3),
                                  activation="relu",
                                  padding='same')(out1)
    out1 = tf.keras.layers.MaxPooling2D((2,2))(out1)

    out1 = tf.keras.layers.Flatten()(out1)

    out2 = tf.keras.layers.Dense(30,activation="relu")(out1)
    out2 = tf.keras.layers.Dense(30,activation="relu")(out2)
    out2 = tf.keras.layers.Dense(15,
                                 activation="sigmoid",
                                 name='class_out',
                                 kernel_regularizer=regularizers.l2(0.01))(out2)

    model = tf.keras.Model(inputs=in1,outputs=out2)
    # The targets are multi-hot vectors scored by independent sigmoids, so each
    # output is its own binary problem: use binary cross-entropy.
    # categorical_crossentropy assumes exactly one true class per sample.
    model.compile(loss={'class_out':'binary_crossentropy'},
                  optimizer="adam",
                  metrics=[Recall(), Precision(), 'accuracy'])


    return model

In [None]:
model = build_v1()

In [None]:
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

<h2>Training Loop</h2>

In [None]:
import os


In [None]:
# def getTest(path=None):
#     images = []
#     for img in tqdm(os.listdir(path)):
#         im_name = img.split('.npy')[0]
#         image = np.load(path+img)
#         images.append(image)
#     return np.stack(images,axis=0)

# # X_test = getTest('../input/xraynumpy/images/test/')

In [None]:
# X_test

In [None]:
# class_label = np.zeros((len(X_test),15))
# bb_label = np.zeros((len(X_test),56))

for fold in range(3):
    print(f'\nFold: {fold}\n')

    # Hold out rows with kfold == fold for validation; train on the rest.
    X_train = train_df[train_df.kfold!=fold]
    X_val = train_df[train_df.kfold==fold]
    print('X_train.shape=',  X_train.shape)
    print('X_train.head()=',  X_train.head())

    print('-----------\n')

    print('X_val.shape=',  X_val.shape)
    print('X_val.head()=',  X_val.head())

    print('-----------\n')
    batch_size = 32
    dl = DataLoader('../input/xraynumpy/images/train/',X_train,X_val)
    train_set = dl.flow(batch_size=batch_size)

    X_eval,Y_eval = dl.getVal()
    print('X_eval.shape=', X_eval.shape)
    print('Y_eval.shape=', Y_eval.shape)

    chckpt = tf.keras.callbacks.ModelCheckpoint(f'./model_f{fold}.hdf5',monitor='val_loss',mode='min',save_best_only=True)

    K.clear_session()
    model = build_v1()

    # dl.flow() cycles endlessly, so Keras needs the number of batches that make
    # up one pass. Derive it from the images this loader actually holds rather
    # than from a hard-coded dataset size.
    steps_per_epoch = int(np.ceil(len(dl.train_list) / batch_size))

    print('-----------\n')
    model.fit(train_set,
              epochs=10,
              steps_per_epoch=steps_per_epoch,
              validation_data = (X_eval,Y_eval),
              callbacks = [chckpt]
             )

    # Only the first fold is trained for now.
    break
    


In [None]:
chckpt

In [None]:
ls {'./model_f0.hdf5'}

In [None]:
model

In [None]:
# model.load_weights('./model_f0.hdf5')

In [None]:
# model.summary()

In [None]:
# test

In [None]:
# Y_eval.shape

In [None]:
# Y_eval[0]

In [None]:
c = model.predict(X_eval)


### now predict

In [None]:
# X_test.shape
X_eval.shape
# X_val.shape

In [None]:
# c = model.predict(X_test)
c = model.predict(X_eval)



In [None]:
c.shape

In [None]:
c[0]

In [None]:
Y_eval[0]

In [None]:
X_val.iloc[0]

In [None]:
classes

In [None]:
showXray('../input/vinbigdata-chest-xray-abnormalities-detection/train/9a5094b2563a1ef3ff50dc5c7ff71345.dicom',train,with_boxes=True)

In [None]:
#     class_label+=c
#     bb_label+=b
# class_label = class_label/5
# bb_label = bb_label/5
# np.save('./class_label.npy',class_label)
# np.save('./bb_label.npy',bb_label)

### get perf metrics per class

In [None]:
y = Y_eval

In [None]:
y_pred = model.predict(X_eval)


In [None]:
def get_true_pos(y, pred, th=0.5):
    """Count entries predicted positive (pred > th) whose ground truth is 1."""
    predicted_positive = pred > th
    return np.sum(np.logical_and(predicted_positive, y == 1))


def get_true_neg(y, pred, th=0.5):
    """Count entries not predicted positive (pred > th is False) whose ground truth is 0."""
    predicted_negative = np.logical_not(pred > th)
    return np.sum(np.logical_and(predicted_negative, y == 0))


def get_false_neg(y, pred, th=0.5):
    """Count entries not predicted positive (pred > th is False) whose ground truth is 1."""
    predicted_negative = np.logical_not(pred > th)
    return np.sum(np.logical_and(predicted_negative, y == 1))


def get_false_pos(y, pred, th=0.5):
    """Count entries predicted positive (pred > th) whose ground truth is 0."""
    predicted_positive = pred > th
    return np.sum(np.logical_and(predicted_positive, y == 0))

In [None]:
def true_positives(y, pred, th=0.5):
    """
    Count true positives.

    Args:
        y (np.array): ground truth, size (n_examples)
        pred (np.array): model output, size (n_examples)
        th (float): cutoff; scores >= th count as a positive prediction
    Returns:
        TP (int): number of examples with y == 1 that are predicted positive
    """
    predicted_positive = pred >= th
    return np.sum(np.logical_and(y == 1, predicted_positive))

def true_negatives(y, pred, th=0.5):
    """
    Count true negatives.

    Args:
        y (np.array): ground truth, size (n_examples)
        pred (np.array): model output, size (n_examples)
        th (float): cutoff; scores >= th count as a positive prediction
    Returns:
        TN (int): number of examples with y == 0 that are predicted negative
    """
    predicted_negative = np.logical_not(pred >= th)
    return np.sum(np.logical_and(y == 0, predicted_negative))

def false_positives(y, pred, th=0.5):
    """
    Count false positives.

    Args:
        y (np.array): ground truth, size (n_examples)
        pred (np.array): model output, size (n_examples)
        th (float): cutoff; scores >= th count as a positive prediction
    Returns:
        FP (int): number of examples with y == 0 that are predicted positive
    """
    predicted_positive = pred >= th
    return np.sum(np.logical_and(y == 0, predicted_positive))

def false_negatives(y, pred, th=0.5):
    """
    Count false negatives.

    Args:
        y (np.array): ground truth, size (n_examples)
        pred (np.array): model output, size (n_examples)
        th (float): cutoff; scores >= th count as a positive prediction
    Returns:
        FN (int): number of examples with y == 1 that are predicted negative
    """
    predicted_negative = np.logical_not(pred >= th)
    return np.sum(np.logical_and(y == 1, predicted_negative))

In [None]:
def get_accuracy(y, pred, th=0.5):
    """
    Compute accuracy of predictions at threshold.

    Args:
        y (np.array): ground truth, size (n_examples)
        pred (np.array): model output, size (n_examples)
        th (float): cutoff value for positive prediction from model
    Returns:
        accuracy (float): (TP + TN) / (TP + TN + FP + FN) at the given threshold
    """
    # Confusion-matrix counts from the helper functions defined earlier.
    tp_count = true_positives(y, pred, th)
    fp_count = false_positives(y, pred, th)
    tn_count = true_negatives(y, pred, th)
    fn_count = false_negatives(y, pred, th)

    correct = tp_count + tn_count
    total = tp_count + tn_count + fp_count + fn_count
    return correct / total

In [None]:
def get_performance_metrics(y, 
                            pred, 
                            class_labels, 
                            tp=get_true_pos,
                            tn=get_true_neg, 
                            fp=get_false_pos,
                            fn=get_false_neg,
                            acc=None, 
                            prevalence=None, 
                            spec=None,
                            sens=None, 
                            ppv=None, 
                            npv=None, 
                            auc=None, 
                            f1=None,
                            thresholds=[]):
    """
    Build a per-class table of classification metrics.

    Args:
        y (np.array): ground truth, indexed as y[:, i] for class i
        pred (np.array): model scores, indexed as pred[:, i] for class i
        class_labels (list): one display name per class column
        tp, tn, fp, fn: callables invoked as f(y_i, pred_i), or None
        acc, sens, spec, ppv, npv: callables invoked as f(y_i, pred_i, th), or None
        prevalence: callable invoked as f(y_i), or None
        auc: callable invoked as f(y_i, pred_i), or None
        f1: callable invoked as f(y_i, pred_i > th), or None
        thresholds (list): one cutoff per class; anything of a different
            length is replaced by 0.5 for every class
    Returns:
        pd.DataFrame: one row per class, indexed by class label. Metrics whose
        callable is None show "Not Defined"; so does AUC when the callable
        raises ValueError (e.g. only one label value present in the column).
    """
    if len(thresholds) != len(class_labels):
        thresholds = [.5] * len(class_labels)

    columns = ["", "TP", "TN", "FP", "FN", "Accuracy", "Prevalence",
               "Sensitivity",
               "Specificity", "PPV", "NPV", "AUC", "F1", "Threshold"]

    def _metric(func, *args):
        """Return func(*args) rounded to 3 places, or "Not Defined" if func is None."""
        if func is None:
            return "Not Defined"
        return round(func(*args), 3)

    # Build complete rows and create the frame once; writing cells through
    # chained `df.loc[i][k] = ...` relies on pandas returning a view.
    rows = []
    for i, class_label in enumerate(class_labels):
        y_i, pred_i, th_i = y[:, i], pred[:, i], thresholds[i]

        # NOTE(review): the four counts are called without th_i, so they use
        # each callable's own default cutoff, not the "Threshold" column value.
        row = [class_label,
               _metric(tp, y_i, pred_i),
               _metric(tn, y_i, pred_i),
               _metric(fp, y_i, pred_i),
               _metric(fn, y_i, pred_i),
               _metric(acc, y_i, pred_i, th_i),
               _metric(prevalence, y_i),
               _metric(sens, y_i, pred_i, th_i),
               _metric(spec, y_i, pred_i, th_i),
               _metric(ppv, y_i, pred_i, th_i),
               _metric(npv, y_i, pred_i, th_i)]
        try:
            row.append(_metric(auc, y_i, pred_i))
        except ValueError:
            # AUC cannot be computed for this column; report it like a missing metric.
            row.append("Not Defined")
        row.append(_metric(f1, y_i, pred_i > th_i))
        row.append(round(th_i, 3))
        rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    df = df.set_index("")
    return df

In [None]:
classes

In [None]:
class_labels = list(classes.values())

In [None]:
class_labels

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

In [None]:
get_performance_metrics(y, 
                        y_pred, 
                        class_labels, 
                        acc=get_accuracy, 
                        auc=roc_auc_score,
                        f1=f1_score)

### now try different and simpler model

In [None]:
def build_v2():
    """
    Build the second, simpler CNN (returned uncompiled).

    Two (Conv2D 64x3x3 + ReLU -> 2x2 max-pool) stages on a 256x256x1 input,
    flattened into Dense(128, relu) -> Dense(15, sigmoid).
    """
    layers = tf.keras.layers

    image_in = layers.Input(shape=(256,256,1))

    features = layers.Conv2D(64,(3,3),activation="relu")(image_in)
    features = layers.MaxPooling2D((2,2))(features)

    features = layers.Conv2D(64,(3,3),activation="relu")(features)
    features = layers.MaxPooling2D((2,2))(features)

    features = layers.Flatten()(features)

    hidden = layers.Dense(128,activation="relu")(features)
    scores = layers.Dense(15,activation="sigmoid")(hidden)

    return tf.keras.Model(inputs=image_in,outputs=scores)

In [None]:
model2 = build_v2()

In [None]:
# model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [None]:
model2.summary()


In [None]:
# class_label = np.zeros((len(X_test),15))
# bb_label = np.zeros((len(X_test),56))

for fold in range(3):
    print(f'\nFold: {fold}\n')

    # Hold out rows with kfold == fold for validation; train on the rest.
    X_train = train_df[train_df.kfold!=fold]
    X_val = train_df[train_df.kfold==fold]
    print('X_train.shape=',  X_train.shape)
    print('X_train.head()=',  X_train.head())

    print('-----------\n')

    print('X_val.shape=',  X_val.shape)
    print('X_val.head()=',  X_val.head())

    print('-----------\n')
    batch_size = 32
    dl = DataLoader('../input/xraynumpy/images/train/',X_train,X_val)
    train_set = dl.flow(batch_size=batch_size)

    X_eval,Y_eval = dl.getVal()
    print('X_eval.shape=', X_eval.shape)
    print('Y_eval.shape=', Y_eval.shape)

    chckpt = tf.keras.callbacks.ModelCheckpoint(f'./model2_f{fold}.hdf5',monitor='val_loss',mode='min',save_best_only=True)

    # NOTE(review): model2 was built before this call; confirm that clearing
    # the Keras session here is intended.
    K.clear_session()

    # dl.flow() cycles endlessly, so Keras needs the number of batches that make
    # up one pass. Derive it from the images this loader actually holds rather
    # than from a hard-coded dataset size.
    steps_per_epoch = int(np.ceil(len(dl.train_list) / batch_size))

    print('-----------\n')
    model2.fit(train_set,
              epochs=10,
              steps_per_epoch=steps_per_epoch,
              validation_data = (X_eval,Y_eval),
              callbacks = [chckpt]
             )

    # Only the first fold is trained for now.
    break
    


In [None]:
# model2.predict(X_eval)
model2.evaluate(X_eval, Y_eval)


In [None]:
y = Y_eval

In [None]:
y_pred2 = model2.predict(X_eval)


In [None]:
get_performance_metrics(y, 
                        y_pred2, 
                        class_labels, 
                        acc=get_accuracy, 
                        auc=roc_auc_score,
                        f1=f1_score)

### retrain model after changing compile method

https://datascience.stackexchange.com/questions/25752/how-does-keras-calculate-accuracy-for-multi-label-classification

In [None]:
model2 = build_v2()

In [None]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# class_label = np.zeros((len(X_test),15))
# bb_label = np.zeros((len(X_test),56))

for fold in range(3):
    print(f'\nFold: {fold}\n')

    # Hold out rows with kfold == fold for validation; train on the rest.
    X_train = train_df[train_df.kfold!=fold]
    X_val = train_df[train_df.kfold==fold]
    print('X_train.shape=',  X_train.shape)
    print('X_train.head()=',  X_train.head())

    print('-----------\n')

    print('X_val.shape=',  X_val.shape)
    print('X_val.head()=',  X_val.head())

    print('-----------\n')
    batch_size = 32
    dl = DataLoader('../input/xraynumpy/images/train/',X_train,X_val)
    train_set = dl.flow(batch_size=batch_size)

    X_eval,Y_eval = dl.getVal()
    print('X_eval.shape=', X_eval.shape)
    print('Y_eval.shape=', Y_eval.shape)

    chckpt = tf.keras.callbacks.ModelCheckpoint(f'./model2_f{fold}.hdf5',monitor='val_loss',mode='min',save_best_only=True)

    # NOTE(review): model2 was built before this call; confirm that clearing
    # the Keras session here is intended.
    K.clear_session()

    # dl.flow() cycles endlessly, so Keras needs the number of batches that make
    # up one pass. Derive it from the images this loader actually holds rather
    # than from a hard-coded dataset size.
    steps_per_epoch = int(np.ceil(len(dl.train_list) / batch_size))

    print('-----------\n')
    model2.fit(train_set,
              epochs=10,
              steps_per_epoch=steps_per_epoch,
              validation_data = (X_eval,Y_eval),
              callbacks = [chckpt]
             )

    # Only the first fold is trained for now.
    break
    


In [None]:
y_pred2 = model2.predict(X_eval)


In [None]:
# Score against the Y_eval returned by the most recent DataLoader: every new
# DataLoader reshuffles its validation list, so labels captured after an
# earlier training run are in a different row order than these predictions.
get_performance_metrics(Y_eval, 
                        y_pred2, 
                        class_labels, 
                        acc=get_accuracy, 
                        auc=roc_auc_score,
                        f1=f1_score)

In [None]:
model2b = build_v2()

In [None]:
model2b.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])


In [None]:
# class_label = np.zeros((len(X_test),15))
# bb_label = np.zeros((len(X_test),56))

for fold in range(3):
    print(f'\nFold: {fold}\n')

    # Hold out rows with kfold == fold for validation; train on the rest.
    X_train = train_df[train_df.kfold!=fold]
    X_val = train_df[train_df.kfold==fold]
    print('X_train.shape=',  X_train.shape)
    print('X_train.head()=',  X_train.head())

    print('-----------\n')

    print('X_val.shape=',  X_val.shape)
    print('X_val.head()=',  X_val.head())

    print('-----------\n')
    batch_size = 32
    dl = DataLoader('../input/xraynumpy/images/train/',X_train,X_val)
    train_set = dl.flow(batch_size=batch_size)

    X_eval,Y_eval = dl.getVal()
    print('X_eval.shape=', X_eval.shape)
    print('Y_eval.shape=', Y_eval.shape)

    # Give model2b its own checkpoint file so it does not overwrite the
    # checkpoint saved for model2.
    chckpt = tf.keras.callbacks.ModelCheckpoint(f'./model2b_f{fold}.hdf5',monitor='val_loss',mode='min',save_best_only=True)

    # NOTE(review): model2b was built before this call; confirm that clearing
    # the Keras session here is intended.
    K.clear_session()

    # dl.flow() cycles endlessly, so Keras needs the number of batches that make
    # up one pass. Derive it from the images this loader actually holds rather
    # than from a hard-coded dataset size.
    steps_per_epoch = int(np.ceil(len(dl.train_list) / batch_size))

    print('-----------\n')
    model2b.fit(train_set,
              epochs=10,
              steps_per_epoch=steps_per_epoch,
              validation_data = (X_eval,Y_eval),
              callbacks = [chckpt]
             )

    # Only the first fold is trained for now.
    break
    


In [None]:
y_pred2b = model2b.predict(X_eval)


In [None]:
# Score against the Y_eval returned by the most recent DataLoader: every new
# DataLoader reshuffles its validation list, so labels captured after an
# earlier training run are in a different row order than these predictions.
df = get_performance_metrics(Y_eval, 
                        y_pred2b, 
                        class_labels, 
                        acc=get_accuracy, 
                        auc=roc_auc_score,
                        f1=f1_score)

# Work in Progress....
<h2 style="color:blue">To Do:</h2>
<ul>
    <li><h2 style="color:blue">1.Implement submission pipeline</h2></li>
</ul>