In [None]:
import numpy as np 
import pandas as pd 
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras import Model
from tqdm import notebook

In [None]:
# Look for a TPU. A ValueError from the resolver is treated as
# "no TPU available" and leaves `tpu` set to None.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if not tpu:
    # No TPU: keep whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Number of replicas the chosen strategy keeps in sync.
REPLICAS = strategy.num_replicas_in_sync
AUTO = tf.data.experimental.AUTOTUNE
print(f'REPLICAS: {REPLICAS}')

In [None]:
# --- Hyper-parameters --------------------------------------------------------
EPOCHS = 2
BATCH_SIZE = 32
IMAGE_SIZE256 = [256, 256]
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=0.001)

# --- Image folders (256x256 PNG copies of the scans) -------------------------
train_dir256 = "../input/vinbigdata-chest-xray-resized-png-256x256/train"
test_dir256 = "../input/vinbigdata-chest-xray-resized-png-256x256/test"

# --- Annotation table and submission template --------------------------------
train = pd.read_csv("../input/vinbigdata-chest-xray-abnormalities-detection/train.csv")
test = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/sample_submission.csv')

# File name of every image: its id plus the ".png" extension.
train['image_png'] = train['image_id'] + '.png'
test['image_png'] = test['image_id'] + '.png'

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly: wrapping it in print() only adds a "None" line.
train.info()
train.head()

In [None]:
# One row per image file: how many of its annotation rows carry class_id 14
# (the class this notebook counts as a "normal" annotation).
is_fold_train = (
    train
    .groupby("image_png")["class_id"]
    .agg(lambda class_ids: (class_ids == 14).sum())
    .reset_index()
    .rename(columns={"class_id": "num_normal_annotations"})
)
is_fold_train.head()

In [None]:
def change(x):
    """Map the value 3 to 1 and return every other value unchanged."""
    return 1 if x == 3 else x

# Derive the target from the per-image count of "normal" annotations:
# a count of 3 becomes 1, any other count is kept as it is.
is_fold_train['target'] = is_fold_train['num_normal_annotations'].apply(change)
is_fold_train.head()

In [None]:
# Shuffled, seeded 5-way split stratified on the target column.
skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Record, for every image, the number of the fold in which it is held out.
for num_fold, (train_index, val_index) in enumerate(
        skfolds.split(is_fold_train, is_fold_train['target'])):
    is_fold_train.loc[val_index, 'fold'] = int(num_fold)

# The target is stored as a string from here on.
is_fold_train['target'] = is_fold_train['target'].astype('str')

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly: wrapping it in print() only adds a "None" line.
is_fold_train.info()
is_fold_train.head()

In [None]:
# Random augmentations applied to training images.
_train_augmentations = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=False,
)
datagen_train = ImageDataGenerator(**_train_augmentations)

# No augmentation for evaluation data; only a validation_split of 0.2 is configured.
datagen_test = ImageDataGenerator(validation_split=0.2)

In [None]:
# Layer-based augmentation pipeline: a random flip along both axes followed
# by a random rotation with factor 0.2.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"))
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomRotation(0.2))

In [None]:
with strategy.scope():

    class MyModel(Model):
        """Two-output classifier: a pooled CNN backbone followed by a dense head.

        Parameters
        ----------
        appl : bool
            Backbone selector. Truthy selects an ImageNet-initialised
            EfficientNetB0 with global average pooling; falsy selects a
            randomly initialised DenseNet121 with global max pooling.
        training : bool, default True
            When truthy, the dropout layer is placed between the backbone
            and the dense head.

        Attributes
        ----------
        d1_0, d1_1 :
            The EfficientNetB0 / DenseNet121 backbone. Only the one selected
            by ``appl`` is built; the other attribute is ``None``.
        """

        def __init__(self, appl, training=True):
            super(MyModel, self).__init__()

            # Build only the backbone that `call` will use, so the model does
            # not carry a second, never-trained network.
            if appl:
                self.d1_0 = tf.keras.applications.EfficientNetB0(
                    input_shape=(*IMAGE_SIZE256, 3),
                    weights='imagenet', pooling='avg',
                    include_top=False)
                self.d1_1 = None
            else:
                self.d1_0 = None
                # include_top=False is required for `pooling` to take effect;
                # without it the network ends in its own 1000-way softmax
                # classifier instead of pooled convolutional features.
                self.d1_1 = tf.keras.applications.DenseNet121(
                    input_shape=(*IMAGE_SIZE256, 3),
                    weights=None, pooling='max',
                    include_top=False)

            # Classification head.
            self.d2 = tf.keras.layers.Dense(150, activation='relu')
            self.d3 = tf.keras.layers.Dense(2, activation='sigmoid')
            self.dropout = tf.keras.layers.Dropout(0.15)

            self.training = training
            self.appl = appl

        def call(self, x):
            """Run `x` through the selected backbone, optional dropout and the head."""
            backbone = self.d1_0 if self.appl else self.d1_1
            x = backbone(x)

            if self.training:
                x = self.dropout(x)
            x = self.d2(x)
            return self.d3(x)
    
    
# model1 uses the EfficientNetB0 backbone, model2 the DenseNet121 backbone.
# NOTE(review): both models are created at top level, not inside a
# `strategy.scope()` block — confirm that this is intended when a TPU is used.
model1, model2 = MyModel(appl=True), MyModel(appl=False)

# Identical compile settings for both models; each gets its own metric object.
# NOTE(review): the single OPTIMIZER instance is shared by both models —
# confirm that sharing optimizer state is intended.
for _classifier in (model1, model2):
    _classifier.compile(
        optimizer=OPTIMIZER,
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )

In [None]:
# Per-fold Keras history curves are collected in these lists.
loss_history, val_loss_history = [], []
binary_accuracy_history, val_binary_accuracy_history = [], []

In [None]:
# Fit model1 on every fold: train on the four other folds, validate on the
# held-out one, and collect the per-epoch history curves.
# NOTE(review): model1 is not re-initialised between folds, so in later folds
# the validation images have already been trained on in earlier folds.
for fold_n in range(5):
    print('Fold #{}'.format(fold_n+1))

    train_data = is_fold_train[is_fold_train.fold != fold_n]
    val_data = is_fold_train[is_fold_train.fold == fold_n]

    # The fold column already separates training from validation rows, so no
    # `subset` is requested and each generator serves its whole frame.
    train_dataset = datagen_train.flow_from_dataframe(
        train_data,
        directory = train_dir256,
        x_col = "image_png",
        y_col = "target",
        shuffle=True,
        batch_size=BATCH_SIZE)

    # Previously `subset="validation"` combined with the generator's
    # validation_split=0.2 kept only a fifth of the held-out fold.
    valid_dataset = datagen_test.flow_from_dataframe(
        val_data,
        directory = train_dir256,
        x_col = "image_png",
        y_col = "target",
        shuffle=True,
        batch_size=BATCH_SIZE)

    model_fit = model1.fit(train_dataset, validation_data=valid_dataset, epochs=EPOCHS)

    loss_history.append(model_fit.history['loss'])
    val_loss_history.append(model_fit.history['val_loss'])
    binary_accuracy_history.append(model_fit.history['binary_accuracy'])
    val_binary_accuracy_history.append(model_fit.history['val_binary_accuracy'])


# Flatten the per-fold lists into single curves (fold after fold) for plotting.
lh = [item for sublist in loss_history for item in sublist]
vlh = [item for sublist in val_loss_history for item in sublist]

bah = [item for sublist in binary_accuracy_history for item in sublist]
vbah = [item for sublist in val_binary_accuracy_history for item in sublist]

In [None]:
# One figure per tracked quantity: training curve against validation curve,
# with the epochs of all folds laid end to end.
for _title, _train_curve, _val_curve in (
        ('loss', lh, vlh),
        ('binary_accuracy', bah, vbah)):
    plt.plot(_train_curve, label='train')
    plt.plot(_val_curve, label='test')
    plt.title(_title)
    plt.legend()
    plt.show()

In [None]:
# State used by the hand-written training loop (train_step / test_step).

# Values returned by the loss metrics, appended once per step.
train_loss_history, test_loss_history = [], []

# Running metrics for the training batches ...
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

# ... and for the validation batches.
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

#@tf.function
def train_step(images, labels):
    """Run one optimisation step of model1 on a single batch.

    Computes the binary cross-entropy of model1's predictions, applies the
    gradients to model1's trainable variables with the global OPTIMIZER, and
    updates the running `train_loss` / `train_accuracy` metrics.

    Args:
        images: batch of input images fed to model1.
        labels: targets matching model1's output for that batch.

    Side effects:
        Appends the value returned by the `train_loss` metric update to
        `train_loss_history`. Because it appends to a Python list, this
        function is meant to run eagerly.
    """
    with tf.GradientTape() as tape:
        predictions = model1(images, training=True)
        loss_value = tf.keras.losses.binary_crossentropy(labels, predictions)

    # Gradients are taken with respect to, and applied to, the same variable
    # list (the original applied model1's gradients to model2's variables).
    trainable_vars = model1.trainable_variables
    grads = tape.gradient(loss_value, trainable_vars)
    OPTIMIZER.apply_gradients(zip(grads, trainable_vars))

    # Update the loss metric exactly once per batch and log what it returns.
    train_loss_history.append(train_loss(loss_value))
    train_accuracy(labels, predictions)
    
    
#@tf.function
def test_step(images, labels):
    """Evaluate model1 on a single validation batch.

    Updates the running `test_loss` / `test_accuracy` metrics with the binary
    cross-entropy and accuracy of model1's predictions.

    Args:
        images: batch of input images fed to model1.
        labels: targets matching model1's output for that batch.

    Side effects:
        Appends the value returned by the `test_loss` metric update to
        `test_loss_history`.
    """
    # Evaluate the model that `train_step` optimises (model1); the original
    # evaluated model2, which the custom loop's forward pass never uses.
    predictions = model1(images, training=False)

    t_loss = tf.keras.losses.binary_crossentropy(labels, predictions)

    # Update the loss metric exactly once per batch and log what it returns.
    test_loss_history.append(test_loss(t_loss))
    test_accuracy(labels, predictions)

In [None]:
def train(epochs):
    """Run the hand-written training loop over all five folds.

    For each fold, trains with `train_step` on the rows outside the fold and
    evaluates with `test_step` on the rows inside it, printing the running
    metrics after every epoch.

    Args:
        epochs: number of passes over the training rows per fold.

    NOTE(review): defining this function rebinds the notebook-level name
    `train`, which earlier cells use for the annotation DataFrame.
    """
    tracked_metrics = (train_loss, train_accuracy, test_loss, test_accuracy)
    template = 'Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'

    for fold_n in range(5):
        print('Fold #{}'.format(fold_n+1))

        train_data = is_fold_train[is_fold_train.fold != fold_n]
        val_data = is_fold_train[is_fold_train.fold == fold_n]

        # Built once per fold: the generators reshuffle themselves each time
        # they wrap around, so they need not be recreated every epoch.
        # No `subset` is requested: the fold column already separates the
        # rows, and `subset="validation"` would keep only 20% of the fold.
        train_dataset = datagen_train.flow_from_dataframe(
            train_data,
            directory = train_dir256,
            x_col = "image_png",
            y_col = "target",
            shuffle=True,
            batch_size=BATCH_SIZE
        )

        valid_dataset = datagen_test.flow_from_dataframe(
            val_data,
            directory = train_dir256,
            x_col = "image_png",
            y_col = "target",
            shuffle=True,
            batch_size=BATCH_SIZE
        )

        for epoch in range(epochs):
            print('Epoch: {}'.format(epoch+1))

            # Start every epoch from empty metrics so the printed numbers
            # describe this epoch only.
            for metric in tracked_metrics:
                # `reset_state` is the newer spelling, `reset_states` the older.
                reset = getattr(metric, 'reset_state', None) or metric.reset_states
                reset()

            # Keras generators yield batches forever, so pull exactly
            # len(generator) batches, i.e. one pass over the data.
            for _ in notebook.tqdm(range(len(train_dataset))):
                images, labels = next(train_dataset)
                train_step(images, labels)

            for _ in range(len(valid_dataset)):
                test_images, test_labels = next(valid_dataset)
                test_step(test_images, test_labels)

            print(template.format(
                             train_loss.result(),
                             train_accuracy.result()*100,
                             test_loss.result(),
                             test_accuracy.result()*100
            ))
            

In [None]:
# train(1)  # disabled: uncomment to run the hand-written training loop for one epoch per fold

In [None]:
# Start from empty history lists before fitting the second model.
loss_history, val_loss_history = [], []
binary_accuracy_history, val_binary_accuracy_history = [], []

In [None]:
# Fit model2 on every fold: train on the four other folds, validate on the
# held-out one, and collect the per-epoch history curves.
# NOTE(review): model2 is not re-initialised between folds, so in later folds
# the validation images have already been trained on in earlier folds.
for fold_n in range(5):
    print('Fold #{}'.format(fold_n+1))

    train_data = is_fold_train[is_fold_train.fold != fold_n]
    val_data = is_fold_train[is_fold_train.fold == fold_n]

    # The fold column already separates training from validation rows, so no
    # `subset` is requested and each generator serves its whole frame.
    train_dataset = datagen_train.flow_from_dataframe(
        train_data,
        directory = train_dir256,
        x_col = "image_png",
        y_col = "target",
        shuffle=True,
        batch_size=BATCH_SIZE)

    # Previously `subset="validation"` combined with the generator's
    # validation_split=0.2 kept only a fifth of the held-out fold.
    valid_dataset = datagen_test.flow_from_dataframe(
        val_data,
        directory = train_dir256,
        x_col = "image_png",
        y_col = "target",
        shuffle=True,
        batch_size=BATCH_SIZE)

    model_fit = model2.fit(train_dataset, validation_data=valid_dataset, epochs=EPOCHS)

    loss_history.append(model_fit.history['loss'])
    val_loss_history.append(model_fit.history['val_loss'])
    binary_accuracy_history.append(model_fit.history['binary_accuracy'])
    val_binary_accuracy_history.append(model_fit.history['val_binary_accuracy'])


# Flatten the per-fold lists into single curves (fold after fold) for plotting.
lh = [item for sublist in loss_history for item in sublist]
vlh = [item for sublist in val_loss_history for item in sublist]

bah = [item for sublist in binary_accuracy_history for item in sublist]
vbah = [item for sublist in val_binary_accuracy_history for item in sublist]

In [None]:
# One figure per tracked quantity: training curve against validation curve,
# with the epochs of all folds laid end to end.
for _title, _train_curve, _val_curve in (
        ('loss', lh, vlh),
        ('binary_accuracy', bah, vbah)):
    plt.plot(_train_curve, label='train')
    plt.plot(_val_curve, label='test')
    plt.title(_title)
    plt.legend()
    plt.show()

In [None]:
# Generator over the test images. With class_mode='raw' the "label" of each
# image is its image_id, so every prediction can be matched to its id.
# The batch size is the length of the test table (instead of a hard-coded
# 3000), so the single batch drawn afterwards covers every test row.
# shuffle=False keeps the rows in the order of the test table, which makes
# the submission order reproducible.
test_dataset = ImageDataGenerator().flow_from_dataframe(
                                test,
                                directory = test_dir256,
                                x_col = "image_png",
                                class_mode='raw',
                                y_col = "image_id",
                                shuffle=False,
                                batch_size=max(1, len(test)))

In [None]:
# Draw one batch from the test generator: images in x, their image ids in y.
x, y = next(test_dataset)

In [None]:
# Shape of the loaded image batch.
print(np.shape(x))

In [None]:
# Ensemble both models: add their outputs and take, for every image, the
# index of the larger summed score.
preds1, preds2 = model1.predict(x), model2.predict(x)
predictions = (preds1 + preds2).argmax(axis=-1)

In [None]:
# Pair every image id with its predicted class index.
submission = pd.DataFrame(
    data={'image_id': y, 'label': predictions},
    columns=['image_id', 'label'],
)

In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)