In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf

from keras.datasets import mnist
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
from keras.callbacks import TensorBoard as tb
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from sklearn.datasets import load_files

from tensorflow.keras.preprocessing import image

import keras.layers as Layers
from PIL import Image

import datetime, os
import random

## Algumas Funções

In [None]:
# Load an image file and convert it to a scaled array
def lendo_img(filename, size, path):
    """Read an image from disk and return it as an array divided by 255.

    Args:
        filename: name of the image file inside ``path``.
        size: value forwarded to ``image.load_img`` as ``target_size``.
        path: directory that contains the file.

    Returns:
        The output of ``image.img_to_array`` for the loaded image, divided by 255.
    """
    caminho = os.path.join(path, filename)
    pixels = image.img_to_array(image.load_img(caminho, target_size=size))
    # Rescale the pixel intensities.
    return pixels / 255


## Leitura dos arquivos

In [None]:
# Walk every directory under the Kaggle input folder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # NOTE(review): the joined path is built but neither stored nor printed,
        # so this loop has no visible effect while the print below stays commented out.
        os.path.join(dirname, filename)
        #print(os.path.join(dirname, filename))

In [None]:
# Open the metadata CSV as a DataFrame and preview its first rows

df = pd.read_csv('../input/coronahack-chest-xraydataset/Chest_xray_Corona_Metadata.csv')
df.head()

No dataset, temos:
- **X_ray_image_name**: nome do arquivo da imagem de raio-X;
- **Label**: rótulo da imagem, 'normal' ou 'pneumonia';
- **Dataset_type**: indica se a imagem pertence ao conjunto de treino ou de teste;
- **Label_2_Virus_category**: informação mais específica sobre o agente ou a condição detectada [nan, 'Streptococcus', 'COVID-19', 'ARDS', 'SARS'];
- **Label_1_Virus_category**: informação sobre o tipo de agente ou causa [nan, 'Virus', 'bacteria', 'Stress-Smoking'].

### Separando treino e teste

In [None]:
# Split the metadata rows by the Dataset_type column.
# .copy() makes each split an independent DataFrame, so adding columns to it
# later in the notebook does not raise pandas' SettingWithCopyWarning.
treino = df.loc[df.Dataset_type=='TRAIN'].copy()
teste = df.loc[df.Dataset_type=='TEST'].copy()

In [None]:
# Report the shape of the train and test metadata frames.
print('Treino: ',treino.shape)
print('Teste: ',teste.shape)

In [None]:
# Features: every metadata column except Label.
# Target: the Label column alone, kept as a one-column DataFrame.
x_treino = treino.drop(columns=['Label'])
y_treino = treino.loc[:, ['Label']]

x_teste = teste.drop(columns=['Label'])
y_teste = teste.loc[:, ['Label']]

## Limpeza simples

In [None]:
# Count the null values in each column
df.isnull().sum()

In [None]:
# Fill the missing values of the virus-label columns with 'unknown'
df.fillna('unknown', inplace=True)

In [None]:
# Show the column datatypes
df.info()

In [None]:
# Frequency of each value in the two virus-category columns,
# separated by a line of 50 '=' characters.
print(df['Label_1_Virus_category'].value_counts())
print('='*50)
print(df['Label_2_Virus_category'].value_counts())

> Temos mais informações sobre vírus na variável Label_1_Virus_category.

## Visualizando as imagens

- Vamos olhar algumas informações das imagens, além das próprias imagens de raio-X.

In [None]:
# Preview the first three metadata rows.
df.head(3)

In [None]:
# Paths to the train and test image directories
treino_img_path = '../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train/'
teste_img_path = '../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/test/'

In [None]:
# Show a random sample of training images

# Draw 10 random row positions in the range 0 .. len(treino)-1.
amostra = np.random.randint(len(treino), size=(10))

fig, axs = plt.subplots(2,5, figsize=(25, 9))
for i, ax in zip(amostra,axs.flat):
    # `amostra` holds positions, not index labels, so the row is read with
    # .iloc. A label lookup (treino.X_ray_image_name[i]) raises KeyError or
    # picks another row whenever the index of `treino` is not exactly 0..len-1.
    linha = treino.iloc[i]
    ax.imshow(Image.open(treino_img_path + linha.X_ray_image_name),cmap = 'bone')
    ax.set_title(str(linha.Label))

In [None]:
# Show a random sample of test images

# Pick 10 index labels of the test frame (with replacement).
amostra = random.choices(teste.index, k=10)

fig, axs = plt.subplots(2,5, figsize=(25, 9))
for i, ax in zip(amostra,axs.flat):
    nome_arquivo = teste.loc[i, 'X_ray_image_name']
    ax.imshow(Image.open(teste_img_path + nome_arquivo),cmap = 'bone')
    ax.set_title(str(teste.loc[i, 'Label']))

### Comparando os raios-x

In [None]:
# Index labels of the training rows whose images are plotted side by side.
comparacao = [1,9,5217,5220,5221,5225]

In [None]:
# Compare X-ray images: healthy, Streptococcus pneumonia and COVID pneumonia.

fig, axs = plt.subplots(3,2, figsize=(10, 12))
for i, ax in zip(comparacao,axs.flat):
    ax.imshow(Image.open(treino_img_path + treino.loc[i, 'X_ray_image_name']),cmap = 'bone')
    # Title: the label followed by the second virus category of the same row.
    rotulo = treino.loc[i, 'Label']
    categoria = treino.loc[i, 'Label_2_Virus_category']
    ax.set_title(f"{rotulo} {categoria}")

In [None]:
# Class balance of the training set.
treino['Label'].value_counts()

In [None]:
# Class balance of the test set.
teste['Label'].value_counts()

In [None]:
# Drop the name `df`; the following cells work with `treino` and `teste`.
del df

## Transformando as imagens

In [None]:
# Preview the first two training rows.
treino.head(2)

In [None]:
# Add a text class ('negative'/'positive') and a numeric target (0/1) derived
# from Label: 'Normal' is negative/0, anything else is positive/1.
# DataFrame.assign returns a new frame instead of writing into a slice of the
# metadata frame (which raises SettingWithCopyWarning), and the comparisons are
# vectorised instead of calling a lambda once per row.
# 'class' is a Python keyword, so the new columns are passed as a dict.
treino = treino.assign(**{
    'class': np.where(treino.Label == 'Normal', 'negative', 'positive'),
    'target': (treino.Label != 'Normal').astype('int64'),
})
teste = teste.assign(**{
    'class': np.where(teste.Label == 'Normal', 'negative', 'positive'),
    'target': (teste.Label != 'Normal').astype('int64'),
})

In [None]:
# Keep only the columns used in the rest of the notebook.
final_train_data = treino.loc[:, ['X_ray_image_name', 'class', 'target', 'Label_2_Virus_category']]
final_test_data = teste.loc[:, ['X_ray_image_name', 'class', 'target']]

In [None]:
# Load every train and test image named in the metadata as a scaled array.
# List comprehensions replace the original Series.apply(...append...) calls,
# which were used only for their side effect and left a Series of None values
# as the cell output.
treino_arrays = [lendo_img(nome, (255,255), treino_img_path) for nome in treino['X_ray_image_name']]
teste_arrays = [lendo_img(nome, (255,255), teste_img_path) for nome in teste['X_ray_image_name']]

In [None]:
# Shape of one loaded training image.
treino_arrays[2].shape

In [None]:
# Number of loaded train and test images.
print(len(treino_arrays))
print(len(teste_arrays))

In [None]:
# Load a single training image; as the last expression of the cell, the whole
# array is echoed as the cell output.
lendo_img('IM-0128-0001.jpeg', (255,255), treino_img_path)

In [None]:
# Load the same image into `imagem` and show its shape.
imagem = lendo_img('IM-0128-0001.jpeg', (255,255), treino_img_path)
imagem.shape

In [None]:
#plt.figure(0)
#plt.subplot(221)
# Display the image held in `imagem`.
plt.imshow(imagem, cmap=plt.get_cmap('gray'))

In [None]:
# Preview the first two rows of the reduced test frame.
final_test_data.head(2)

In [None]:
# Drop the names `treino` and `teste`; later cells use final_train_data,
# final_test_data and the image arrays instead.
del treino
del teste

In [None]:
# Stack the image lists into arrays and convert images and targets to TensorFlow tensors.
train_tensors = tf.convert_to_tensor(np.array(treino_arrays))
test_tensors  = tf.convert_to_tensor(np.array(teste_arrays))
y_train_tensor = tf.convert_to_tensor(final_train_data.target.values)
y_test_tensor = tf.convert_to_tensor(final_test_data['target'].values)

In [None]:
# train_tensors, test_tensors, y_train_tensor and y_test_tensor already exist
# from the previous cell. Rebuilding them here only created another full copy
# of the image arrays in memory, so this cell now just wraps them as
# (image, target) tf.data datasets.
train_dataset = tf.data.Dataset.from_tensor_slices((train_tensors, y_train_tensor))
test_dataset = tf.data.Dataset.from_tensor_slices((test_tensors, y_test_tensor))

In [None]:
# Build (image, target) tf.data datasets from the tensors.
# NOTE(review): these two statements also appear at the end of the preceding cell.
train_dataset = tf.data.Dataset.from_tensor_slices((train_tensors, y_train_tensor))
test_dataset = tf.data.Dataset.from_tensor_slices((test_tensors, y_test_tensor))

# LeNet5

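Nossa arquitetura, inspirada na LeNet-5, é composta por:
- camada convolucional (32 filtros 5x5, ReLU)
- pooling (max pooling 2x2)
- camada convolucional (64 filtros 5x5, ReLU)
- pooling (max pooling 2x2)
- dropout de 20% e achatamento (flatten)
- camadas densamente conectadas (32 e 16 neurônios, sigmoide)
- camada de saída com 1 neurônio sigmoide (classificação binária)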

In [None]:
# Build the model
model = Sequential()
# 2D convolution with 32 5x5 kernels (filters) and ReLU activation
model.add(Conv2D(32, (5, 5), input_shape=(255, 255, 3), activation='relu'))#
print( model.output_shape)
# Pooling layer
model.add(MaxPooling2D(pool_size=(2, 2)))


# 2D convolution with 64 5x5 kernels (filters) and ReLU activation
model.add(Conv2D(64, (5, 5), activation='relu'))
print( model.output_shape)
# Pooling layer
model.add(MaxPooling2D(pool_size=(2, 2)))

# Randomly drops 20% of the incoming activations
model.add(Dropout(0.2))
# Flattens the feature maps into a one-dimensional vector that feeds the
# fully connected layers
model.add(Flatten())
print( model.output_shape)



# Two fully connected layers (32 and 16 units) with sigmoid activation
model.add(Dense(32, activation='sigmoid'))
print( model.output_shape)
model.add(Dense(16, activation='sigmoid'))
print( model.output_shape)


# Output layer: a single unit with sigmoid activation (the earlier comment said softmax)
model.add(Dense(1, activation='sigmoid'))
#print( model.output_shape)

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
model.summary()


from keras.utils import plot_model

# Draw the layer graph with output shapes and layer names.
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Compile the network for binary classification.
# - The stray "@model.compile(loss='mse', ...)" line was a syntax error (a
#   decorator with no function or class after it) and has been removed.
# - The model ends in a single sigmoid unit and the targets are 0/1, so the
#   loss is binary_crossentropy; categorical_crossentropy expects one-hot
#   targets spread over several output units.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train_tensors, test_tensors, y_train_tensor, y_test_tensor, train_dataset and
# test_dataset are already built in the data-preparation cells above. They are
# not rebuilt here, which avoids holding extra copies of the image arrays.

In [None]:
# Train the network. validation_data is passed so that Keras records
# val_loss / val_accuracy in `history`; without it, plotting
# history.history['val_accuracy'] raises KeyError.
history = model.fit(train_tensors, y_train_tensor, epochs=4, batch_size=500,
                    validation_data=(test_tensors, y_test_tensor))

In [None]:
#history = model.fit(train_batches, epochs=10, validation_data=test_batches, callbacks=[callbacks])

In [None]:
# Accuracy per epoch: training curve first, validation curve second.
plt.figure(1)
for chave in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[chave])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# `testando` is not defined anywhere in the notebook (NameError on a fresh
# run); preview the reduced test frame instead.
# NOTE(review): confirm that final_test_data is the frame that was meant here.
final_test_data.head(2)

In [None]:
# Show the first 8 training images, one per subplot, titled with the target.
# Calling plt.imshow repeatedly on the same axes (as before) draws each image
# over the previous one, so only the last image was visible.
fig, axs = plt.subplots(2, 4, figsize=(16, 8))
for (i, l), ax in zip(train_dataset.take(8), axs.flat):
    ax.imshow(i)
    ax.set_title(str(l.numpy()))
    ax.axis('off')

In [None]:
# Show the first 18 training images, one per subplot, titled with the target.
# Calling plt.imshow repeatedly on the same axes (as before) draws each image
# over the previous one, so only the last image was visible.
fig, axs = plt.subplots(3, 6, figsize=(24, 12))
for (i, l), ax in zip(train_dataset.take(18), axs.flat):
    ax.imshow(i)
    ax.set_title(str(l.numpy()))
    ax.axis('off')

In [None]:
# Batch size and shuffle-buffer size for the tf.data pipelines
BATCH_SIZE = 16
BUFFER = 1000

# Training data is shuffled and then batched; test data is only batched.
train_batches = train_dataset.shuffle(BUFFER).batch(BATCH_SIZE)
test_batches = test_dataset.batch(BATCH_SIZE)

# Print the image-tensor shape of one batch from each pipeline.
for i,l in train_batches.take(1):
    print('Train Shape per Batch: ',i.shape);
for i,l in test_batches.take(1):
    print('Test Shape per Batch: ',i.shape);

# Rebuild the LeNet-style network and train it on the batched datasets.
# The previous version of this block was left over from an MNIST example and
# could not run in this notebook. Changes:
# - input_shape matches the 255x255 three-channel image arrays (was 28x28x1);
# - a single-unit sigmoid output layer is added (the network ended at Dense(16));
# - the model is compiled before fit() is called;
# - fit() uses train_batches / test_batches instead of the undefined
#   X_train / y_train.
model = Sequential()
# 2D convolution with 32 5x5 kernels (filters) and ReLU activation
model.add(Conv2D(32, (5, 5), input_shape=(255, 255, 3), activation='relu'))
print( model.output_shape)
# Pooling layer
model.add(MaxPooling2D(pool_size=(2, 2)))


# 2D convolution with 64 5x5 kernels (filters) and ReLU activation
model.add(Conv2D(64, (5, 5), activation='relu'))
print( model.output_shape)
# Pooling layer
model.add(MaxPooling2D(pool_size=(2, 2)))

# Randomly drops 20% of the incoming activations
model.add(Dropout(0.2))
# Flattens the feature maps into a one-dimensional vector that feeds the
# fully connected layers
model.add(Flatten())
print( model.output_shape)


# Two fully connected layers (32 and 16 units) with sigmoid activation
model.add(Dense(32, activation='sigmoid'))
print( model.output_shape)
model.add(Dense(16, activation='sigmoid'))
print( model.output_shape)


# Output layer: one sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))
print( model.output_shape)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The datasets are already batched, so no batch_size is passed to fit().
history = model.fit(train_batches, validation_data=test_batches, epochs=3)

In [None]:
# Define the input shape
INPUT_SHAPE = (255,255,3) 

# Get the pretrained model: ResNet50 with ImageNet weights and without its top layers
base_model = tf.keras.applications.ResNet50(input_shape= INPUT_SHAPE,
                                               include_top=False,
                                               weights='imagenet')

# Mark the pretrained model as non-trainable so that training does not
# modify its pretrained weights.
base_model.trainable = False
base_model.summary()

In [None]:
# Pass one batch of images through the base model to verify the output shape.
# The loop body is empty: it only binds `i` (images) and `l` (targets) to the
# first batch.
for i,l in train_batches.take(1):
    pass
base_model(i).shape

In [None]:
# Stack a classification head on the pretrained base model: global average
# pooling, Dense(128), dropout with rate 0.2 and a single sigmoid output unit.
model = Sequential([
    base_model,
    Layers.GlobalAveragePooling2D(),
    Layers.Dense(128),
    Layers.Dropout(0.2),
    Layers.Dense(1, activation = 'sigmoid'),
])
model.summary()

In [None]:
# Early-stopping callback: monitors val_loss with a patience of 2 epochs.
callbacks = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# Compile with the Adam optimizer, binary cross-entropy loss and the accuracy metric.
model.compile(optimizer='adam',
              loss = 'binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the batched training data for up to 10 epochs, validating on the
# test batches and using the early-stopping callback.
history = model.fit(train_batches, epochs=10, validation_data=test_batches, callbacks=[callbacks])

In [None]:
# Accuracy per epoch: training curve first, validation curve second.
plt.figure(1)
for chave in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[chave])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Evaluate on the batched test images and targets. The previous call passed
# the string column final_test_data['class'] as the model input instead of
# image data.
scores = model.evaluate(test_batches, verbose=0)
# scores[1] is the accuracy as a fraction; both lines print percentages, so
# the accuracy is multiplied by 100 as well.
print("Erro da CNN: %.2f%%" % (100-scores[1]*100))
print("Acurária da CNN: %.2f%%" % (scores[1]*100))

In [None]:
# Predict the test data.
# `test_arrays` was never defined, and Sequential.predict_classes no longer
# exists in current Keras releases. For a single sigmoid output, the class is
# the predicted probability thresholded at 0.5.
pred = (model.predict(test_tensors) > 0.5).astype('int32')

In [None]:
# Print a classification report for the test predictions.
from sklearn.metrics import classification_report, confusion_matrix
# The test targets live in final_test_data; `test_data` was never defined.
print(classification_report(final_test_data['target'], pred.flatten()))

In [None]:
# Plot the confusion matrix of the test predictions.

# The test targets live in final_test_data; `test_data` was never defined.
con_mat = confusion_matrix(final_test_data['target'], pred.flatten())
plt.figure(figsize = (10,10))
plt.title('CONFUSION MATRIX')
# fmt='d' prints the cell counts as plain integers instead of seaborn's
# default compact float format.
sns.heatmap(con_mat, cmap='cividis',
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Negative', 'Positive'],
            annot=True, fmt='d');