In [1]:
import random
import imghdr
import shutil
import os
import tensorflow as tf
import pandas as pd
import numpy as np

## Data preprocessing

In [2]:
# Work from the project root so the relative data/ paths below resolve.
os.chdir('/home/dell/Documentos/rn-cin-covid/')

# Read the space-separated, header-less .txt split files generated by COVID-Net.
train_labels, train_labels_gauss, test_labels = (
    pd.read_csv(split_file, sep=" ", header=None)
    for split_file in (
        "data/train_split.txt",
        "data/train_labels_gauss.txt",
        "data/test_split.txt",
    )
)

# Rename the four positional columns.
for split_frame in (train_labels_gauss, train_labels, test_labels):
    split_frame.columns = ['id', 'image', 'label', 'database']


In [3]:
# Keep only the columns that matter: the image file name and its class label.
kept_columns = ['image', 'label']

train_labels_gauss = train_labels_gauss[kept_columns]
train_labels = train_labels[kept_columns]
test_labels = test_labels[kept_columns]

# Class distribution of the full training table.
train_labels.loc[:, 'label'].value_counts()

negative    15068
positive     2158
Name: label, dtype: int64

In [4]:
# Class distribution of the Gaussian-noise training table before any split.
train_labels_gauss.loc[:, 'label'].value_counts()

negative    15068
positive     2158
Name: label, dtype: int64

### Validação

In [5]:
# Create the validation set by holding out a random 20% of the training rows.
# Seeding NumPy's global generator makes this draw repeatable; the later
# .sample() calls pass no random_state, so they draw from the same global state.
SEED = 47
VAL_FRACTION = 0.2

np.random.seed(SEED)

val_labels = train_labels.sample(frac=VAL_FRACTION)
val_labels.shape

(3445, 2)

In [6]:
# Class distribution of the freshly sampled validation set.
val_labels.loc[:, 'label'].value_counts()

negative    3004
positive     441
Name: label, dtype: int64

In [7]:
# Separate training from validation: drop every training row whose image was
# drawn into the validation set, so the two sets share no image.
# One vectorised membership test replaces a row-by-row loop that rebuilt the
# validation image list on every iteration.
in_validation = train_labels['image'].isin(val_labels['image'])

# Rebuild with a fresh 0..n-1 index; a later cell selects the matching
# Gaussian-noise rows through train_labels.index.
new_train_dataset = (
    train_labels.loc[~in_validation, ['image', 'label']]
    .reset_index(drop=True)
)

train_labels = new_train_dataset.copy()
train_labels.shape

(13781, 2)

### Copiando index para ruído gaussiano

In [8]:
# Pick the Gaussian-noise rows whose index labels match the validation sample.
val_labels_gauss = train_labels_gauss.loc[val_labels.index, :]

In [9]:
# Display the (rows, columns) shape of the Gaussian-noise validation table.
val_labels_gauss.shape

(3445, 2)

In [10]:
# Class distribution of the Gaussian-noise validation table.
val_labels_gauss.loc[:, 'label'].value_counts()

negative    3004
positive     441
Name: label, dtype: int64

In [11]:
# Separate training from validation for the Gaussian-noise table: drop every
# row whose image belongs to the Gaussian-noise validation set.
# One vectorised membership test replaces a row-by-row loop that rebuilt the
# validation image list on every iteration.
in_validation = train_labels_gauss['image'].isin(val_labels_gauss['image'])

# Rebuild with a fresh 0..n-1 index; a later cell selects rows from this table
# through train_labels.index.
new_train_dataset = (
    train_labels_gauss.loc[~in_validation, ['image', 'label']]
    .reset_index(drop=True)
)

train_labels_gauss = new_train_dataset.copy()
train_labels_gauss.shape

(13781, 2)

### Distribuição das classes

In [12]:
# Class distribution of the training table once validation rows are removed.
train_labels.loc[:, 'label'].value_counts()

negative    12064
positive     1717
Name: label, dtype: int64

In [13]:
# Class distribution of the Gaussian-noise training table once validation rows are removed.
train_labels_gauss.loc[:, 'label'].value_counts()

negative    12064
positive     1717
Name: label, dtype: int64

### Subsampling

In [14]:
# Undersample the training set: keep a random 2500 negatives and every positive.
# No random_state is passed, so the draw depends on NumPy's global RNG state.
negative_train = train_labels[train_labels['label'].eq('negative')].sample(n=2500)
positive_train = train_labels[train_labels['label'].eq('positive')]
train_labels = pd.concat([negative_train, positive_train], axis=0)

In [15]:
# Undersample the validation set: keep a random 650 negatives and every positive.
# No random_state is passed, so the draw depends on NumPy's global RNG state.
negative_val = val_labels[val_labels['label'].eq('negative')].sample(n=650)
positive_val = val_labels[val_labels['label'].eq('positive')]
val_labels = pd.concat([negative_val, positive_val], axis=0)

In [16]:
# Class distribution of the training table after undersampling.
train_labels.loc[:, 'label'].value_counts()

negative    2500
positive    1717
Name: label, dtype: int64

In [17]:
# Class distribution of the test table (it is not subsampled above).
test_labels.loc[:, 'label'].value_counts()

positive    291
negative    100
Name: label, dtype: int64

In [18]:
# Class distribution of the validation table after undersampling.
val_labels.loc[:, 'label'].value_counts()

negative    650
positive    441
Name: label, dtype: int64

### Usando os index para separar ruído gaussiano

In [19]:
# Mirror the undersampling on the Gaussian-noise tables by selecting the rows
# whose index labels are present in the subsampled tables.
val_labels_gauss = val_labels_gauss.loc[val_labels.index, :]
train_labels_gauss = train_labels_gauss.loc[train_labels.index, :]

In [21]:
# Class distribution of the Gaussian-noise training table after the index-based selection.
train_labels_gauss.loc[:, 'label'].value_counts()

negative    2500
positive    1717
Name: label, dtype: int64

In [20]:
# Class distribution of the Gaussian-noise validation table after the index-based selection.
val_labels_gauss.loc[:, 'label'].value_counts()

negative    650
positive    441
Name: label, dtype: int64

### Juntando os datasets gaussianos e normais

In [22]:
# Stack the Gaussian-noise rows under the original rows for each split.
# The index is not reset, so each index label now appears in both halves.
train_labels = pd.concat((train_labels, train_labels_gauss), axis=0)
val_labels = pd.concat((val_labels, val_labels_gauss), axis=0)

In [23]:
# Class distribution of the merged (original + Gaussian-noise) training table.
train_labels.loc[:, 'label'].value_counts()

negative    5000
positive    3434
Name: label, dtype: int64

In [24]:
# Class distribution of the merged (original + Gaussian-noise) validation table.
val_labels.loc[:, 'label'].value_counts()

negative    1300
positive     882
Name: label, dtype: int64

## Movendo os arquivos para os diretórios corretos

In [25]:
# Switch to the training directory generated by COVID-Net; the copy loop in a
# later cell lists the current working directory.
os.chdir('/home/dell/Documentos/COVID-Net/data/train/')

dir_train = '/home/dell/Documentos/rn-cin-covid/data/train/'  # final training directory
dir_test = '/home/dell/Documentos/rn-cin-covid/data/test/'  # final test directory
dir_val = '/home/dell/Documentos/rn-cin-covid/data/val/'  # final validation directory

# Remove output directories left over from a previous run. A directory that
# does not exist yet is skipped, so a first run does not stop with
# FileNotFoundError.
for stale_dir in (dir_train, dir_test, dir_val):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)


In [27]:
# Create the new directories: one positive/ and one negative/ folder under
# each of the train, test and validation output directories.
label_dirs = [
    os.path.join(path, lb)
    for path in [dir_train, dir_test, dir_val]
    for lb in ['positive', 'negative']
]

for path_lb in label_dirs:
    try:
        os.makedirs(path_lb)
    except OSError:
        # Report the failure and move on to the next directory.
        print("Creation of the directory %s failed" % path_lb)
        continue
    print("Successfully created the directory %s" % path_lb)

Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/train/positive
Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/train/negative
Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/test/positive
Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/test/negative
Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/val/positive
Successfully created the directory /home/dell/Documentos/rn-cin-covid/data/val/negative


In [None]:

# Copy training and validation images: every file in the current working
# directory that is listed in a split goes into that split's class folder.
#
# The image -> label lookups are built once, instead of rebuilding the image
# list and filtering the DataFrame for every file. Looking the label up by
# image name also avoids going through the row index, which is no longer
# unique after the original and Gaussian-noise tables were concatenated.
# drop_duplicates keeps the first row of a repeated image name.
train_label_of = (
    train_labels.drop_duplicates(subset='image')
    .set_index('image')['label']
    .to_dict()
)
val_label_of = (
    val_labels.drop_duplicates(subset='image')
    .set_index('image')['label']
    .to_dict()
)

for f in os.listdir():
    # Training membership is checked first, so it wins if a name is in both.
    if f in train_label_of:
        shutil.copy(f, os.path.join(dir_train, train_label_of[f]))
    elif f in val_label_of:
        shutil.copy(f, os.path.join(dir_val, val_label_of[f]))
        

In [None]:
# Switch to the COVID-Net test directory; the loop below lists the current
# working directory.
os.chdir('/home/dell/Documentos/COVID-Net/data/test/')


# Copy test images into their class folder.
# The image -> label lookup is built once, instead of rebuilding the image list
# and filtering the DataFrame for every file. drop_duplicates keeps the first
# row of a repeated image name.
test_label_of = (
    test_labels.drop_duplicates(subset='image')
    .set_index('image')['label']
    .to_dict()
)

for f in os.listdir():
    if f in test_label_of:
        shutil.copy(f, os.path.join(dir_test, test_label_of[f]))
    else:
        # File is present on disk but not listed in the test split.
        print('?')
        print(f)
        


In [None]:
from tensorflow.keras.preprocessing import image_dataset_from_directory

# Directory with the training images, organised in positive/ and negative/
# sub-folders by the directory-creation and copy cells above.
train_dir = "/home/dell/Documentos/rn-cin-covid/data/train/"
# This path used to be bound to the misleading name `test_dir` even though it
# points at the training data. The alias is kept so any later cell that still
# reads `test_dir` keeps working.
test_dir = train_dir

# labels="inferred" with label_mode="int" asks for integer class labels
# inferred from the directory structure.
train_dataset = image_dataset_from_directory(
    train_dir,
    shuffle=True,
    labels="inferred",
    label_mode="int",
    batch_size=32,
    image_size=(150, 150),
)
