## Data preprocessing

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import sklearn
import tqdm
%matplotlib inline

**Loading the labels related to the given images.**

In [2]:
# Loading the dataset
labels = pd.read_csv("./dataset/text dataset/dataset.csv")

In [3]:
labels.head()

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Nodule,Pneumothorax,Mass,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Fibrosis,Edema,Pneumonia,Hernia
0,00000001_000.png,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,00000001_002.png,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,00000002_000.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


**Deleting the class labels that the given images are way too small with respect to other image classes. It is important to have a relatively equall number of images per class label.**

In [4]:
labels = labels.drop(labels = ['Nodule', 'Mass', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly',
                               'Emphysema', 'Fibrosis', 'Edema', 'Pneumonia', 'Hernia'], axis = 1)

In [5]:
labels.head()

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000002_000.png,1,0,0,0,0
4,00000003_000.png,0,0,0,0,0


In [6]:
# Getting the number of images per class
for i_disease in labels.columns[1:]:
    print(i_disease + ":", sum(labels[i_disease]))

No Finding: 60361
Infiltration: 19894
Atelectasis: 11559
Effusion: 13317
Pneumothorax: 5302


**First augment the images with rotation of 15, 90, 180, 270 and the flip all those images. In general we make our dataset 10 times more. At the end get the labels with their corresponding class.**

In [7]:
# Loading the augmented dataset
labels_augmented = pd.read_csv("./dataset/text dataset/dataset_augmented.csv")

In [8]:
labels_augmented.head()

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000005_006_180degree.jpg,0,1,0,0,0
1,00000005_006_270degree.jpg,0,1,0,0,0
2,00000005_006_90degree.jpg,0,1,0,0,0
3,00000005_006_flipped.jpg,0,1,0,0,0
4,00000005_006_flipped_180degree.jpg,0,1,0,0,0


In [35]:
# Getting the number of images per class
for i_disease in labels_augmented.columns[1:]:
    print(i_disease + ":", sum(labels_augmented[i_disease]))

No Finding: 0
Infiltration: 61943
Atelectasis: 27916
Effusion: 25816
Pneumothorax: 27832


**The goal is to have 35K images per label with having at least 300 images deviation.**

In [12]:
# Getting a copy of true labels
dataset = copy.copy(labels)

In [18]:
# Adding augmented "Infiltration" to our dataset
print("We have to add {} Infiltration augmented image to our dataset!".format(35000-19894))
augmented_infiltration_to_add =  labels_augmented[labels_augmented['Infiltration'] == 1][:35000-19894]
dataset = dataset.append(augmented_infiltration_to_add)
dataset

We have to add 15106 Infiltration augmented image to our dataset!


Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000002_000.png,1,0,0,0,0
4,00000003_000.png,0,0,0,0,0
5,00000003_001.png,0,0,0,0,0
6,00000003_002.png,0,0,0,0,0
7,00000003_003.png,0,1,0,0,0
8,00000003_004.png,0,0,0,0,0
9,00000003_005.png,0,0,0,0,0


In [20]:
# Adding augmented "Atelectasis" to our dataset
print("We have to add {} Atelectasis augmented image to our dataset!".format(35000-11559))
augmented_atelectasis_to_add = labels_augmented[labels_augmented['Atelectasis'] == 1][:35000-11559]
dataset = dataset.append(augmented_atelectasis_to_add)
dataset

We have to add 23441 Atelectasis augmented image to our dataset!


Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000002_000.png,1,0,0,0,0
4,00000003_000.png,0,0,0,0,0
5,00000003_001.png,0,0,0,0,0
6,00000003_002.png,0,0,0,0,0
7,00000003_003.png,0,1,0,0,0
8,00000003_004.png,0,0,0,0,0
9,00000003_005.png,0,0,0,0,0


In [21]:
# Adding augmented "Effusion" to our dataset
print("We have to add {} Effusion augmented image to our dataset!".format(35000-13317))
augmented_effusion_to_add = labels_augmented[labels_augmented['Effusion'] == 1][:35000-13317]
dataset = dataset.append(augmented_effusion_to_add)
dataset

We have to add 21683 Effusion augmented image to our dataset!


Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000002_000.png,1,0,0,0,0
4,00000003_000.png,0,0,0,0,0
5,00000003_001.png,0,0,0,0,0
6,00000003_002.png,0,0,0,0,0
7,00000003_003.png,0,1,0,0,0
8,00000003_004.png,0,0,0,0,0
9,00000003_005.png,0,0,0,0,0


In [22]:
# Adding augmented "Pneumothorax" to our dataset
print("We have to add {} Pneumothorax augmented image to our dataset!".format(35000-5302))
augmented_pneumothorax_to_add = labels_augmented[labels_augmented['Pneumothorax'] == 1]
dataset = dataset.append(augmented_pneumothorax_to_add)
dataset

We have to add 29698 Pneumothorax augmented image to our dataset!


Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000002_000.png,1,0,0,0,0
4,00000003_000.png,0,0,0,0,0
5,00000003_001.png,0,0,0,0,0
6,00000003_002.png,0,0,0,0,0
7,00000003_003.png,0,1,0,0,0
8,00000003_004.png,0,0,0,0,0
9,00000003_005.png,0,0,0,0,0


In [23]:
dataset.to_csv("./dataset/text dataset/final_dataset.csv")

In [37]:
data_2 = sklearn.utils.resample(dataset[dataset['No Finding'] == 1])[:35000]
data_2

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
4829,00001298_005.png,1,0,0,0,0
28575,00007452_001.png,1,0,0,0,0
45294,00011608_000.png,1,0,0,0,0
44567,00011460_041.png,1,0,0,0,0
3359,00000885_000.png,1,0,0,0,0
8639,00002285_004.png,1,0,0,0,0
19049,00005066_045.png,1,0,0,0,0
12804,00003361_012.png,1,0,0,0,0
8143,00002138_000.png,1,0,0,0,0
90547,00022543_001.png,1,0,0,0,0


In [38]:
data_1 = dataset[dataset['No Finding'] == 0]
data_1

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
4,00000003_000.png,0,0,0,0,0
5,00000003_001.png,0,0,0,0,0
6,00000003_002.png,0,0,0,0,0
7,00000003_003.png,0,1,0,0,0
8,00000003_004.png,0,0,0,0,0
9,00000003_005.png,0,0,0,0,0
10,00000003_006.png,0,0,0,0,0


In [40]:
dataset = data_1.append(data_2)

In [41]:
dataset.to_csv("./dataset/text dataset/final_dataset.csv")

In [49]:
dataset.iloc[2, :]

Image Index     00000001_002.png
No Finding                     0
Infiltration                   0
Atelectasis                    0
Effusion                       1
Pneumothorax                   0
Name: 2, dtype: object

In [87]:
dataset = pd.DataFrame.reset_index(dataset)
dataset = dataset.drop(["index", "level_0"], axis = 1)
dataset

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_000.png,0,0,0,0,0
1,00000001_001.png,0,0,0,0,0
2,00000001_002.png,0,0,0,1,0
3,00000003_000.png,0,0,0,0,0
4,00000003_001.png,0,0,0,0,0
5,00000003_002.png,0,0,0,0,0
6,00000003_003.png,0,1,0,0,0
7,00000003_004.png,0,0,0,0,0
8,00000003_005.png,0,0,0,0,0
9,00000003_006.png,0,0,0,0,0


In [88]:
rows_to_drop = []
for i_row in tqdm.tqdm(range(len(dataset))):
    if 1 not in list(dataset.iloc[i_row, :]):
        rows_to_drop.append(i_row)

100%|██████████| 174821/174821 [00:47<00:00, 3658.84it/s]


In [91]:
dataset = dataset.drop(index = rows_to_drop)
dataset

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
2,00000001_002.png,0,0,0,1,0
6,00000003_003.png,0,1,0,0,0
12,00000005_006.png,0,1,0,0,0
13,00000005_007.png,0,1,0,1,0
17,00000010_000.png,0,1,0,0,0
18,00000011_000.png,0,0,0,1,0
19,00000011_005.png,0,1,0,0,0
20,00000011_006.png,0,0,1,0,0
21,00000011_007.png,0,1,0,0,0
22,00000012_000.png,0,0,0,1,0


In [95]:
dataset = pd.DataFrame.reset_index(dataset)
dataset = dataset.drop("index", axis = 1)
dataset

Unnamed: 0,Image Index,No Finding,Infiltration,Atelectasis,Effusion,Pneumothorax
0,00000001_002.png,0,0,0,1,0
1,00000003_003.png,0,1,0,0,0
2,00000005_006.png,0,1,0,0,0
3,00000005_007.png,0,1,0,1,0
4,00000010_000.png,0,1,0,0,0
5,00000011_000.png,0,0,0,1,0
6,00000011_005.png,0,1,0,0,0
7,00000011_006.png,0,0,1,0,0
8,00000011_007.png,0,1,0,0,0
9,00000012_000.png,0,0,0,1,0


In [97]:
dataset.to_csv("./dataset/text dataset/4. dataset.csv")

In [98]:
dataset = pd.read_csv("./dataset/text dataset/4. dataset.csv")