## Preprocessing - Skenario Praktikum Modul 4

### Define Direktori Data

In [1]:
import os

In [2]:
base_dir = '/content/drive/MyDrive/Pembelajaran Mesin/Tugas Kelompok Praktikum/Dataset/chest_xray'

splitted_dir = os.path.join(base_dir, 'splitted_data')

train_dir = os.path.join(splitted_dir, 'train')
norm_train_dir = os.path.join(train_dir, 'normal')
pneu_train_dir = os.path.join(train_dir, 'pneumonia')

val_dir = os.path.join(splitted_dir, 'val')
norm_val_dir = os.path.join(val_dir, 'normal')
pneu_val_dir = os.path.join(val_dir, 'pneumonia')

test_dir = os.path.join(splitted_dir, 'test')
norm_test_dir = os.path.join(test_dir, 'normal')
pneu_test_dir = os.path.join(test_dir, 'pneumonia')

In [3]:
print("Total Training NORMAL:", len(os.listdir(norm_train_dir)))
print("Total Training PNEUMONIA:", len(os.listdir(pneu_train_dir)))
print('-'*30)
print("Total validation NORMAL:", len(os.listdir(norm_val_dir)))
print("Total validation PNEUMONIA:", len(os.listdir(pneu_val_dir)))
print('-'*30)
print("Total Testing NORMAL:", len(os.listdir(norm_test_dir)))
print("Total Testing PNEUMONIA:", len(os.listdir(pneu_test_dir)))

Total Training NORMAL: 1266
Total Training PNEUMONIA: 3418
------------------------------
Total validation NORMAL: 300
Total validation PNEUMONIA: 811
------------------------------
Total Testing NORMAL: 17
Total Testing PNEUMONIA: 44


### Periksa Ratio Split Data

In [12]:
train_data_count = len(os.listdir(norm_train_dir)) + len(os.listdir(pneu_train_dir))
val_data_count = len(os.listdir(norm_val_dir)) + len(os.listdir(pneu_val_dir))
test_data_count = len(os.listdir(norm_test_dir)) + len(os.listdir(pneu_test_dir))
total_data_count = train_data_count + val_data_count + test_data_count
    
print("Persentase Pembagian Data:")
print(f"\ttrain = {train_data_count / total_data_count * 100:.0f} %")
print(f"\tval = {val_data_count / total_data_count * 100:.0f} %")
print(f"\ttest = {test_data_count / total_data_count * 100:.0f} %")

Persentase Pembagian Data:
	train = 80 %
	val = 19 %
	test = 1 %


### Augmentasi
menggunakan **imagedatagenerator**

In [5]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [6]:
# data augmentation
new_size = (180,180)

train_val_gen = ImageDataGenerator(rescale = 1./255.,
                                   rotation_range=30,
                                   zoom_range=0.4,
                                   vertical_flip=True,
                                   horizontal_flip=True)

train_generator = train_val_gen.flow_from_directory(train_dir,
                                  target_size=new_size,
                                  shuffle=True,
                                  class_mode='binary'
                                  )

val_generator = train_val_gen.flow_from_directory(val_dir,
                                  target_size=new_size,
                                  shuffle=True,
                                  class_mode='binary'
                                  )

test_gen = ImageDataGenerator()

test_generator = test_gen.flow_from_directory(test_dir,
                                  target_size=new_size,
                                  shuffle=False,
                                  class_mode='binary'
                                  )

Found 4684 images belonging to 2 classes.
Found 1111 images belonging to 2 classes.
Found 61 images belonging to 2 classes.


### Penentuan bobot kelas
untuk mengatasi permasalahan pada imbalanced data

In [13]:
norm_count = len(os.listdir(norm_train_dir)) + len(os.listdir(norm_val_dir)) + len(os.listdir(norm_test_dir))
pneu_count = len(os.listdir(pneu_train_dir)) + len(os.listdir(pneu_val_dir)) + len(os.listdir(pneu_test_dir))

print(f"Jumlah Normal: {norm_count}\nJumlah Pneoumonia: {pneu_count}")

Jumlah Normal: 1583
Jumlah Pneoumonia: 4273


In [14]:
weight_for_0 = (1 / norm_count) * (train_data_count) / 2.0 
weight_for_1 = (1 / pneu_count) * (train_data_count) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print(f"Bobot kelas 0: {weight_for_0:.2f}")
print(f"Bobot kelas 1: {weight_for_1:.2f}")

Bobot kelas 0: 1.48
Bobot kelas 1: 0.55


### Simpan hasil preps

In [9]:
import numpy as np

In [18]:
x_train = np.concatenate([train_generator.next()[0] for _ in range(train_generator.__len__())])
y_train = np.concatenate([train_generator.next()[1] for _ in range(train_generator.__len__())])

print(x_train.shape)
print(y_train.shape)

(4684, 180, 180, 3)
(4684,)


In [17]:
x_val = np.concatenate([val_generator.next()[0] for _ in range(val_generator.__len__())])
y_val = np.concatenate([val_generator.next()[1] for _ in range(val_generator.__len__())])

print(x_val.shape)
print(y_val.shape)

(1111, 180, 180, 3)
(1111,)


In [16]:
x_test = np.concatenate([test_generator.next()[0] for _ in range(test_generator.__len__())])
y_test = np.concatenate([test_generator.next()[1] for _ in range(test_generator.__len__())])

print(x_test.shape)
print(y_test.shape)

(61, 180, 180, 3)
(61,)


In [19]:
import json

target_dir = '/content/drive/MyDrive/Pembelajaran Mesin/Tugas Kelompok Praktikum/Skenario Praktikum/Modul 4/Preprocessing'

preprocessed_data = [x_train, y_train, x_test, y_test, x_val, y_val]
preprocessed_file_name = ['x_train', 'y_train', 'x_test', 'y_test', 'x_val', 'y_val']

for i, file_name in enumerate(preprocessed_file_name):
    np.save(target_dir + '/' + file_name + '.npy', preprocessed_data[i])

class_weight_file = os.path.join(target_dir, 'class_weight.json')
json.dump(class_weight, open(class_weight_file, 'w'))