<a href="https://colab.research.google.com/github/ricglz/CE888_activities/blob/main/assignment/Data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the dependencies only when the COLAB_GPU environment variable is non-empty.
! [ ! -z "$COLAB_GPU" ] && pip install torch torchvision kaggleDownloader

# Preparations

----

In [None]:
from os import path, mkdir, remove
from shutil import move
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np

from torch.utils.data import Subset
from torchvision.datasets import ImageFolder
import torchvision.utils as t_utils
import torchvision.transforms as T

try:
    from kaggleDownloader import get_dataset
    from google.colab import drive
    in_colab = True
except ImportError:
    in_colab = False

In [None]:
# Mount Google Drive only when the Colab-specific imports succeeded.
if in_colab:
    drive_path = '/content/gdrive'
    drive.mount(drive_path, force_remount=False)

In [None]:
# Google Drive is mounted at ``drive_path``, so the Colab dataset path must be
# rooted there; a bare relative path would resolve against the working
# directory instead. Outside Colab a local ``./Flame`` folder is used.
data_path = (
    path.join(drive_path, 'MyDrive/Essex/Datasets/Flame')
    if in_colab else './Flame'
)

In [None]:
# Class folder names; numeric labels are used as indexes into this list.
classes = ['Fire', 'No_Fire']
# Training and test image folders inside the dataset root.
training_path = path.join(data_path, 'Training')
test_path = path.join(data_path, 'Test')
# Shared transform that resizes every image to 224x224.
resize = T.Resize((224, 224))

In [None]:
def save_image(img, label, index, prefix):
    """Write ``img`` into the training folder of the class given by ``label``.

    The class name is looked up in ``classes`` and the file is stored as
    ``<training_path>/<class name>/<prefix>_<index>.png``.
    """
    relative_name = f'{classes[label]}/{prefix}_{index}.png'
    destination = path.join(training_path, relative_name)
    t_utils.save_image(img, destination)

# Resize images to the desired size

---

In [None]:
def resize_dataset(ds_path):
    """Resize every image under ``ds_path`` and overwrite it in place.

    Images are loaded through ``ImageFolder`` with the shared ``resize``
    transform and saved back to the same path they were read from.
    """
    pipeline = T.Compose([resize, T.ToTensor()])
    dataset = ImageFolder(ds_path, pipeline)
    for position, sample in enumerate(tqdm(dataset.imgs)):
        source_file = sample[0]
        # Indexing the dataset applies the transform pipeline to the image.
        resized_img = dataset[position][0]
        t_utils.save_image(resized_img, source_file)

In [None]:
# Overwrite every training image with its resized version.
resize_dataset(training_path)

In [None]:
# Overwrite every test image with its resized version.
resize_dataset(test_path)

# Add a new dataset to the existing one

----

One of the main problems with the current dataset is that the images are essentially bursts of shots of the same environment. As a result, the model learns to recognize the environment instead of recognizing fire.

To avoid this, we will add another [dataset that can be found on Kaggle](https://www.kaggle.com/phylake1337/fire-dataset), which may help the model actually learn to recognize fire.

In [None]:
def add_kaggle_dataset():
    """Download the Kaggle fire dataset and add its images to the training set.

    Every image is resized with the shared ``resize`` transform and stored via
    ``save_image`` using the ``extra_dataset`` filename prefix.

    NOTE(review): labels come from the downloaded folder structure and are
    used as indexes into ``classes`` -- confirm both orderings agree.
    """
    get_dataset('kaggle datasets download -d phylake1337/fire-dataset')
    extra_data = ImageFolder(
        '/content/fire_dataset', T.Compose([resize, T.ToTensor()]))
    for index, sample in enumerate(tqdm(extra_data)):
        img, label = sample
        save_image(img, label, index, 'extra_dataset')

In [None]:
# Download the extra Kaggle images and save them into the training folders.
add_kaggle_dataset()

# Balance datasets

----

In [None]:
def clear_balanced():
    """Delete every ``balance*.png`` file found under the training folder."""
    pattern = f'{training_path}/**/balance*.png'
    leftovers = glob(pattern, recursive=True)
    for leftover in tqdm(leftovers):
        remove(leftover)

In [None]:
# Delete any 'balance*.png' files already present in the training folder.
clear_balanced()

In [None]:
def get_minor_klass(train_ds):
    """Find the under-represented class and how many images to add to it.

    Args:
        train_ds: Object exposing a ``targets`` sequence of labels (0 or 1).

    Returns:
        A ``(minor_klass, images_to_save)`` tuple. ``minor_klass`` is the
        label with fewer samples. ``images_to_save`` is the gap between both
        class counts, capped at the minority count so that every added image
        can come from a distinct source image.
    """
    labels = np.array(train_ds.targets)
    per_class = [np.count_nonzero(labels == class_idx) for class_idx in (0, 1)]
    minority = np.argmin(per_class)
    fewest = per_class[minority]
    deficit = max(per_class) - fewest
    return minority, min(deficit, fewest)

In [None]:
def balance_dataset():
    """Oversample the minority class with randomly augmented copies.

    The training folder is opened with a random augmentation pipeline
    (colour jitter, small rotation, horizontal/vertical flips). A random
    subset of minority-class images, drawn without replacement, is loaded
    through that pipeline and saved next to the originals with the
    ``balance`` filename prefix. The minority class and the number of images
    to create come from ``get_minor_klass``.
    """
    transforms = T.Compose([
        resize,
        T.ColorJitter(brightness=0.25, contrast=0.25),
        T.RandomRotation(degrees=5),
        T.RandomHorizontalFlip(),
        T.RandomVerticalFlip(),
        T.ToTensor(),
    ])
    train_ds = ImageFolder(training_path, transforms)
    minor_klass, images_to_save = get_minor_klass(train_ds)
    # ``targets`` is a plain list: convert it explicitly so the comparison is
    # element-wise whether ``minor_klass`` is a NumPy scalar or a plain int.
    targets = np.asarray(train_ds.targets)
    minor_indexes = np.flatnonzero(targets == minor_klass)
    indexes_to_enhance = np.random.choice(
        minor_indexes, images_to_save, replace=False)
    for save_img_index, index in enumerate(tqdm(indexes_to_enhance)):
        # Each access re-applies the random transforms to the source image.
        img, label = train_ds[index]
        save_image(img, label, save_img_index, 'balance')

In [None]:
# Add randomly augmented copies of minority-class images to the training set.
balance_dataset()

# Reduce the amount of data

In [None]:
def half_the_data():
    """Delete a random, class-stratified half of the training images.

    The files to remove are chosen with ``train_test_split`` using
    ``test_size=0.5`` and stratification on the class labels, then deleted
    from disk.
    """
    # Only file paths and labels are read, so no transform is attached.
    train_ds = ImageFolder(training_path)
    files = [file_path for file_path, _ in train_ds.samples]
    _, erase_files = train_test_split(
        files, test_size=0.5, shuffle=True, stratify=train_ds.targets)
    for file_to_erase in tqdm(erase_files):
        remove(file_to_erase)

In [None]:
# Delete a stratified random half of the training images.
half_the_data()

# Split training dataset

In [None]:
validation_path = path.join(data_path, 'Validation')
# Create the validation folder and one sub-folder per class. ``exist_ok``
# makes the cell safe to re-run and also fills in class folders that are
# missing from an already existing validation folder.
Path(validation_path).mkdir(exist_ok=True)
for klass in classes:
    Path(validation_path, klass).mkdir(exist_ok=True)

In [None]:
def split_training_dataset():
    """Move a stratified 20% of the training images to the validation folder.

    The validation sample is chosen with ``train_test_split`` over the image
    positions, stratified by label. Each chosen file keeps its name and is
    moved to ``<validation_path>/<class name>/``.
    """
    train_ds = ImageFolder(training_path)
    labels = train_ds.targets
    all_positions = np.arange(len(labels))
    _, valid_idx = train_test_split(
        all_positions, test_size=0.2, shuffle=True, stratify=labels)
    for position in tqdm(valid_idx):
        source, label = train_ds.imgs[position]
        destination = path.join(
            validation_path, classes[label], Path(source).name)
        move(source, destination)

In [None]:
# Move a stratified 20% of the training images into the validation folders.
split_training_dataset()

# Check final count

----

In [None]:
# Reload the training folder and collect its labels as an array for counting.
train_ds = ImageFolder(training_path)
targets = np.array(train_ds.targets)

In [None]:
# Count samples with label 0, reported as the 'Fire' class.
fire_data_count = np.count_nonzero(targets == 0)
f'Fire data: {fire_data_count}'

In [None]:
# Count samples with label 1, reported as the 'Non-Fire' class.
non_fire_data_count = np.count_nonzero(targets == 1)
f'Non-Fire data: {non_fire_data_count}'

In [None]:
# Sum of both class counts computed in the previous cells.
f'Total: {fire_data_count + non_fire_data_count}'