# Requirements

In [None]:
from google.colab import drive

from os import listdir
from os.path import isfile, join
import os.path
from os import path

import tarfile
import glob

import tensorflow as tf
import sklearn 
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import scipy
from scipy import io

import PIL
from PIL import Image
import cv2
import seaborn as sns

In [None]:
# Mount Google Drive so the project folder (archive, label files) is reachable.
drive.mount('/content/gdrive', force_remount=True)

# Use the personal "MyDrive" copy of the project when it exists; otherwise
# fall back to the shared-drive location.
PATH_PROJ = (
    "/content/gdrive/MyDrive/AML-proj/"
    if path.exists("/content/gdrive/MyDrive/AML-proj/")
    else "/content/gdrive/Shareddrives/AML-proj/"  # TODO: check if it is correct!
)

PATH_JPG = "/content/jpg/"  # local folder the image helpers read from
PATH_TAR = f"{PATH_PROJ}102flowers.tgz"  # dataset archive inside the project folder
IMG_SIZE = 300
NUM_CLASSES = 102

NAMES = [
    "pink primrose", "hard-leaved pocket orchid", "canterbury bells",
    "sweet pea", "english marigold", "tiger lily", "moon orchid",
    "bird of paradise", "monkshood", "globe thistle", "snapdragon",
    "colt's foot", "king protea", "spear thistle", "yellow iris",
    "globe-flower", "purple coneflower", "peruvian lily", "balloon flower",
    "giant white arum lily", "fire lily", "pincushion flower", "fritillary",
    "red ginger", "grape hyacinth", "corn poppy", "prince of wales feathers",
    "stemless gentian", "artichoke", "sweet william", "carnation",
    "garden phlox", "love in the mist", "mexican aster", "alpine sea holly",
    "ruby-lipped cattleya", "cape flower", "great masterwort", "siam tulip",
    "lenten rose", "barbeton daisy", "daffodil", "sword lily", "poinsettia",
    "bolero deep blue", "wallflower", "marigold", "buttercup", "oxeye daisy",
    "common dandelion", "petunia", "wild pansy", "primula", "sunflower",
    "pelargonium", "bishop of llandaff", "gaura", "geranium", "orange dahlia",
    "pink-yellow dahlia?", "cautleya spicata", "japanese anemone",
    "black-eyed susan", "silverbush", "californian poppy", "osteospermum",
    "spring crocus", "bearded iris", "windflower", "tree poppy", "gazania",
    "azalea", "water lily", "rose", "thorn apple", "morning glory",
    "passion flower", "lotus", "toad lily", "anthurium", "frangipani",
    "clematis", "hibiscus", "columbine", "desert-rose", "tree mallow",
    "magnolia", "cyclamen", "watercress", "canna lily", "hippeastrum",
    "bee balm", "ball moss", "foxglove", "bougainvillea", "camellia", "mallow",
    "mexican petunia", "bromelia", "blanket flower", "trumpet creeper",
    "blackberry lily"
]

# Lookup tables between a flower name and its zero-based position in NAMES.
NAMES_ID = {name: idx for idx, name in enumerate(NAMES)}
ID_NAMES = dict(enumerate(NAMES))

Mounted at /content/gdrive


# Functions

In [None]:
def execute_pca_on_imgs(img, show=False):
    """Compress a three-channel image by applying PCA to each channel.

    Each channel is divided by 255, projected onto the principal components
    selected by ``PCA(0.95)`` (95% explained variance), and mapped back with
    ``inverse_transform``. The channels are merged in the same order in which
    they were split, so the channel order of the input is preserved.

    Args:
        img: image that ``cv2.split`` separates into exactly three channels.
        show: when true, display the reconstructed image with matplotlib.

    Returns:
        The reconstructed image built by ``cv2.merge``; its values stay on
        the divided-by-255 scale.
    """

    def _reconstruct(channel):
        # Feature scaling comes first: PCA directions are highly sensitive
        # to the relative ranges of the features.
        scaled = channel / 255
        pca = PCA(0.95)
        reduced = pca.fit_transform(scaled)
        return pca.inverse_transform(reduced)

    first, second, third = cv2.split(img)

    third_rec = _reconstruct(third)
    second_rec = _reconstruct(second)
    first_rec = _reconstruct(first)

    # Re-assemble the channels in their original order.
    img_compressed = cv2.merge((first_rec, second_rec, third_rec))

    if show:
        plt.imshow(img_compressed)
        plt.show()

    return img_compressed
    

def execute_pca_on_imgs_set(df, path = PATH_JPG):
    """PCA-compress every image listed in ``df["Id"]`` and save it as JPEG.

    Images are read from ``path`` and the compressed versions are written to
    ``PATH_PROJ + "jpg_pca/"`` under the same file name.

    Args:
        df: DataFrame whose ``Id`` column holds image file names.
        path: directory prefix (with trailing slash) the images are read from.

    Raises:
        OSError: if OpenCV reports that an output image could not be written.
    """
    out_dir = PATH_PROJ + "jpg_pca/"
    # cv2.imwrite does not create missing directories; it just returns False.
    os.makedirs(out_dir, exist_ok=True)

    for img_name in df["Id"]:
        rgb_img = plt.imread(path + img_name)
        im_pca = execute_pca_on_imgs(rgb_img)

        # The PCA output is on the divided-by-255 scale: bring it back to
        # 8-bit, clipping reconstruction overshoot outside [0, 255].
        im_uint8 = np.clip(np.rint(255 * im_pca), 0, 255).astype(np.uint8)

        # The image was loaded as RGB by matplotlib, while cv2.imwrite
        # interprets 3-channel data as BGR; convert so colors are not swapped.
        im_bgr = cv2.cvtColor(im_uint8, cv2.COLOR_RGB2BGR)

        # imwrite parameters are (flag, value) pairs; 95 is OpenCV's default
        # JPEG quality.
        written = cv2.imwrite(out_dir + img_name, im_bgr,
                              [cv2.IMWRITE_JPEG_QUALITY, 95])
        if not written:
            raise OSError(f"Could not write image: {out_dir + img_name}")
    

def processing_set(dataset, images, labels, size=224, return_pca=False):
    """Load, convert to RGB and resize the images of a dataset split.

    Args:
        dataset: iterable of 1-based image numbers; number ``k`` refers to
            ``images[k - 1]`` and ``labels[k - 1]``.
        images: sequence of image file names, relative to ``PATH_JPG``.
        labels: sequence of labels aligned with ``images``.
        size: side length in pixels of the square output images.
        return_pca: when true, each image is PCA-compressed with
            ``execute_pca_on_imgs`` before being returned.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays holding the images and their labels.

    Raises:
        FileNotFoundError: if an image cannot be read from disk.
    """
    x, y = [], []
    for num_img in dataset:
        img_path = PATH_JPG + images[num_img - 1]
        im = cv2.imread(img_path)
        if im is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        im = cv2.resize(im, (size, size))
        if return_pca:
            # Compress the in-memory image directly; execute_pca_on_imgs_set
            # expects a DataFrame of file names and returns nothing.
            im = execute_pca_on_imgs(im)
        x.append(im)
        y.append(labels[num_img - 1])

    return np.asarray(x), np.asarray(y)


def get_all_filenames(tar_fn):
    """Return the names of all regular-file members of the tar archive ``tar_fn``.

    Directories and other non-file members are left out.
    """
    file_names = []
    with tarfile.open(tar_fn) as archive:
        for member in archive.getmembers():
            if member.isfile():
                file_names.append(member.name)
    return file_names


def get_img_info(df, path_figure = PATH_JPG):
    """Return the average width and height of the images listed in ``df["Id"]``.

    Args:
        df: DataFrame whose ``Id`` column holds image file names.
        path_figure: directory prefix (with trailing slash) of the images.

    Returns:
        A tuple ``(avg_width, avg_height)``, each truncated to ``int``.

    Raises:
        ValueError: if ``df`` lists no images.
    """
    widths, heights = [], []
    for name in df["Id"].values:
        # Context manager so each file handle is closed as soon as the size
        # has been read.
        with PIL.Image.open(path_figure + str(name)) as im:
            width, height = im.size
        widths.append(width)
        heights.append(height)

    if not widths:
        raise ValueError("Cannot compute average image size: no images listed")

    return int(np.average(widths)), int(np.average(heights))



# Import Data

In [None]:
!rm -r $PATH_JPG # remove old data

rm: cannot remove '/content/jpg/': No such file or directory


In [None]:
# Import Dataset
%%capture
!tar -xvf $PATH_TAR -C '/content/'
images = [f for f in listdir(PATH_JPG) if isfile(join(PATH_JPG, f))]
images = sorted(images)

df = pd.DataFrame()
df['Id'] = images
df['Category'] = scipy.io.loadmat(PATH_PROJ + 'imagelabels.mat')['labels'][0] - 1 
df['Category'] = df['Category'].astype(int)
#df.head(5)

In [None]:
# Split the dataset, keeping every image together with its label
split = scipy.io.loadmat(PATH_PROJ + 'setid.mat')
test_split = split["tstid"][0] - 1  # start from zero
train_split = split["trnid"][0] - 1
valid_split = split["valid"][0] - 1

# Explicit copies: the subsets are modified later (rows are appended to the
# training set), which must not depend on or affect the parent DataFrame.
# The former `subset['Category'].astype(int)` statements discarded their
# result and have been dropped.
train_set = df.iloc[train_split].copy()
test_set = df.iloc[test_split].copy()
val_set = df.iloc[valid_split].copy()
print("Train set:", train_set.shape, "   Validation set:", val_set.shape, "   Test set:", test_set.shape)

Train set: (1020, 2)    Validation set: (1020, 2)    Test set: (6149, 2)


In [None]:
val_set.head()

Unnamed: 0,Id,Category
6772,image_06773.jpg,0
6766,image_06767.jpg,0
6738,image_06739.jpg,0
6748,image_06749.jpg,0
6762,image_06763.jpg,0


In [None]:
# Persist the validation split (without the index column) in the project folder.
val_set.to_csv(f"{PATH_PROJ}val_set_df.csv", index=False)

# Preprocessing & Data Augmentation
Data augmentation artificially increases the size of the training set by
generating many realistic variants of each training instance. This
reduces overfitting, making this a regularization technique. The
generated instances should be as realistic as possible: ideally, given an
image from the augmented training set, a human should not be able to
tell whether it was augmented or not. Simply adding white noise will
not help; the modifications should be learnable (white noise is not).

For example, you can slightly shift, rotate, and resize every picture in
the training set by various amounts and add the resulting pictures to the
training set. This forces the model to be more
tolerant to variations in the position, orientation, and size of the objects
in the pictures. For a model that’s more tolerant of different lighting
conditions, you can similarly generate many images with various
contrasts. In general, you can also flip the pictures horizontally (except
for text, and other asymmetrical objects). By combining these
transformations, you can greatly increase the size of your training set.

If a categorical attribute has a large number of possible categories (e.g., country code, profession, species), then one-hot encoding will result in a large number of input features. This may slow down training and degrade performance. If this happens, you may want to replace the categorical input with useful numerical features related to the categories: for example, you could replace the ocean_proximity feature with the distance to the ocean (similarly, a country code could be replaced with the country’s population and GDP per capita). Alternatively, you could replace each category with a learnable, low-dimensional vector called an embedding. Each category’s representation would be learned during training. This is an example of representation learning.

**Contrast Augmentation:**

LAB color space expresses color variations across three channels. One channel for brightness and two channels for color:
- L-channel: representing lightness in the image
- a-channel: representing change in color between red and green
- b-channel: representing change in color between yellow and blue

In the following, adaptive histogram equalization is performed on the L-channel, and the resulting image is converted back to the RGB color space. This enhances the brightness while also limiting contrast sensitivity.

In [None]:
def contrast_augmentation(img):
    """Return ``img`` stacked horizontally with its CLAHE-enhanced version.

    The image is converted from RGB to LAB, CLAHE (clip limit 2.0, 8x8 tile
    grid) is applied to the lightness channel only, and the result is
    converted back to RGB.

    Args:
        img: RGB image accepted by ``cv2.cvtColor`` with ``COLOR_RGB2LAB``.

    Returns:
        The original image and the enhanced image, side by side.
    """
    # Work in LAB so that only lightness is equalized and colors are kept.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_RGB2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    lab_enhanced = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))

    # Back from LAB to RGB.
    enhanced_img = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2RGB)

    return np.hstack((img, enhanced_img))

In [None]:
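## Skip switch: un-comment the `%%script false` line below (and make it the first line of the cell) to skip this cell; leave it commented to execute.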
# %%script false --no-raise-error

from keras.preprocessing.image import ImageDataGenerator
from IPython.display import clear_output 


def oversample(df, n, dir, img_size):
    """Augment under-represented classes until each has at least ``n`` images.

    For every class in ``df['Category']`` with fewer than ``n`` rows, randomly
    transformed copies of its images are generated with ``ImageDataGenerator``,
    saved into ``dir`` with the prefix ``aug-<label>-``, and appended to ``df``
    as new ``[file name, label]`` rows.

    Args:
        df: DataFrame with columns ``Id`` (file names relative to ``dir``) and
            ``Category`` (class labels), in that order. Modified in place.
        n: target minimum number of images per class.
        dir: directory (with trailing slash) holding the source images; the
            augmented images are written there as well. The name shadows the
            builtin but is kept so existing callers keep working.
        img_size: ``target_size`` passed to ``flow_from_dataframe``.

    Returns:
        The same ``df`` object, extended with one row per augmented image.
    """
    # Work on a copy holding full paths so the generator can locate the files.
    tmp_df = df.copy()
    tmp_df["Id"] = dir + tmp_df["Id"]
    n_classes = tmp_df['Category'].nunique()

    gen = ImageDataGenerator(rotation_range=50,
                             shear_range=0.2,
                             zoom_range=[0.75, 1.25],
                             brightness_range=[0.5, 1.5],
                             width_shift_range=0.1,
                             height_shift_range=0.1,
                             horizontal_flip=True)

    total = 0
    # sort=False keeps the classes in order of first appearance.
    for label, group in tmp_df.groupby('Category', sort=False):
        delta = n - len(group)  # number of augmented images to create
        if delta <= 0:
            continue  # the class already has enough images

        prefix = 'aug-' + str(label).zfill(3) + "-"

        # Snapshot of the folder, so that only files created by this call are
        # registered (a previous run may have left files with the same prefix).
        already_there = set(os.listdir(dir))

        # flow_from_dataframe reads the paths in the 'Id' column, applies the
        # random transformations and saves each generated image to `dir`.
        aug_gen = gen.flow_from_dataframe(group,
                                          x_col='Id', y_col=None,
                                          target_size=img_size,
                                          class_mode=None,
                                          batch_size=1,
                                          shuffle=False,
                                          save_to_dir=dir,
                                          save_prefix=prefix,
                                          color_mode='rgb',
                                          save_format='jpg')

        # Pulling a batch from the generator is what writes images to disk.
        aug_img_count = 0
        while aug_img_count < delta:
            aug_img_count += len(next(aug_gen))
        total += aug_img_count

        # Register the newly created files in the caller's dataframe.
        new_files = sorted(name for name in os.listdir(dir)
                           if name.startswith(prefix) and name not in already_there)
        for name in new_files:
            df.loc[df.index.max() + 1] = [name, label]

        clear_output()
        print(str(label + 1).zfill(3), f" / {n_classes} classes augmented")

    print('Total Augmented images created= ', total)
    return df

# Bring every training class up to `threshold` images, generated at
# IMG_SIZE x IMG_SIZE and stored next to the original images.
threshold = 100
img_size = (IMG_SIZE, IMG_SIZE)
train_set = oversample(df=train_set, n=threshold, dir=PATH_JPG, img_size=img_size)

102  / 102 classes augmented
Total Augmented images created=  9180


In [None]:
# Copy new augmented dataset into a Drive directory

%cp -r /content/jpg /content/gdrive/MyDrive/AML-proj/jpg_augmented300
train_set.to_csv("/content/gdrive/MyDrive/AML-proj/train_set_augmented300.csv", index=False)