# CAI4104 Final Project: Data Preprocessing

In [3]:
# Uncomment this to install opencv
#%conda install conda-forge::opencv
import os
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import cv2
import utils

# Report the interpreter and core library versions used for this run
print(
    '------------',
    '### Python version: ' + __import__('sys').version,
    f'### NumPy version: {np.__version__}',
    f'### Scikit-learn version: {sklearn.__version__}',
    f'### Tensorflow version: {tf.__version__}',
    '------------',
    sep='\n',
)


------------
### Python version: 3.9.12 | packaged by conda-forge | (main, Mar 24 2022, 23:25:59) 
[GCC 10.3.0]
### NumPy version: 1.22.3
### Scikit-learn version: 0.24.2
### Tensorflow version: 2.7.0
------------


## Download dataset from Kaggle

In [4]:
kaggle_token_path = os.path.dirname(os.getcwd())+"/token"
data_path = os.path.dirname(os.getcwd())+"/data"
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_token_path
os.chmod(kaggle_token_path+"/kaggle.json", 0o600)

import kaggle

current_dir = os.getcwd()
os.chdir(data_path)
!kaggle datasets download ananthu017/emotion-detection-fer --unzip
os.chdir(current_dir)

Dataset URL: https://www.kaggle.com/datasets/ananthu017/emotion-detection-fer
License(s): CC0-1.0
Downloading emotion-detection-fer.zip to /home/adam.benali/EmotionDetector/data
 94%|███████████████████████████████████▌  | 61.0M/65.2M [00:00<00:00, 82.0MB/s]
100%|██████████████████████████████████████| 65.2M/65.2M [00:01<00:00, 67.9MB/s]


## File to NumPy array conversion

In [5]:
# Dataset location, relative to the notebook's working directory
data_path = '../data/'

# Emotion classes; the position of a name in this list is its integer label
categories = [
    'angry',
    'disgusted',
    'fearful',
    'happy',
    'neutral',
    'sad',
    'surprised',
]
num_categories = len(categories)

In [6]:
# Walk the data directory and collect every image found in a folder named
# after one of the categories, together with that category's integer label.
data_list = []
label_list = []
skipped_files = []  # Paths that could not be decoded as images


for (root, dirs, files) in os.walk(data_path):
    # Visit sub-directories in a fixed order so the resulting arrays (and the
    # seeded splits derived from them) do not depend on filesystem ordering
    dirs.sort()

    # The last path component names the category, if this is an image directory
    dir_name = os.path.basename(os.path.normpath(root))

    # Not an image directory
    if dir_name not in categories:
        continue

    # The label is the category's index in the list
    label = categories.index(dir_name)

    # Progress bar reset
    bar = utils.ProgressBar(25, f'Loading images from {root:24}')
    num_imgs = len(files)

    # Add all files for current dir (sorted for the same reproducibility reason)
    for i, f in enumerate(sorted(files)):
        # Load image data
        filepath = os.path.join(root, f)
        img = cv2.imread(filepath)

        if img is None:
            # cv2.imread signals an unreadable file by returning None instead
            # of raising; keeping it would break the array conversion later
            skipped_files.append(filepath)
        else:
            # Add to list
            data_list.append(img)
            label_list.append(label)

        # Update progress bar
        bar.update_display((i + 1) / num_imgs)


print('Loaded all images.')
if skipped_files:
    print(f'Skipped {len(skipped_files)} unreadable file(s).')


Loading images from ../data/train/sad       [#########################] 
Loading images from ../data/train/neutral   [#########################] 
Loading images from ../data/train/fearful   [#########################] 
Loading images from ../data/train/happy     [#########################] 
Loading images from ../data/train/angry     [#########################] 
Loading images from ../data/train/surprised [#########################] 
Loading images from ../data/train/disgusted [#########################] 
Loading images from ../data/test/sad        [#########################] 
Loading images from ../data/test/neutral    [#########################] 
Loading images from ../data/test/fearful    [#########################] 
Loading images from ../data/test/happy      [#########################] 
Loading images from ../data/test/angry      [#########################] 
Loading images from ../data/test/surprised  [#########################] 
Loading images from ../data/test/disgusted  [######

## Sorting and One-Hot Encoding

In [7]:
# Stack the loaded images and their integer labels into arrays
all_x_bgr = np.array(data_list)
all_t_num = np.array(label_list)

# Permutation that orders the samples by ascending label
resort_inds = np.argsort(all_t_num)
all_t_num = all_t_num[resort_inds]

# Average over axis 3 (the channel axis of the loaded images) to get a single
# intensity per pixel, apply the same permutation as the labels, then append a
# trailing axis of length 1
all_x = np.average(all_x_bgr, axis=3)[resort_inds]
all_x = all_x[..., np.newaxis]

# One-hot encoding of the sorted labels
all_t = keras.utils.to_categorical(all_t_num, num_classes=num_categories)

# Per-image shape and sample count
image_shape = all_x.shape[1:]
num_images = all_x.shape[0]

all_x.shape, all_t.shape

((35887, 48, 48, 1), (35887, 7))

## Data scaling and partitioning

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Train, validation, test proportions
prop_vec = np.array([0.7, 0.15, 0.15])
seed = 42

# Flattened data: one row per image, one column per pixel
all_x_flat = all_x.reshape(all_x.shape[0], image_shape[0]*image_shape[1])

# Train-other split
train_prop = prop_vec[0]
train_x_unscaled, tmp_x_unscaled, train_t, tmp_t = train_test_split(all_x_flat, all_t, train_size=train_prop, random_state=seed)

# Fit only on training, so no statistics from the validation/test images leak
# into the preprocessing. Held-out pixels may therefore fall slightly outside
# [0, 1] when they exceed the range seen in training.
# NOTE(review): open question from the original author -- since every pixel is
# known to lie in 0-255, a fixed division by 255 would be a leakage-free
# alternative to a fitted scaler.
scaler = MinMaxScaler()
scaler.fit(train_x_unscaled)
# Transform both splits with the training-derived ranges
train_x_flat = scaler.transform(train_x_unscaled)
tmp_x_flat = scaler.transform(tmp_x_unscaled)

# Validation-test split: val_prop is the validation share of the remainder
val_prop = prop_vec[1] / (1 - train_prop)
val_x_flat, test_x_flat, val_t, test_t = train_test_split(tmp_x_flat, tmp_t, train_size=val_prop, random_state=seed)

# Reshape into images
train_x = train_x_flat.reshape(train_x_flat.shape[0], *image_shape)
val_x = val_x_flat.reshape(val_x_flat.shape[0], *image_shape)
test_x = test_x_flat.reshape(test_x_flat.shape[0], *image_shape)

# Every sample must land in exactly one partition
assert train_x.shape[0] + val_x.shape[0] + test_x.shape[0] == all_x.shape[0]

print('Image Data')
print('-'*36)
print(f'Training Data:   {train_x.shape}')
print(f'Validation Data: {val_x.shape}')
print(f'Test Data:       {test_x.shape}')

Image Data
------------------------------------
Training Data:   (25120, 48, 48, 1)
Validation Data: (5383, 48, 48, 1)
Test Data:       (5384, 48, 48, 1)


In [9]:
# Write the three partitions to one compressed archive; the dictionary keys
# become the array names stored in the file
split_arrays = {
    'train_x': train_x,
    'train_t': train_t,
    'val_x': val_x,
    'val_t': val_t,
    'test_x': test_x,
    'test_t': test_t,
}
np.savez_compressed(data_path+'data', **split_arrays)

## Data Augmentation

In [10]:
def augmentations(images, labels, model, iters=3):
    """Generate `iters` augmented copies of every image.

    Args:
        images: Iterable of images; each image is passed to `model` on its own.
        labels: Sized sequence of labels aligned with `images`.
        model: Callable applied to a single image that returns its augmented
            version.
        iters: Number of augmented copies to produce per image.

    Returns:
        A tuple `(aug_images, aug_labels)` of NumPy arrays. The copies made
        from one source image are adjacent, and each carries that image's
        label.
    """
    image_list = []
    cat_list = []

    bar = utils.ProgressBar(50, 'Generating Augmented Images: ')
    # len() works for arrays and plain sequences alike
    total = len(labels)

    for i, (img, cat) in enumerate(zip(images, labels)):
        for _ in range(iters):
            # Convert each result to a NumPy array as it is produced so the
            # final stacking operates on plain arrays rather than on a long
            # list of framework tensor objects, which is typically much slower
            # for NumPy to convert
            image_list.append(np.array(model(img)))
            cat_list.append(cat)
        frac = (i + 1) / total
        bar.update_display(frac, f'{frac:.2%}')

    return np.array(image_list), np.array(cat_list)

In [11]:
# Random augmentation pipeline: horizontal flip, small rotation, zoom and
# contrast jitter. Each layer receives its own seed derived from the
# notebook's `seed`, so the saved augmented dataset can be regenerated, while
# the distinct offsets keep the layers from sharing one random stream.
augment = keras.Sequential([
    keras.layers.RandomFlip('horizontal', seed=seed), 
    keras.layers.RandomRotation(0.05, seed=seed + 1), 
    keras.layers.RandomZoom(height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2), seed=seed + 2), 
    keras.layers.RandomContrast(factor=0.2, seed=seed + 3)
])

# Number of augmented copies generated per training image
n_augmentations = 5
train_x_aug, train_t_aug = augmentations(train_x, train_t, augment, n_augmentations)

# Store the augmented training set separately from the original split
np.savez_compressed(data_path+'data_aug', train_x=train_x_aug, train_t=train_t_aug)

2024-04-26 17:54:40.051617: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-26 17:54:43.297126: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78911 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:4e:00.0, compute capability: 8.0


Generating Augmented Images: [##------------------------------------------------] 4.08%


KeyboardInterrupt

