Alex Thoennes  
May 30, 2022  
CS583 Final Project  

# Description

This file creates the average and image databases. The databases are written to a CSV file and a pickle file, which make the program more portable: without those two files, every source image would be required to run the mosaic program, which would consume a lot of disk space. If you wish to re-create the CSV and pickle files, download the Visual Genome dataset parts 1 and 2 [here](https://visualgenome.org/api/v0/api_home.html) and extract them to `data/images/`. This program automatically downloads the CIFAR-100 data.

# Imports

In [1]:
import os
from PIL import Image
import cv2
import numpy as np
import pandas as pd
from tensorflow import keras
import re
from pathlib import Path
import pickle as pkl

2022-06-03 14:15:50.170068: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


# Functions

In [2]:
"""
This function determines if an image is a grayscale image.
opencv reads grayscale images as 3 channels and copies the first layer twice.
"""
def is_grayscale(r, g, b):
    """Return True when the three colour channels hold identical pixel values.

    A grayscale image loaded as three channels has the same data in every
    channel, so the channels are compared element by element.

    Args:
        r: Array of red-channel values.
        g: Array of green-channel values.
        b: Array of blue-channel values.

    Returns:
        bool: True if ``r``, ``g`` and ``b`` have the same shape and the same
        value at every position, False otherwise.
    """
    # Compare the actual pixel data. Comparing ``x.all()`` results would only
    # check whether each channel is free of zeros, which says nothing about
    # the channels being equal to one another.
    return bool(np.array_equal(b, g) and np.array_equal(b, r))

In [3]:
"""
Parses the visual genome dataset and adds each image's filename
and average RGB values to a dataframe which is returned
"""
def parse_visual_genome(avg_df):
    """Append per-image average RGB values for every file in ``data/images/``.

    Each directory entry is read with ``cv2.imread``. Entries that OpenCV
    cannot decode, and images reported as grayscale by ``is_grayscale``, are
    skipped. For every remaining image one row is added with the keys
    ``Filename``, ``R Average``, ``G Average`` and ``B Average``.

    Args:
        avg_df: DataFrame to extend. It is not modified in place.

    Returns:
        A new DataFrame holding the rows of ``avg_df`` followed by the new
        rows, or ``avg_df`` itself when no image produced a row.
    """
    # all images should be extracted to this path
    data_path = 'data/images/'

    # Collect the rows first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it on
    # every iteration.
    rows = []
    for image in os.listdir(data_path):
        img = cv2.imread(data_path + image)

        # cv2.imread returns None for anything it cannot decode
        if img is None:
            continue

        # OpenCV stores the channels in B, G, R order
        b, g, r = cv2.split(img)
        if is_grayscale(r, g, b):
            continue

        # calculate the average of each channel for this image
        rows.append({
            'Filename': image,
            'R Average': np.mean(r),
            'G Average': np.mean(g),
            'B Average': np.mean(b),
        })

    if not rows:
        return avg_df

    return pd.concat([avg_df, pd.DataFrame(rows)], ignore_index=True)

In [4]:
"""
Parses the cifar100 dataset and adds each image's filename
and average RGB values to a dataframe which is returned
"""
def parse_cifar_100(avg_df, images, file_prefix, ext):
    """Append per-image average RGB values for a sequence of image arrays.

    Each image is labelled ``file_prefix + <index> + ext``, where the index
    counts from 0 in iteration order. The image is cast to ``uint8`` and the
    mean of each of its three channels is recorded under the keys
    ``R Average``, ``G Average`` and ``B Average``.

    Args:
        avg_df: DataFrame to extend. It is not modified in place.
        images: Iterable of height x width x 3 arrays whose last axis is in
            R, G, B order.
        file_prefix: Text placed before the image index in ``Filename``.
        ext: Text placed after the image index in ``Filename``.

    Returns:
        A new DataFrame holding the rows of ``avg_df`` followed by one row per
        image, or ``avg_df`` itself when ``images`` is empty.
    """
    # NOTE(review): no image file is written here, yet compile_img_database
    # reads 'data/images/<Filename>' for every row -- confirm the CIFAR files
    # exist on disk before building the image database.

    # Collect the rows first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it on
    # every iteration.
    rows = []
    for img_idx, image in enumerate(images):
        # averaging over height and width leaves one mean per channel
        r_avg, g_avg, b_avg = np.uint8(image).mean(axis=(0, 1))
        rows.append({
            'Filename': file_prefix + str(img_idx) + ext,
            'R Average': r_avg,
            'G Average': g_avg,
            'B Average': b_avg,
        })

    if not rows:
        return avg_df

    return pd.concat([avg_df, pd.DataFrame(rows)], ignore_index=True)

In [5]:
"""
takes all images and writes them to a pickle file for portability
"""
def compile_img_database(avg_df):
    """Write the raw channel data of every listed image to a pickle file.

    For each name in ``avg_df['Filename']`` the image is read from
    ``data/images/`` and split into its channels. One row per image, with the
    keys ``Filename``, ``R``, ``G`` and ``B`` (each channel a ``uint8``
    array), is collected into a DataFrame which is then pickled to
    ``data/img_database.pkl``.

    Args:
        avg_df: DataFrame with a ``Filename`` column naming the images.

    Raises:
        FileNotFoundError: If an image cannot be read by ``cv2.imread``.
    """
    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it on
    # every iteration.
    rows = []
    for file in avg_df['Filename']:
        path = 'data/images/' + file
        image = cv2.imread(path)

        # cv2.imread returns None instead of raising; fail with the path so
        # the missing or undecodable file can be identified.
        if image is None:
            raise FileNotFoundError('could not read image: ' + path)

        # OpenCV stores the channels in B, G, R order
        b, g, r = cv2.split(image)
        rows.append({
            'Filename': file,
            'R': r.astype(np.uint8),
            'G': g.astype(np.uint8),
            'B': b.astype(np.uint8),
        })

    img_df = pd.DataFrame(rows, columns=['Filename', 'R', 'G', 'B'])
    img_df.to_pickle('data/img_database.pkl')

In [6]:
"""
takes the database of RGB averages and writes it to a csv file for portability
"""
def compile_avg_database(avg_df):
    """Build the table of average RGB values and save it as a CSV file.

    The table starts empty, is filled by ``parse_visual_genome`` and then by
    ``parse_cifar_100`` for the CIFAR-100 train and test images, and is
    written to ``data/avg_database.csv`` without the index column.

    Args:
        avg_df: Ignored; a fresh empty DataFrame replaces it immediately.

    Returns:
        The DataFrame that was written to the CSV file.
    """
    # start from an empty table, whatever the caller passed in
    averages = pd.DataFrame(columns=['Filename','R Average', 'G Average','B Average'])

    # Visual Genome images come first
    averages = parse_visual_genome(averages)

    # then both CIFAR-100 splits; the labels are not needed
    (train_images, _), (test_images, _) = keras.datasets.cifar100.load_data()
    cifar_splits = ((train_images, 'cifar_train_'), (test_images, 'cifar_test_'))
    for split_images, prefix in cifar_splits:
        averages = parse_cifar_100(averages, split_images, prefix, '.jpeg')

    averages.to_csv('data/avg_database.csv', index=False)

    return averages

# Database Creations

In [12]:
# Build the database of average RGB values, starting from an empty table;
# compile_avg_database also writes it to data/avg_database.csv.
avg_df = compile_avg_database(
    pd.DataFrame(columns=['Filename', 'R Average', 'G Average', 'B Average'])
)

# Reload the averages from the CSV file that was just written.
avg_df = pd.read_csv('data/avg_database.csv')

# Build the database of raw image data for every file listed in the averages.
compile_img_database(avg_df)