In [25]:
import os
from PIL import Image
import cv2
import numpy as np
import pandas as pd
import pickle
from tensorflow import keras
import re
from pathlib import Path
import pickle as pkl

In [26]:
# opencv reads grayscale images as 3 channels and copies the first layer twice
def is_grayscale(r, g, b):
    return b.all() == g.all() and b.all() == r.all()

In [27]:
def parse_visual_genome(avg_df):
    # read in all images and calculate the average RGB
    data_path = 'data/images/'

    for image in os.listdir(data_path):
            full_path = data_path + image
            img = cv2.imread(full_path)
            if img is not None:
                b,g,r = cv2.split(img)
                if not is_grayscale(r, g, b):
                    avg_dict = {'Filename':image, 'R Average':np.mean(r), 'G Average':np.mean(g), 'B Average':np.mean(b)}
                    avg_df = avg_df.append(avg_dict, ignore_index=True)

    return avg_df

In [28]:
def parse_cifar_100(avg_df, images, file_prefix, ext):
    # data_size = images.shape[0]
    img_idx = 0
    
    for image in images:
        # get file name
        filename = file_prefix+str(img_idx)+ext
        # save cifar img
        pil_img = Image.fromarray(np.uint8(image))
        pil_img.save('data/images/'+filename)
        
        # obtain channels
        r,g,b = pil_img.split()
        
        # calculate and save avgs
        avg_dict = {'Filename':filename, 'R Average':np.mean(r), 'G Average':np.mean(g), 'B Average':np.mean(b)}
        
        # append to df
        avg_df = avg_df.append(avg_dict, ignore_index=True)
        
        # move to next file
        img_idx += 1
        
    return avg_df

In [29]:
def compile_img_database():
    # regex = re.compile(r'^([^.]+)')
    img_df = pd.DataFrame(columns=['Filename','R', 'G','B'])
    
    for file in avg_df['Filename']:
        image = cv2.imread('data/images/'+file)
        b,g,r = cv2.split(image)
        img_dict = {'Filename':file, 'R':r.astype(np.uint8), 'G':g.astype(np.uint8), 'B':b.astype(np.uint8)}
        img_df = img_df.append(img_dict, ignore_index=True)
    
    img_df.to_pickle('data/img_database.pkl')
    # img_df.to_csv('data/img_database.csv', index=False)

In [30]:
def compile_avg_database(avg_df):
    # create the dataframe
    avg_df = pd.DataFrame(columns=['Filename','R Average', 'G Average','B Average'])
    
    # parse the VG data
    avg_df = parse_visual_genome(avg_df)
    
    # parse the CIFAR data
    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
    avg_df = parse_cifar_100(avg_df, x_train, 'cifar_train_', '.jpeg')
    avg_df = parse_cifar_100(avg_df, x_test, 'cifar_test_', '.jpeg')
    
    avg_df.to_csv('data/avg_database.csv', index=False)

In [None]:
compile_avg_database(avg_df)
compile_img_database()

In [None]:
df = pd.read_pickle('data/img_database.pkl')
for row in df.values:
    name = row[0]
    r = row[1]
    g = row[2]
    b = row[3]
    image = np.dstack([r,g,b])
    Image.fromarray(image.astype(np.uint8)).show()
    break

In [None]:
img_df

In [124]:
print(len(img_df) == len())

True


In [166]:
# def compile_img_database():
#     regex = re.compile(r'^([^.]+)')
#     files = 0
#     avg_df = pd.read_csv('data/avg_database.csv')
#     for file in avg_df['Filename']:
#         image = cv2.imread('data/images/'+file)
#         b,g,r = cv2.split(image)
#         name = regex.findall(file)[0]
#         name = name + '.pkl'
        
#         with open('data/numpy_pickles/'+ name,'wb') as f:
#             pkl.dump(r, f)
#             pkl.dump(g, f)
#             pkl.dump(b, f)
#             files += 1
#     print(files)

In [169]:
def compile_img_database():
    regex = re.compile(r'^([^.]+)')
    files = 0
    avg_df = pd.read_csv('data/avg_database.csv')
    for file in avg_df['Filename']:
        # image = cv2.imread('data/images/'+file)
        Image.open('data/images/'+file).save('data/numpy_images/'+file)
    print(files)

In [170]:
compile_img_database()

0


In [171]:
print(len(os.listdir('data/numpy_pickles/')))
print(len(os.listdir('data/numpy_images/')))
print(len(avg_df['Filename']))

67685
67685
94233


In [154]:
a = np.array([[1,2,3],[4,5,6]])
b = np.array([[7,8,9],[10,11,12]])
with open('test.pkl','wb') as f:
    pkl.dump(a, f)
    pkl.dump(b, f)

In [156]:
with open('test.pkl','rb') as f:
    x = pickle.load(f)
    print(x)
    y = pickle.load(f)
    print(y)

[[1 2 3]
 [4 5 6]]
[[ 7  8  9]
 [10 11 12]]


In [190]:
img_df.to_pickle("img_df.pkl")

In [191]:
r = pd.read_pickle("img_df.pkl")
r

Unnamed: 0,Filename,R Average,G Average,B Average
0,cifar_train_43545.jpeg,"[[92, 99, 102, 103, 114, 123, 123, 114, 134, 1...","[[103, 107, 106, 103, 107, 112, 108, 96, 115, ...","[[87, 92, 92, 93, 99, 106, 103, 92, 111, 78, 9..."
1,cifar_train_26037.jpeg,"[[255, 222, 232, 236, 235, 255, 255, 232, 242,...","[[240, 206, 216, 219, 218, 240, 235, 212, 224,...","[[189, 155, 164, 167, 164, 187, 184, 162, 176,..."
2,2333608.jpg,"[[25, 23, 56, 31, 44, 47, 70, 61, 73, 91, 100,...","[[23, 33, 79, 59, 71, 74, 102, 100, 109, 121, ...","[[8, 34, 97, 83, 90, 91, 125, 131, 143, 149, 1..."
3,cifar_test_8149.jpeg,"[[84, 82, 60, 60, 87, 83, 78, 104, 133, 109, 8...","[[91, 90, 71, 74, 104, 103, 99, 127, 154, 127,...","[[107, 101, 77, 74, 96, 91, 84, 111, 139, 111,..."
4,cifar_train_5736.jpeg,"[[124, 92, 97, 98, 102, 106, 92, 112, 132, 99,...","[[134, 102, 104, 104, 108, 109, 94, 112, 132, ...","[[63, 31, 34, 34, 38, 40, 28, 48, 70, 40, 25, ..."
...,...,...,...,...
94228,cifar_test_9995.jpeg,"[[133, 108, 94, 80, 103, 172, 132, 126, 139, 1...","[[187, 167, 158, 143, 160, 219, 153, 94, 41, 4...","[[93, 75, 71, 64, 91, 164, 114, 79, 54, 75, 13..."
94229,cifar_test_9996.jpeg,"[[110, 117, 132, 137, 129, 128, 137, 140, 128,...","[[124, 131, 146, 153, 148, 149, 160, 168, 158,...","[[65, 70, 85, 90, 82, 80, 88, 94, 84, 77, 71, ..."
94230,cifar_test_9997.jpeg,"[[24, 27, 19, 13, 16, 13, 9, 15, 16, 30, 33, 1...","[[35, 39, 31, 25, 28, 24, 19, 23, 21, 33, 33, ...","[[37, 39, 31, 25, 28, 26, 21, 26, 27, 42, 45, ..."
94231,cifar_test_9998.jpeg,"[[80, 93, 19, 46, 82, 105, 62, 81, 57, 57, 51,...","[[87, 99, 21, 44, 76, 94, 47, 61, 36, 31, 26, ...","[[79, 89, 10, 31, 60, 74, 24, 36, 9, 4, 0, 62,..."


In [198]:
print(type(r['R Average'][0]))
print(len(r['R Average'][0]))
print(r['R Average'][0][25])

<class 'numpy.ndarray'>
32
[132  89  99 122 112  74  37  39  66  97  51  41  33  22  21  49  58   7
  31  37  28  39  28  38  36  46  52  57  75 106 129 138]
