In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [8]:
import os
import cv2
import PIL
import numpy as np
from tqdm.notebook import tqdm
import random
from PIL import Image, ImageStat
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path
%matplotlib inline  

In [3]:
# Every project directory hangs off the same folder on the mounted drive.
_PROJECT_ROOT = os.path.join('/content/drive/My Drive/', 'megapolis_football')

# Input images; the scan of this folder turns sub-folder paths into labels.
IMAGES_FOLDER = os.path.join(_PROJECT_ROOT, 'labeled_images')
# Location of the cached arrays written by np.savez.
INTERMEDIATE_FOLDER = os.path.join(_PROJECT_ROOT, 'intermediate')
CACHE_FILE = os.path.join(INTERMEDIATE_FOLDER, 'cache_array.npz')
# Destination for the scaled images when SAVE_PROCESSED_IMAGES is enabled.
PROCESSED_IMAGES_FOLDER = os.path.join(_PROJECT_ROOT, 'processed_images')

In [4]:
# Target image size, kept in both axis orders.
HW = (80, 32)  # (height, width)
WH = HW[::-1]  # (width, height) -- the order PIL expects for sizes

# Switch on to also write the processed (scaled) images to disk for inspection.
SAVE_PROCESSED_IMAGES = False

In [5]:
# Collect (file_path, label) pairs for every file below IMAGES_FOLDER.
# The label is the tuple of path components of the containing folder,
# taken relative to IMAGES_FOLDER.
data = []
for folder, _, files in tqdm(os.walk(IMAGES_FOLDER)):
    # The label depends only on the folder, so build it once per directory.
    rel_dir = os.path.relpath(folder, IMAGES_FOLDER)
    label = tuple(rel_dir.split(os.sep))
    data.extend((os.path.join(folder, file_name), label) for file_name in files)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [6]:
# Per-channel statistics are estimated on a small random subset: there is no
# need to go over all samples, around 30 images is enough.
k = min(len(data), 30)
mean = []  # one [R, G, B] list of channel means per sampled image
std = []   # one [R, G, B] list of channel standard deviations per sampled image
# random.sample draws WITHOUT replacement (random.choices could pick the same
# image several times), which is also why k is capped at len(data) above.
for path, _ in tqdm(random.sample(data, k=k)):
    # The context manager closes the underlying file handle once done.
    with Image.open(path) as image_pil:
        # Force three bands so grayscale / RGBA / palette files yield lists of
        # the same length as RGB ones; otherwise averaging over axis 0 breaks.
        stat = ImageStat.Stat(image_pil.convert('RGB'))
    mean.append(stat.mean)
    std.append(stat.stddev)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [7]:
# Average the per-image channel statistics into a single value per channel.
# (The two sources were previously swapped: rgb_mean was built from `std`
# and rgb_std from `mean`, which made the standardization step wrong.)
rgb_mean = np.mean(mean, axis=0)
rgb_std = np.mean(std, axis=0)

In [19]:
# Scale every image to fit inside a WH canvas, standardize it and collect the
# result together with its label and source path (three parallel lists).
images = []
labels = []
paths = []
if SAVE_PROCESSED_IMAGES:
    # Image.save does not create missing directories, so make sure the
    # destination exists before the first write.
    os.makedirs(PROCESSED_IMAGES_FOLDER, exist_ok=True)
for path, label in tqdm(data):
    with Image.open(path) as image:
        # Fixed-size black canvas; the scaled image is pasted onto it so all
        # samples end up with the same shape.
        blank_image = Image.new('RGB', WH, 'black')
        # thumbnail() resizes in place, keeping the aspect ratio.
        image.thumbnail(WH, Image.BICUBIC)
        blank_image.paste(image, (0, 0))  # add to left upper corner
        image_pil_np = np.asarray(blank_image)
        images.append((image_pil_np - rgb_mean) / rgb_std)  # Standardization
        labels.append(label)
        paths.append(path)
        if SAVE_PROCESSED_IMAGES:
            # File name is built from the last three path components joined by '_'.
            blank_image.save(os.path.join(PROCESSED_IMAGES_FOLDER, '_'.join(path.split(os.sep)[-3:])))

HBox(children=(FloatProgress(value=0.0, max=503.0), HTML(value='')))




In [20]:
# Turn each label into its string representation.
labels = list(map(str, labels))

In [21]:
def eliminate_rare(images, labels, paths, min_count=2):
    """Drop every sample whose label occurs fewer than ``min_count`` times.

    Parameters
    ----------
    images, labels, paths : sequence
        Parallel sequences; position ``i`` in each one describes the same
        sample. Labels must be hashable.
    min_count : int, optional
        Minimum number of occurrences a label needs for its samples to be
        kept. Defaults to 2, i.e. labels that occur only once are removed.

    Returns
    -------
    tuple of np.ndarray
        ``(images, labels, paths)`` with the rare samples removed, each built
        with ``np.stack`` and keeping the original order.

    Raises
    ------
    ValueError
        Raised by ``np.stack`` when no sample is left after filtering.
    """
    label_counts = Counter(labels)
    # Decide per sample from the counts themselves rather than from the
    # truthiness of the label values, so falsy labels (0, '') are handled too.
    # A set gives O(1) membership tests in the filters below.
    rare_indexes = {
        j for j, label in enumerate(labels) if label_counts[label] < min_count
    }

    def _keep(sequence):
        # Stack the elements whose position is not flagged as rare.
        return np.stack([item for i, item in enumerate(sequence) if i not in rare_indexes])

    return _keep(images), _keep(labels), _keep(paths)



In [22]:
# Remove samples whose label is too rare; the three names are rebound to the
# arrays returned by eliminate_rare.
images, labels, paths = eliminate_rare(images, labels, paths)

In [23]:
# Sanity check: the three arrays must have the same number of entries along axis 0.
assert images.shape[0] == labels.shape[0] == paths.shape[0]

In [24]:
# np.savez does not create missing directories, so make sure the folder that
# holds the cache file exists before writing.
os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)
# Store the arrays together with the normalization statistics in one archive.
np.savez(CACHE_FILE, images=images, labels=labels, paths=paths, rgb_mean=rgb_mean, rgb_std=rgb_std)

In [None]:
# Read the cache back as a quick check of what was written. np.load on an
# .npz archive returns an object that keeps the file open, so use it as a
# context manager to release the handle.
with np.load(CACHE_FILE) as cache:
    cached_images_shape = cache['images'].shape
cached_images_shape

(503, 80, 32, 3)