In [4]:
import ast
import os
import random
import timeit
import urllib.request
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd

# Final images data creation

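This notebook contains three parts:
1. Uploading all users' images, grouped by labeled class (i.e. downloading them from their URLs)
2. Collecting information about the uploaded images in a dataframe
3. Creating merges from the collected images

The first part provides the code for uploading the images and storing them by user and by prepared category label.

The second part provides the function that collects information about the stored images into a dataframe.

The third part provides the final function that creates merges from these images. This data formed the training and validation sets.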

## Uploading all users images by labeled classes

In [None]:
# Load the labeled merges, keeping only the columns used below.
# The `images` column is stored in the CSV as the string form of a Python list,
# so it is parsed back into a real list while the file is read (this also avoids
# assigning into a column-subset of the frame afterwards).
merge_columns = ['merge_name', 'user_id', 'images', 'class_lbl']
merges = pd.read_csv(
    'all_labeled_merges.csv',
    usecols=merge_columns,
    converters={'images': ast.literal_eval},
)[merge_columns]

In [None]:
# Expand the per-merge lists of image urls into one flat list with a row per image.
# Every row carries the five fields that the `df_images` columns expect:
# merge name, generated image name ('<merge_name>__<position>'), url, user id, class label.
merge_list_list = list(merges.merge_name)
images_list = list(merges.images)
image_download_list = []
for merge_name, image_urls, user_id, class_lbl in zip(
        merge_list_list, images_list, merges.user_id, merges.class_lbl):
    for k, image_url in enumerate(image_urls):
        new_name = '%s__%i' % (merge_name, k)
        image_download_list.append([merge_name, new_name, image_url, user_id, class_lbl])

In [None]:
# One row per image url; the column names are listed once and reused for the frame
image_columns = ['merge_name', 'image_name', 'im_url', 'user_id', 'class_lbl']
df_images = pd.DataFrame(data=image_download_list, columns=image_columns)

In [None]:
from more_itertools import sliced

In [None]:
# Rows are (image_name, im_url, class_lbl, user_id) tuples, split into chunks of
# `group_len` rows (for multiprocessing)
group_len = 115
image_fields = ['image_name', 'im_url', 'class_lbl', 'user_id']
images = list(df_images[image_fields].itertuples(index=False, name=None))
images_groups = list(sliced(images, group_len))

In [None]:
# helper function for images downloading
def images_downloader(posts, main_path):
    """Download every image described in `posts` below `main_path`.

    Parameters
    ----------
    posts : sequence of rows
        Each row is indexable as ``(image_name, im_url, ...)``. A row with exactly
        four fields, ``(image_name, im_url, class_lbl, user_id)``, is stored in
        ``<main_path>/<class_lbl>/<user_id>/``; any other row is stored directly
        in ``main_path``.
    main_path : str
        Root directory for the downloaded files.

    Downloading is best-effort: a failed download is reported with ``print`` and
    the remaining rows are still processed.
    """
    for post in posts:
        # Build the target folder from scratch for every row, so the class/user
        # sub-folders of earlier rows never leak into later ones.
        target_dir = main_path
        if len(post) == 4:
            target_dir = os.path.join(main_path, str(post[2]), str(post[3]))
        try:
            os.makedirs(target_dir, exist_ok=True)
            urllib.request.urlretrieve(post[1], os.path.join(target_dir, post[0]))
        except Exception as err:
            print('failed to download %s: %s' % (post[1], err))
            
# main function for parallel downloading images by prepared urls
def parallel_downloading(images_groupes, it=0, step=10, n_processes=10, main_path='.'):
    """Download all image groups with a pool of worker processes.

    Parameters
    ----------
    images_groupes : sequence
        Groups of image rows; every group is handed to one `images_downloader` call.
    it : int
        Index of the first group to process (allows resuming a previous run).
        For compatibility with calls that pass the destination directory as the
        second positional argument, a path given here is used as `main_path`
        and processing starts from group 0.
    step : int
        Number of groups submitted to the pool at once.
    n_processes : int
        Number of worker processes per pool.
    main_path : str
        Root directory passed on to `images_downloader`.

    The index of the first group of every portion and the elapsed time are printed.
    """
    if isinstance(it, (str, os.PathLike)):
        main_path, it = it, 0

    start = timeit.default_timer()
    worker = partial(images_downloader, main_path=main_path)

    # range() also covers the last, possibly shorter, portion of groups
    for portion_start in range(it, len(images_groupes), step):
        print(portion_start)
        pool = Pool(n_processes)  # set processes number
        try:
            pool.map(worker, images_groupes[portion_start:portion_start + step])
        finally:
            pool.close()
            pool.join()
        print('time - %f' % (timeit.default_timer() - start))

In [None]:
%%time
# Start downloading all image groups. '<final_images_data_path>' is a placeholder for the
# intended destination directory (images are meant to be stored by class and by user id).
# NOTE(review): the path is passed in the second positional slot - verify that
# parallel_downloading treats that argument as the destination directory.
parallel_downloading(images_groups, '<final_images_data_path>')

## Collecting information about uploaded images in dataframe

**Initial state:** all images should be placed in the folders with their user id, which should be placed in folders with the class label to which they belong.

**Example of folder structure:**

* uploaded_images
    * brand
        * Potapova.a
            * image1
            * image2
            * ...
    * food
        * best_kitchen_ever
            * image1
            * image2
            * ...

The main directory `uploaded_images` is passed as the input of the `create_csv_for_uploaded_images` function.

Below is a list of all classes. All class subfolders in the main directory must have exactly these names.

In [5]:
# Labels of all 9 classes
lbl_classes = [
    'brand',
    'lifestyle',
    'thematic',
    'food',
    'bad_brand',
    'bad_lifestyle',
    'bad_thematic',
    'bad_food',
    'bad_beauty_services',
]

In [6]:
# Function for collecting information about uploaded images to a DataFrame
def create_csv_for_uploaded_images(imgs_directory, classes=None):
    """List the images stored as ``<imgs_directory>/<class>/<username>/<image>``.

    Parameters
    ----------
    imgs_directory : str
        Main directory that contains one sub-folder per class label.
    classes : list of str, optional
        Class sub-folders to scan; defaults to the notebook-level `lbl_classes`.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_name``, ``username``, ``class``, ``is_carousel`` and
        ``carousel_num``. An image whose name ends in ``_<digit>.jpg`` is marked
        as a carousel image (``is_carousel`` = 1) and ``carousel_num`` holds that
        digit; every other image gets ``is_carousel`` = 0 and ``carousel_num`` = -1.
        Only rows with ``carousel_num <= 1`` are returned, so further carousel
        images are left out.

    Raises
    ------
    FileNotFoundError
        If a class sub-folder does not exist.
    """
    if classes is None:
        classes = lbl_classes

    # Collect plain records first and build the frame once (no concat in a loop).
    records = []
    for lbl_class in classes:
        class_dir = os.path.join(imgs_directory, lbl_class)
        # sorted() makes the row order independent of the file system
        for username in sorted(os.listdir(class_dir)):
            user_dir = os.path.join(class_dir, username)
            if not os.path.isdir(user_dir):
                continue  # stray file next to the user folders
            for image_name in sorted(os.listdir(user_dir)):
                records.append((image_name, username, lbl_class))

    all_imgs_df = pd.DataFrame(records, columns=['image_name', 'username', 'class'])

    # Position inside the carousel, taken from a trailing '_<digit>.jpg'
    carousel_idx = all_imgs_df['image_name'].str.extract(r'_([0-9])\.jpg$', expand=False)
    all_imgs_df['is_carousel'] = carousel_idx.notna().astype(float)
    all_imgs_df['carousel_num'] = pd.to_numeric(carousel_idx).astype(float).fillna(-1)

    # save only first carousel images or images from posts without carousel
    return all_imgs_df[all_imgs_df['carousel_num'] <= 1].reset_index(drop=True)

In [8]:
# Build the image index for the main images directory (placeholder path) and
# save it to 'final_images_processed.csv' without the row index
df = create_csv_for_uploaded_images('<final_images_data_path>')
df.to_csv('final_images_processed.csv', index=False)

## Merges creating for the collected images

Cases when images are not square are also handled

In [166]:
# Load the image index from 'final_images_processed.csv'
posts_df = pd.read_csv('final_images_processed.csv')

In [133]:
from PIL import Image

In [145]:
def crop_image(img):
    """Return a centered square crop of `img`.

    The side of the square equals the shorter side of the image, so the result is
    exactly square even when the difference between height and width is odd.
    An image that is already square is returned unchanged.
    """
    if img.height == img.width:
        return img

    side = min(img.width, img.height)
    left = (img.width - side) // 2
    top = (img.height - side) // 2
    return img.crop((left, top, left + side, top + side))

# function for 9 images concatenation in the form of 3x3 square
def concat_imgs(imgs_lst):
    """Paste the first nine images of `imgs_lst` row by row onto a 3x3 RGB canvas.

    The tile size is taken from the first image; tile ``k`` lands in row ``k // 3``
    and column ``k % 3``. The new canvas image is returned.
    """
    first = imgs_lst[0]
    tile_h = first.height
    tile_w = first.width
    canvas = Image.new('RGB', (3 * tile_w, 3 * tile_h))
    for position in range(9):
        row, col = divmod(position, 3)
        canvas.paste(imgs_lst[position], (tile_w * col, tile_h * row))
    return canvas

# main function for images merging
def merge_images(parameters, class_dir, merges_name):
    """Build one 3x3 merge from the given images and save it as a jpg.

    Parameters
    ----------
    parameters : sequence of str
        Image file names, resolved relative to
        ``'<uploaded_images_by_classes_path>/' + class_dir``. At least nine are
        required; `concat_imgs` uses the first nine.
    class_dir : str
        ``'<class>/<user>/'`` sub-path of the images; its first component is the
        folder the merge is saved to.
    merges_name : str
        File name (without extension) of the saved merge.

    Returns
    -------
    int
        0 when the merge was saved, 1 when it could not be created (fewer than
        nine images, or an image that is missing or cannot be read).
    """
    if len(parameters) < 9:
        return 1

    uploaded_imgs = []
    for img_name in parameters:
        try:
            # the context manager releases the file handle once the tile is built
            with Image.open('<uploaded_images_by_classes_path>/' + class_dir + img_name) as img:
                uploaded_imgs.append(crop_image(img).resize((150, 150)))
        except OSError:
            # covers a missing file as well as a file that is not a readable image
            return 1

    out_dir = '<final_merges_directory_path>/' + class_dir.split('/')[0]
    os.makedirs(out_dir, exist_ok=True)
    concat_imgs(uploaded_imgs).save(out_dir + '/' + merges_name + '.jpg')
    return 0

In [111]:
# maximum total number of final merges for 1 user
n_total = 500
# we choose 9 random images for each 20 sequential images
batch_size = 20

In [None]:
# Merges creation (from prepared uploaded images, using prepared csv with info from above)
# and saving merges in the main directory by classes labels.
# The images of every (class, user) pair are split into consecutive windows of `batch_size`
# images, and every merge is built from 9 images drawn at random from one window.
images_per_merge = 9
max_failures = 5  # failed attempts allowed per window before moving on to the next one
for (class_lbl, user_id), user_posts in posts_df.groupby(['class', 'username']):
    all_user_images = user_posts['image_name'].tolist()
    class_dir = '%s/%s/' % (class_lbl, user_id)

    n_batches = int(np.ceil(len(all_user_images) / batch_size))
    # spread the per-user budget of n_total merges evenly over the windows
    n_per_batch = max(1, n_total // n_batches)

    for start_id in range(n_batches):
        images = all_user_images[start_id * batch_size:(start_id + 1) * batch_size]
        if len(images) < images_per_merge:
            continue  # not enough images in this window for a 3x3 merge

        it = 0
        n_tries = 0
        while it < n_per_batch and n_tries < max_failures:
            # sample() leaves the window (and the user's image list) untouched
            images_new = random.sample(images, images_per_merge)
            merge_name = '%s_merge_%i' % (user_id, start_id * n_per_batch + it + 1)
            if merge_images(images_new, class_dir, merge_name):
                n_tries += 1  # failed: retry the same merge number with other images
            else:
                it += 1

As a result, we obtain the final data: image merges for all nine classes. We used this data to train and validate the neural network model.