In [None]:
import time
from tqdm import tqdm
import pandas as pd
import timeit #Для проверки времени работы программы
from multiprocessing.dummy import Pool
from PIL import Image
import matplotlib.pyplot as plt
import collections
import os
import ast
from pandas.core.common import flatten
from more_itertools import sliced
import numpy as np
import urllib.request

In [None]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


# All posts labeling

After manually labeling image merges taken from different parts of each user's profile, we created the `labeled_merges_upd.csv` file, which contains the merge names (including the user ids) and their labels.

The file `posts_for_merge.csv` is the main file: it lists all posts together with their image URLs (it was prepared in the `selected_merges_creation.ipynb` notebook).

This notebook contains the code for automatic post labeling, based on the manually assigned category labels of the selected user image merges.

In [None]:
# Input tables: the manual merge labels (semicolon-separated) and the posts with image URLs.
LABELED_MERGES_CSV = 'labeled_merges_upd.csv'
POSTS_CSV = 'posts_for_merge.csv'

merges_with_labeling = pd.read_csv(LABELED_MERGES_CSV, sep=';')
posts_for_labeling = pd.read_csv(POSTS_CSV)

In [None]:
# Derive numeric ids from the merge name:
#   - user_id  : integer before the first underscore
#   - merge_id : integer between 'merge' and the next '.', shifted to be 0-based
def _user_id_from_name(merge_name):
    return int(merge_name.split('_')[0])


def _merge_id_from_name(merge_name):
    number_part = merge_name.split('merge')[1]
    return int(number_part.split('.')[0]) - 1


merges_with_labeling['user_id'] = merges_with_labeling['merge_name'].map(_user_id_from_name).tolist()
merges_with_labeling['merge_id'] = merges_with_labeling['merge_name'].map(_merge_id_from_name).tolist()

In the `labeled_merges_upd` dataset, three photo merges are labeled for each user:
* the first merge of the downloaded period
* the central merge of the period
* the last merge of the period

This is done in order to track whether the quality of content in the user's profile has changed during the selected download period.

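If two consecutive labels of a user are the same, we assume that the quality of this user's content did not change between the two labeled merges, and every photo merge between them is assigned the same label. If only one pair of consecutive labels matches, only the merges within that pair's range are labeled; if all three labels differ, only the three manually labeled merges are kept.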

In [None]:
# Labels assignment for the other photo merges according to the hypothesis above
def get_user_df(user_id):
    """Return the posts of a single user with a fresh 0-based index.

    Rows are selected from the notebook-level ``posts_for_labeling`` frame
    where the ``id`` column equals ``user_id``; their relative order is kept.
    """
    belongs_to_user = posts_for_labeling['id'] == user_id
    user_posts = posts_for_labeling.loc[belongs_to_user]
    return user_posts.reset_index(drop=True)

def get_labels_for_user(user_id):
    """Return the manual labels of one user together with their merge ids.

    Rows are read from the notebook-level ``merges_with_labeling`` frame and
    ordered by ``merge_id`` so that position 0/1/2 of the result always goes
    from the earliest to the latest labeled merge, independent of the row
    order in the source CSV.

    Parameters
    ----------
    user_id : int
        Value to match against the ``user_id`` column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``lbl`` values and the matching ``merge_id`` values (same length,
        both empty when the user has no labeled merges).
    """
    # Filter once instead of building the same boolean mask twice.
    user_rows = merges_with_labeling.loc[merges_with_labeling['user_id'] == user_id]
    # Stable sort: rows that share a merge_id keep their original relative order.
    user_rows = user_rows.sort_values('merge_id', kind='mergesort')
    return user_rows['lbl'].values, user_rows['merge_id'].values

def create_merges(user_id, lbls_lst, merge_id_start=0, merge_id_end=2, only_borders=False, merge_size=9):
    """Build labeled photo merges (windows of consecutive image URLs) for one user.

    A merge is ``merge_size`` consecutive entries of the user's ``im_url``
    column. Three reference merges exist per user: the first one (start 0),
    the central one and the last one; ``merge_id_start`` / ``merge_id_end``
    are positions (0, 1 or 2) in that list of reference merges.

    Parameters
    ----------
    user_id : int
        User whose posts are fetched with ``get_user_df``.
    lbls_lst : sequence
        Labels to assign. Only ``lbls_lst[0]`` is used in the default mode;
        ``lbls_lst[0..2]`` are used when ``only_borders`` is true.
    merge_id_start, merge_id_end : int, default 0 and 2
        Reference merges delimiting the range. Every merge whose start post
        lies between the two reference starts (both inclusive) is produced,
        shifting the window by one post at a time.
    only_borders : bool, default False
        When true, ignore the range and produce only the three reference
        merges, labeled with ``lbls_lst[0]``, ``lbls_lst[1]``, ``lbls_lst[2]``.
    merge_size : int, default 9
        Number of images in one merge.

    Returns
    -------
    tuple of four lists
        ``(merge_labels, merge_names, images_to_merge, user_names)``, all of
        the same length. Merge names are ``'<user_id>_merge<start + 1>'``.
        All four lists are empty when the user has fewer than ``merge_size``
        posts.
    """
    user_df = get_user_df(user_id)

    last_start = len(user_df) - merge_size
    if last_start < 0:
        # Not enough posts for a single full merge: previously this either
        # raised KeyError or silently produced nothing, depending on the branch.
        return [], [], [], []

    # Start positions of the first, central and last merge.
    labeled_merges_ids = [0, last_start // 2, last_start]

    if only_borders:
        start_ids = labeled_merges_ids
        merge_labels = [lbls_lst[lbl_id] for lbl_id in range(3)]
    else:
        # Both reference starts are included, hence the +1 on the exclusive range bound.
        start_ids = range(labeled_merges_ids[merge_id_start],
                          labeled_merges_ids[merge_id_end] + 1)
        merge_labels = [lbls_lst[0] for _ in start_ids]

    urls = user_df['im_url'].tolist()
    # Each merge is stored as the list of image links it is made of.
    images_to_merge = [urls[start_id:start_id + merge_size] for start_id in start_ids]
    # Merge numbering in the names is 1-based.
    merge_names = [f'{user_id}_merge{start_id + 1}' for start_id in start_ids]
    user_names = [user_id] * len(start_ids)

    return merge_labels, merge_names, images_to_merge, user_names

def create_labeled_merges(user_ids):
    """Collect labeled merges for every user in ``user_ids``.

    For each user the manual labels are fetched with ``get_labels_for_user``
    and turned into merges with ``create_merges``:

    * two labels: merges are created only when the two merge ids differ by
      exactly one and the labels are equal;
    * three labels: the range covered by equal consecutive labels is labeled
      (all three equal -> whole range, first two equal -> first half,
      last two equal -> second half); when all three differ only the three
      labeled merges themselves are kept;
    * any other number of labels: the user contributes nothing.

    Returns ``(labels, names, images, user_ids)`` as four parallel lists.
    """
    collected_labels = []
    collected_names = []
    collected_imgs = []
    collected_users = []

    for user_id in user_ids:
        lbl_vals, merges_ids = get_labels_for_user(user_id)
        n_labels = len(lbl_vals)

        # Default: nothing produced for this user.
        produced = ([], [], [], [])

        if n_labels == 2:
            # NOTE(review): merges_ids are forwarded as merge_id_start / merge_id_end
            # of create_merges — confirm that is the intended meaning of these values.
            adjacent = merges_ids[1] - merges_ids[0] == 1
            if adjacent and (lbl_vals[0] == lbl_vals[1]):
                produced = create_merges(user_id, [lbl_vals[0]], merges_ids[0], merges_ids[1])
        elif n_labels == 3:
            if lbl_vals[0] == lbl_vals[1]:
                # First and central agree; extend to the end only if the last agrees too.
                range_end = 2 if lbl_vals[1] == lbl_vals[2] else 1
                produced = create_merges(user_id, [lbl_vals[0]], 0, range_end)
            elif lbl_vals[1] == lbl_vals[2]:
                produced = create_merges(user_id, [lbl_vals[1]], 1, 2)
            else:
                produced = create_merges(
                    user_id,
                    [lbl_vals[0], lbl_vals[1], lbl_vals[2]],
                    0,
                    2,
                    only_borders=True,
                )

        user_labels, user_merge_names, user_imgs, user_id_column = produced
        collected_users.extend(user_id_column)
        collected_imgs.extend(user_imgs)
        collected_names.extend(user_merge_names)
        collected_labels.extend(user_labels)

    return collected_labels, collected_names, collected_imgs, collected_users

In [None]:
# Run the labeling once per distinct user found in the manual labels.
labeled_user_ids = list(set(merges_with_labeling['user_id'].values))
(
    all_merges_labels,
    all_merges_names,
    all_merges_imgs,
    all_users_ids,
) = create_labeled_merges(labeled_user_ids)

In [None]:
# One row per merge: its name, label, owner and the list of image links.
all_merges_dict = {
    'merge_name': all_merges_names,
    'merge_lbl': all_merges_labels,
    'user_id': all_users_ids,
    'images': all_merges_imgs,
}
all_merges_df = pd.DataFrame(all_merges_dict)

In [None]:
# Write the labeled merges to a CSV file for the later image-download step.
ALL_LABELED_MERGES_CSV = 'all_labeled_merges.csv'
all_merges_df.to_csv(ALL_LABELED_MERGES_CSV)