In [None]:
import pandas as pd
import itertools
from scipy.special import comb
from collections import defaultdict

In [None]:
df_train = pd.read_csv('../input/shopee-train-with-objects/train_obj_07.csv')
# The dataframe is recovered from a file, so the list-valued columns come back as strings
# and have to be parsed into Python lists again.
# This (costly) step will not be necessary in the test eval, since objects are computed in the same instance.
features = ['class_index', 'confidence', 'area', 'coordinates']
new_cols = [f'objects_{feature}' for feature in features]
for col in new_cols:
    # Series.map only visits the column being converted; a row-wise DataFrame.apply(axis=1)
    # would materialise every full row just to read a single cell.
    # SECURITY NOTE: eval() executes whatever expression the cell contains. Only run this on a
    # trusted file; ast.literal_eval is the safe alternative if the cells are plain literals.
    df_train[col] = df_train[col].map(lambda serialized: eval(serialized))
df_train

In [None]:
# Dataset-wide totals, used as denominators by the statistics cells below.
total_rows = len(df_train)
total_groups = df_train['label_group'].nunique()
# Number of rows per label_group (index: group id, value: row count).
n_elements_per_group = df_train['label_group'].value_counts()

Let's check how many rows do not contain detected objects

In [None]:
# Keep the rows whose list of detected class indices is empty.
df_without_objects = df_train[
    df_train['objects_class_index'].map(lambda detected: detected == [])
]
print(
    'Rows without predicted objects '
    f'{len(df_without_objects)} out of {total_rows} '
    f'({round(len(df_without_objects) / total_rows * 100, 2)}%)'
)

Read the object names from the YOLO list so that a name can be looked up by its class index afterwards

In [None]:
# The context manager closes the file handle as soon as the content has been read,
# instead of leaving it open until the garbage collector reclaims it.
# SECURITY NOTE: eval() executes whatever expression the file contains; only use it on a trusted file.
with open('../input/shopee-train-with-objects/objects_names.txt') as names_file:
    objects_names = eval(names_file.read())

The following cell displays some general stats per object. We are interested in:
* A high inverted ratio (1 - original_ratio, where original_ratio = number of groups / number of rows containing the object), which means that the object appears in few groups, each with a large number of rows
* A high percentage of representation in groups, which means that the object appears in most of the elements inside a group

In [None]:
# Number of rows containing each class, stored in the same order as objects_names.
count_per_object = []

for object_index, object_name in enumerate(objects_names):

    # Rows whose list of detected classes includes the current class.
    contains_object = df_train['objects_class_index'].map(lambda detected: object_index in detected)
    df_with_object = df_train[contains_object]
    count = len(df_with_object)

    count_per_object.append(count)

    # Nothing to report for classes that were never detected.
    if count == 0:
        print(f'No occurences for object {object_name} ({object_index})')
        print()
        continue

    n_groups = df_with_object['label_group'].nunique()
    # Rows containing the object, counted per label_group.
    n_elements_per_group_with_object = df_with_object['label_group'].value_counts()

    print(f'Stats for object {object_name} ({object_index})')
    print(f'\tAppears in {count} rows (out of {total_rows})')
    print(f'\tAppears in {n_groups} groups (out of {total_groups})')
    print(f'\tInverted Ratio: {round(1 - n_groups / count,2)}')
    print(f'\tAppears in {round(n_elements_per_group_with_object.mean(), 2)} elements on average per group (min = {n_elements_per_group_with_object.min()}, max = {n_elements_per_group_with_object.max()})')

    # Completeness per group: share of each group's rows that contain the object.
    percentages = []
    for label_id, value in n_elements_per_group_with_object.items():
        percentages.append(round(value / n_elements_per_group[label_id] * 100, 2))
    print(f'\tThese elements represent {round(sum(percentages)/len(percentages),2)}% of their group on average (min = {min(percentages)}%, max = {max(percentages)}%)')

    # list.index returns the first position of the class, so when a row holds several
    # detections of the same class only the first one's confidence is averaged.
    confidence_avg = df_with_object.apply(
        lambda row: row['objects_confidence'][row['objects_class_index'].index(object_index)],
        axis=1,
    ).mean()
    print(f'\tConfidence average: {confidence_avg}')
    print()

To speed up the following computation, and since we are only interested in whether an object appears or not, let's create a set from the list of detected objects

In [None]:
# Only membership matters in the cells below, so keep one set of class indices per row.
# Series.map(set) converts the column directly; a row-wise DataFrame.apply(axis=1) would
# materialise every full row just to read this one cell.
df_train['objects_class_index_set'] = df_train['objects_class_index'].map(set)

The following function obtains stats related to objects appearing in the same elements (only combinations found in more than one element are counted). Also, when a combination represents a high % of the total elements containing each of its objects, info is printed. That is, if the combination of spoon and fork represents more than 5% of the elements with a spoon and more than 5% of the elements with a fork, info is printed (the 5% threshold is arbitrary)

In [None]:
def group_by_common_objects(n_common_objects: int, min_percentage: float = 5):
    """Print co-occurrence stats for every combination of ``n_common_objects`` object classes.

    A combination "matches" a row when all of its class indices belong to that row's
    ``objects_class_index_set``. Only combinations matching more than one row are taken
    into account. For each of them, the share it represents of every member's own row
    count (``count_per_object``) is computed, and the combination is printed when all of
    those shares are above ``min_percentage``. A summary (number of combinations with
    matches, and average / min / max number of matches) is printed at the end.

    Reads the notebook globals ``df_train``, ``objects_names`` and ``count_per_object``.

    Args:
        n_common_objects: Size of the combinations to analyse (2 for pairs, 3 for triples, ...).
        min_percentage: Threshold, in percent, that every member's share must exceed for
            the combination to be printed.

    Returns:
        None. All results are printed.
    """
    n_objects = len(objects_names)

    # Count the matches of every combination in a single pass over the rows: each row
    # contributes one match to every combination that can be built from the classes it
    # contains. Combinations that match no row are never generated, so the whole frame
    # is no longer re-scanned once per possible combination.
    matches_per_combination = defaultdict(int)
    for element_objects in df_train['objects_class_index_set']:
        # Walking range(n_objects) keeps the indices sorted and ignores anything in the
        # row that is not a valid position in objects_names.
        present = [object_index for object_index in range(n_objects) if object_index in element_objects]
        for combination in itertools.combinations(present, n_common_objects):
            matches_per_combination[combination] += 1

    count = 0
    average = 0
    n_min = float('inf')
    n_max = 0
    # Sorted tuples are visited in the same lexicographic order in which
    # itertools.combinations(range(n_objects), n_common_objects) would produce them.
    for current_objects in sorted(matches_per_combination):
        n_matches = matches_per_combination[current_objects]
        if n_matches <= 1:
            continue

        count += 1
        average = ((count - 1) * average + n_matches) / count  # Incremental average
        n_min = min(n_min, n_matches)
        n_max = max(n_max, n_matches)

        # Share of each member's rows that also contain the rest of the combination.
        percentages = [round(n_matches / count_per_object[object_index] * 100, 2) for object_index in current_objects]

        if all(p > min_percentage for p in percentages):
            print('Stats for objects:', list(current_objects), f'N_MATCHES: {n_matches}')
            for object_index, percentage in zip(current_objects, percentages):
                print(f'\t{objects_names[object_index]} ({object_index}): ORIGINAL_MATCHES {count_per_object[object_index]} - {percentage}%')
            print()

    # exact=True keeps the binomial coefficient an exact Python int (no float round-trip).
    possible_pairs = comb(n_objects, n_common_objects, exact=True)
    # No combination of that size exists when n_common_objects exceeds the number of
    # classes; report 0% instead of dividing by zero.
    matched_percentage = round(count / possible_pairs * 100, 2) if possible_pairs else 0.0
    if count == 0:
        n_min = 0  # nothing matched: do not report the 'inf' sentinel as a minimum
    print('*'*50)
    print(f'Total pairs with matches: {count} out of {possible_pairs} ({matched_percentage}%)')
    print(f'Number of matches stats: Average: {round(average, 2)} Min: {n_min} Max: {n_max}')

In [None]:
#%%time
# Pair analysis (combinations of 2 objects). Runtime recorded by the author: 28.3s
group_by_common_objects(2)

In [None]:
#%%time  # 12 min 3s
#group_by_common_objects(3)
# TODO: function to re-use already computed work with 2 common_objects (only try triples with pairs that have matches)

In [None]:
#%%time
#group_by_common_objects(4)

The following function shows info about combinations of objects that make up the whole set of detected objects for a group (that is, no other object is detected in any element of the group). Thanks to this insight and the information obtained from the previous function, we may be able to better understand which combinations could be decisive and which combinations could be joined to create supergroups

In [None]:
def objects_in_groups(min_objects: int, max_objects: int, min_freq: int = 10):
    """Print the object combinations that make up the full detected set of several groups.

    For every ``label_group`` in the notebook global ``df_train``, the sets in
    ``objects_class_index_set`` are merged into one set. Merged sets whose size lies in
    ``[min_objects, max_objects]`` are tallied, and every combination found in at least
    ``min_freq`` groups is printed, most frequent first, using ``objects_names``.

    Args:
        min_objects: Smallest merged-set size to take into account (inclusive).
        max_objects: Largest merged-set size to take into account (inclusive).
        min_freq: Minimum number of groups a combination needs in order to be printed.

    Returns:
        None. Results are printed.
    """
    combination_freq = defaultdict(int)

    # One merged set of detected classes per label group.
    for _, element_sets in df_train.groupby('label_group')['objects_class_index_set']:
        group_objects = set().union(*element_sets)
        if len(group_objects) < min_objects or len(group_objects) > max_objects:
            continue
        combination_freq[tuple(sorted(group_objects))] += 1

    # Most frequent combinations first; ties keep the order in which they were first seen.
    ranked = sorted(combination_freq.items(), key=lambda entry: entry[1], reverse=True)
    for combination, freq in ranked:
        if freq < min_freq:
            continue
        described = ''.join(f'{objects_names[object_index]} ({object_index}) ' for object_index in combination)
        print(described + 'appear in ', freq, 'different groups')

In [None]:
# Groups whose merged set of detected objects has 2 or 3 classes; min_freq keeps its default of 10.
objects_in_groups(min_objects=2, max_objects=3)