In [2]:
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-train-vrd-labels.csv -P vrd/train
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-train-vrd-bbox.csv -P vrd/train
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-train-vrd.csv -P vrd/train

# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-validation-vrd-labels.csv -P vrd/valid
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-validation-vrd-bbox.csv -P vrd/valid
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-validation-vrd.csv -P vrd/valid
    
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-classes-vrd.csv -P vrd/meta
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-attributes-description.csv -P vrd/meta
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-relationships-description.csv -P vrd/meta
# !wget https://storage.googleapis.com/openimages/challenge_2019/challenge-2019-relationship-triplets.csv -P vrd/meta

In [10]:
# !sudo $(which pip) install lightgbm

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gc
import lightgbm as lgb
gc.collect()

4

In [2]:
# Input CSV locations for the training and validation splits. Despite the
# `_dir` suffix every value is a file path: each (bbox, vrd) pair is passed to
# get_dfs(), which loads both files with pd.read_csv.
train_bbox_dir = "vrd/train/challenge-2019-train-vrd-bbox.csv"
train_vrd_dir = "vrd/train/challenge-2019-train-vrd.csv"
valid_bbox_dir = "vrd/valid/challenge-2019-validation-vrd-bbox.csv"
valid_vrd_dir = "vrd/valid/challenge-2019-validation-vrd.csv"

In [3]:
# Path (a file, not a directory) of the detection submission CSV that is later
# read into df_bbox_submission.
sub_dir = "scripts/submission_levelled.csv"

In [4]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to smaller dtypes.

    Every numeric (including boolean) column is inspected. A column whose
    values are all whole numbers is cast to the narrowest integer dtype that
    passes the range checks below; every other numeric column is cast to
    ``float32`` (which can lose precision for float64 data). Non-numeric
    columns (strings, categoricals, datetimes, ...) are left untouched.
    Per-column progress and the before/after memory footprint are printed.

    Missing values are replaced by ``column.min() - 1`` before the integer
    test, because NumPy integer dtypes cannot hold NaN.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    tuple
        ``(props, NAlist)``: the same (mutated) frame and the list of column
        names in which missing values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        # Only numeric/bool columns can be downcast; anything else (object,
        # string, categorical, datetime) would fail in max()/isfinite().
        if not pd.api.types.is_numeric_dtype(props[col].dtype):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        # Integer does not support NA, therefore, NA needs to be filled.
        # Assign the result back instead of a chained in-place fillna, which
        # is a no-op under pandas Copy-on-Write.
        if props[col].isna().any():
            NAlist.append(col)
            props[col] = props[col].fillna(props[col].min() - 1)

        # Take the bounds AFTER filling so the sentinel (min - 1) is part of
        # the range; otherwise a column with min 0 would be sent down the
        # unsigned branch while containing -1.
        values = props[col]
        mx = values.max()
        mn = values.min()

        # A column is integral only if EVERY value survives the round trip
        # through int64. (Summing the differences lets +0.5 and -0.5 cancel
        # and lets tiny fractions slip under a tolerance.) Columns holding
        # inf, or still all-NaN, cannot be cast to int and stay floating.
        IsInt = False
        if np.isfinite(values).all():
            IsInt = bool((values == values.astype(np.int64)).all())

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                # Strict '<' is kept from the original thresholds, so a column
                # whose max equals a dtype's max moves to the next wider type.
                for unsigned in (np.uint8, np.uint16, np.uint32):
                    if mx < np.iinfo(unsigned).max:
                        props[col] = props[col].astype(unsigned)
                        break
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                # If no signed type matches (e.g. empty column, NaN bounds)
                # the column is left unchanged.
                for signed in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(signed)
                    if mn > limits.min and mx < limits.max:
                        props[col] = props[col].astype(signed)
                        break

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

def create_unique_id(df):
    """Return a Series of string keys, one per row of ``df``.

    Each key is the string form of ImageID, both label names and the eight
    box coordinates (box 1 then box 2, each as XMin, XMax, YMin, YMax),
    joined with underscores.
    """
    trailing_columns = (
        "LabelName1", "LabelName2",
        "XMin1", "XMax1", "YMin1", "YMax1",
        "XMin2", "XMax2", "YMin2", "YMax2",
    )
    unique_id = df["ImageID"].astype(str)
    for column in trailing_columns:
        unique_id = unique_id + '_' + df[column].astype(str)
    return unique_id

def create_unique_is_id(df):
    """Return a Series of string keys for "is" rows of ``df``.

    Same layout as the pair key but without LabelName2: ImageID, LabelName1
    and the eight box coordinates (box 1 then box 2, each as XMin, XMax,
    YMin, YMax), joined with underscores.
    """
    trailing_columns = (
        "LabelName1",
        "XMin1", "XMax1", "YMin1", "YMax1",
        "XMin2", "XMax2", "YMin2", "YMax2",
    )
    unique_id = df["ImageID"].astype(str)
    for column in trailing_columns:
        unique_id = unique_id + '_' + df[column].astype(str)
    return unique_id

def get_unique_id(row):
    """Return the underscore-joined string key for a single row.

    ``row`` is indexed by column name (a dict or a pandas Series both work);
    the fields are ImageID, both label names and the eight box coordinates
    (box 1 then box 2, each as XMin, XMax, YMin, YMax).
    """
    key_fields = (
        'ImageID', 'LabelName1', 'LabelName2',
        'XMin1', 'XMax1', 'YMin1', 'YMax1',
        'XMin2', 'XMax2', 'YMin2', 'YMax2',
    )
    return '_'.join(str(row[field]) for field in key_fields)

def rel_match(row, df):
    """Look up the entry of ``df`` whose index label equals ``row.name``.

    Parameters
    ----------
    row : object with a ``name`` attribute (e.g. a pandas Series)
        Its ``name`` is used as the label to look up.
    df : object supporting ``.loc[label]`` (e.g. a pandas DataFrame)

    Returns
    -------
    Whatever ``df.loc[row.name]`` yields, or ``None`` when the label is not
    present (or is of a type the index cannot look up).
    """
    try:
        return df.loc[row.name]
    # Only lookup failures mean "no match". A bare except would also swallow
    # KeyboardInterrupt/SystemExit and hide genuine programming errors.
    except (KeyError, TypeError):
        return None
    
def get_dfs(bbox_dir, vrd_dir):
    """Load one split's two CSV files and separate "is" rows from the rest.

    Parameters
    ----------
    bbox_dir : str
        Path of the bounding-box CSV file (a file path despite the name).
    vrd_dir : str
        Path of the relationship CSV file; it must have a
        ``RelationshipLabel`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_bbox, df_vrd_is, df_vrd_rel)``: the box frame, the relationship
        rows whose ``RelationshipLabel`` equals ``"is"``, and all remaining
        relationship rows. Both relationship frames are independent copies,
        so callers can add or overwrite columns without pandas raising
        SettingWithCopyWarning.

    Both frames are passed through reduce_mem_usage() before use; the list of
    NaN-filled columns it reports is not needed here and is discarded.
    """
    df_bbox = pd.read_csv(bbox_dir)
    df_bbox, _ = reduce_mem_usage(df_bbox)

    df_vrd = pd.read_csv(vrd_dir)
    df_vrd, _ = reduce_mem_usage(df_vrd)

    # One mask, used twice: rows with a missing label compare unequal to "is"
    # and therefore land in the "rel" frame, exactly as with `!= "is"`.
    is_mask = df_vrd["RelationshipLabel"] == "is"
    df_vrd_is = df_vrd[is_mask].copy()
    df_vrd_rel = df_vrd[~is_mask].copy()

    return df_bbox, df_vrd_is, df_vrd_rel

def get_is_df(df_bbox, df_possible_is_triplets):
    """Build candidate "is" rows from single detections.

    A detection is kept when its ``LabelName`` occurs in the ``LabelName1``
    column of ``df_possible_is_triplets``. Each kept detection becomes one
    output row in which box 1 and box 2 (coordinates and confidence) are both
    that same detection, ``RelationshipLabel`` is ``'is'`` and ``LabelName2``
    is an empty string.

    Parameters
    ----------
    df_bbox : pandas.DataFrame
        Needs the columns ImageID, LabelName, Confidence, XMin, XMax, YMin
        and YMax.
    df_possible_is_triplets : pandas.DataFrame
        Needs a ``LabelName1`` column listing the eligible labels.

    Returns
    -------
    pandas.DataFrame
        Rows in the order of ``df_bbox`` with a fresh 0..n-1 index. The
        fourteen columns are present even when nothing matches.
    """
    # A set gives constant-time membership and removes duplicate labels.
    possible_is_labels = set(df_possible_is_triplets["LabelName1"].values)
    print("Total 'Is' Iterations:",len(df_bbox))

    # Vectorised filter instead of a Python-level loop over iterrows().
    keep = df_bbox["LabelName"].isin(possible_is_labels)
    candidates = df_bbox[keep].reset_index(drop=True)

    df_is = pd.DataFrame({
        'ImageID': candidates["ImageID"],
        'LabelName1': candidates["LabelName"],
        'LabelName2': "",
        'RelationshipLabel': 'is',
        "Confidence1": candidates["Confidence"],
        "Confidence2": candidates["Confidence"],
        'XMax1': candidates["XMax"],
        'XMax2': candidates["XMax"],
        'XMin1': candidates["XMin"],
        'XMin2': candidates["XMin"],
        'YMax1': candidates["YMax"],
        'YMax2': candidates["YMax"],
        'YMin1': candidates["YMin"],
        'YMin2': candidates["YMin"],
    })
    return df_is

def get_rel_df(df_bbox, possible_rel_tuples):
    """Build candidate relationship rows from ordered pairs of detections.

    Within every image (rows grouped by ``ImageID``) each ordered pair of two
    different detections is kept when ``(LabelName of first, LabelName of
    second)`` is one of ``possible_rel_tuples``. ``RelationshipLabel`` is left
    as an empty string in every produced row.

    Parameters
    ----------
    df_bbox : pandas.DataFrame
        Needs the columns ImageID, LabelName, Confidence, XMin, XMax, YMin
        and YMax.
    possible_rel_tuples : iterable of (str, str)
        Allowed (LabelName1, LabelName2) combinations.

    Returns
    -------
    pandas.DataFrame
        One row per accepted pair, in image order then pair order. The
        fourteen columns are present even when no pair is accepted.
    """
    columns = [
        'ImageID', 'LabelName1', 'LabelName2', 'RelationshipLabel',
        'Confidence1', 'Confidence2',
        'XMax1', 'XMax2', 'XMin1', 'XMin2',
        'YMax1', 'YMax2', 'YMin1', 'YMin2',
    ]
    # The membership test runs once per box pair, so use a set rather than
    # scanning the caller's list each time.
    allowed_pairs = {tuple(pair) for pair in possible_rel_tuples}
    # Labels that can never be the first element let us skip a whole inner loop.
    allowed_subjects = {pair[0] for pair in allowed_pairs}

    df_grp = df_bbox.groupby("ImageID")
    list_dict_rel_other = []

    for key, grp in tqdm(df_grp):
        list_rec = grp.to_dict('records')
        for index1, row1 in enumerate(list_rec):
            if row1["LabelName"] not in allowed_subjects:
                continue
            for index2, row2 in enumerate(list_rec):
                if index1 == index2:
                    continue
                if (row1["LabelName"], row2["LabelName"]) not in allowed_pairs:
                    continue
                list_dict_rel_other.append({
                    'ImageID': key,
                    'LabelName1': row1["LabelName"],
                    'LabelName2': row2["LabelName"],
                    'RelationshipLabel': "",
                    "Confidence1": row1["Confidence"],
                    "Confidence2": row2["Confidence"],
                    'XMax1': row1["XMax"],
                    'XMax2': row2["XMax"],
                    'XMin1': row1["XMin"],
                    'XMin2': row2["XMin"],
                    'YMax1': row1["YMax"],
                    'YMax2': row2["YMax"],
                    'YMin1': row1["YMin"],
                    'YMin2': row2["YMin"],
                })

    # Passing the column list keeps the schema stable for an empty result.
    df_rel = pd.DataFrame(list_dict_rel_other, columns=columns)
    return df_rel

In [5]:
# Load both splits. For each one get_dfs returns the box frame, the rows whose
# RelationshipLabel is "is", and the remaining relationship rows.
df_train_bbox, df_train_vrd_is, df_train_vrd_rel = get_dfs(train_bbox_dir, train_vrd_dir)
df_valid_bbox, df_valid_vrd_is, df_valid_vrd_rel = get_dfs(valid_bbox_dir, valid_vrd_dir)

Memory usage of properties dataframe is : 175.70877075195312  MB
******************************
Column:  XMin
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  XMax
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  YMin
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  YMax
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  IsGroupOf
dtype before:  int64
dtype after:  int8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  103.54269981384277  MB
This is  58.92858926206552 % of the initial size
Memory usage of properties dataframe is : 34.31111145019531  MB
******************************
Column:  XMin1
dtype before:  float64
dtype after:  float32
******************************
*****************

In [6]:
# Allowed triplets from the challenge metadata, separated into "is" triplets
# and every other relationship.
df_possible_triplets = pd.read_csv("vrd/meta/challenge-2019-relationship-triplets.csv")
_triplet_is_mask = df_possible_triplets["RelationshipLabel"] == "is"
df_possible_is_triplets = df_possible_triplets[_triplet_is_mask]
df_possible_rel_triplets = df_possible_triplets[~_triplet_is_mask]

In [7]:
# Every label that can take part in an allowed triplet: first-slot labels of
# the relationship triplets, then their second-slot labels, then first-slot
# labels of the "is" triplets. Each source is de-duplicated on its own, so a
# label may still appear more than once in the combined list.
_label_sources = (
    df_possible_rel_triplets["LabelName1"],
    df_possible_rel_triplets["LabelName2"],
    df_possible_is_triplets["LabelName1"],
)
possible_clss = [label for source in _label_sources for label in source.unique()]
possible_clss

['/m/01599',
 '/m/05r655',
 '/m/04yx4',
 '/m/01bl7v',
 '/m/01yrx',
 '/m/09tvcd',
 '/m/03bt1vf',
 '/m/0bt9lr',
 '/m/0cmx8',
 '/m/04ctx',
 '/m/04dr76w',
 '/m/02p5f1q',
 '/m/03qrc',
 '/m/02jvh9',
 '/m/01mzpv',
 '/m/0dt3t',
 '/m/01_5g',
 '/m/04bcr3',
 '/m/0h8my_4',
 '/m/03k3r',
 '/m/0l14j_',
 '/m/080hkjn',
 '/m/05_5p_0',
 '/m/01940j',
 '/m/0k4j',
 '/m/01226z',
 '/m/01y9k5',
 '/m/078n6m',
 '/m/05r5c',
 '/m/05ctyq',
 '/m/0wdt60w',
 '/m/01mzpv',
 '/m/0h2r6',
 '/m/04ctx',
 '/m/0dt3t',
 '/m/03m3pdh',
 '/m/050k8',
 '/m/0pg52',
 '/m/029bxz',
 '/m/026t6',
 '/m/0199g',
 '/m/0cvnqh',
 '/m/0bwd_0j',
 '/m/0584n8',
 '/m/03ssj5',
 '/m/02hj4',
 '/m/04_sv',
 '/m/0dv5r',
 '/m/02p5f1q',
 '/m/01_5g',
 '/m/02jvh9',
 '/m/019w40',
 '/m/09tvcd',
 '/m/01599',
 '/m/0dv9c',
 '/m/071p9',
 '/m/07y_7',
 '/m/0bt9lr',
 '/m/0342h',
 '/m/0cmx8',
 '/m/078jl',
 '/m/04dr76w',
 '/m/01f91_',
 '/m/01s55n',
 '/m/0hg7b',
 '/m/01yrx',
 '/m/06__v',
 '/m/03qrc',
 '/m/08pbxl',
 '/m/0fx9l',
 '/m/04dr76w',
 '/m/04bcr3',
 '/m/01_5g',
 '

In [8]:
# Row counts of the allowed "is" triplets and of the other relationship triplets.
len(df_possible_is_triplets), len(df_possible_rel_triplets)

(42, 287)

In [9]:
# Load the detection submission: one row per image with an ImageId and a
# PredictionString column.
# NOTE(review): the preview also shows an `Unnamed: 0` column, presumably an
# index written out when the CSV was saved; it is not used by the parsing code.
df_bbox_submission = pd.read_csv(sub_dir)

In [10]:
# Preview the first rows of the submission frame.
df_bbox_submission.head()

Unnamed: 0,ImageId,PredictionString
0,00000b4dcff7f799,/m/0dzf4 0.29 0.8515 0.29339 0.922970000000000...
1,00001a21632de752,/m/03q69 0.62 0.56335 0.58706 0.80463000000000...
2,0000d67245642c5f,/m/031n1 0.06 0.0 0.12647 0.71125 0.90622 /m/0...
3,0001244aa8ed3099,/m/035r7c 0.25 0.24611999999999998 0.06044 0.7...
4,000172d1dd1adce0,/m/04hgtk 0.12 0.29667 0.35318 0.31774 0.38799...


In [None]:
# Turn the submission's PredictionString column into one row per detection,
# keeping only classes that can take part in an allowed triplet, then build the
# candidate relationship / "is" frames for the test images.
#
# A PredictionString is parsed as repeated groups of
#   <label> <confidence> <XMin> <YMin> <XMax> <YMax>
# where every label starts with "/m/", which is what the split below relies on.

# Set lookup: possible_clss is a list containing duplicates, and the membership
# test runs once per predicted box.
possible_cls_set = set(possible_clss)
bbox_columns = ["LabelName", "Confidence", "XMin", "YMin", "XMax", "YMax", "ImageID"]

list_bboxes = []
# Iterate over the two needed columns directly (cheaper than iterrows) and pass
# the row count so the progress bar can show real progress.
for image_id, arrList in tqdm(
    zip(df_bbox_submission["ImageId"], df_bbox_submission["PredictionString"]),
    total=len(df_bbox_submission),
):
    # An image without predictions is read by pandas as NaN (a float), which
    # has no .split(); such rows contribute no boxes.
    if not isinstance(arrList, str):
        continue
    predictList = arrList.split("/m/")[1:]
    for pred in predictList:
        predItem = pred.split(" ")
        clsId = "/m/"+predItem[0]
        if clsId not in possible_cls_set:
            continue
        confidence = float(predItem[1])
        XMin, YMin, XMax, YMax = [float(x) for x in predItem[2:6]]
        list_bboxes.append({
            "LabelName":clsId,
            "Confidence":confidence,
            "XMin":XMin,
            "YMin":YMin,
            "XMax":XMax,
            "YMax":YMax,
            "ImageID":image_id
        })

# Explicit columns keep the frame usable (e.g. groupby("ImageID")) even when no
# box survived the class filter.
df_sub_fmt_bboxes = pd.DataFrame(list_bboxes, columns=bbox_columns)
possible_rel_tuples = [tuple(x) for x in df_possible_rel_triplets[["LabelName1","LabelName2"]].values]
df_test_rel = get_rel_df(df_sub_fmt_bboxes, possible_rel_tuples)
df_test_is = get_is_df(df_sub_fmt_bboxes, df_possible_is_triplets)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=95525), HTML(value='')))

In [None]:
# Category vocabularies taken from the training relationships: the sorted
# distinct labels found in either label slot, and the sorted distinct
# relationship labels.
_stacked_object_labels = df_train_vrd_rel[['LabelName1','LabelName2']].stack()
cats = _stacked_object_labels.sort_values().unique()
print(cats)
_stacked_relationship_labels = df_train_vrd_rel[['RelationshipLabel']].stack()
rel_cats = _stacked_relationship_labels.sort_values().unique()
print(rel_cats, len(rel_cats))

In [None]:
def _encode_label_columns(frame, with_relationship):
    """Return a copy of ``frame`` with its label columns as pandas Categoricals.

    LabelName1 and LabelName2 use the shared ``cats`` vocabulary so their
    category codes agree across frames; RelationshipLabel (when
    ``with_relationship`` is true) uses ``rel_cats``. Values that are not in
    the vocabulary become NaN, which is how pd.Categorical treats unknown
    values.
    """
    encoded = {
        column: pd.Categorical(frame[column], categories=cats)
        for column in ('LabelName1', 'LabelName2')
    }
    if with_relationship:
        encoded['RelationshipLabel'] = pd.Categorical(frame['RelationshipLabel'], categories=rel_cats)
    # assign() builds a new frame instead of writing into what may be a
    # filtered slice of another frame, so no SettingWithCopyWarning is raised
    # and the cell can be re-run safely.
    return frame.assign(**encoded)


df_train_vrd_rel = _encode_label_columns(df_train_vrd_rel, with_relationship=True)
df_valid_vrd_rel = _encode_label_columns(df_valid_vrd_rel, with_relationship=True)
# The test candidates' RelationshipLabel is left as it is; only the two label
# columns are encoded.
df_test_rel = _encode_label_columns(df_test_rel, with_relationship=False)

In [None]:
# Show the first rows of the validation, training and test relationship frames.
for _frame in (df_valid_vrd_rel, df_train_vrd_rel, df_test_rel):
    display(_frame.head())

In [None]:
# Write each prepared frame to its own HDF5 file in table format, overwriting
# any existing file. Order: relationship frames first, then the "is" frames.
_hdf_exports = [
    (df_train_vrd_rel, 'vrd/df_train_vrd_rel.h5', 'df_train_vrd_rel'),
    (df_valid_vrd_rel, 'vrd/df_valid_vrd_rel.h5', 'df_valid_vrd_rel'),
    (df_test_rel, 'vrd/df_test_rel.h5', 'df_test_rel'),
    (df_train_vrd_is, 'vrd/df_train_vrd_is.h5', 'df_train_vrd_is'),
    (df_valid_vrd_is, 'vrd/df_valid_vrd_is.h5', 'df_valid_vrd_is'),
    (df_test_is, 'vrd/df_test_is.h5', 'df_test_is'),
]
for _frame, _h5_path, _h5_key in _hdf_exports:
    _frame.to_hdf(_h5_path, key=_h5_key, mode='w', format='t')