In [None]:
from collections import defaultdict
import cv2
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

import config

In [None]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in a single output area.

    Every argument is converted with ``to_html()`` and the opening ``<table``
    tag of each result is given an inline ``display`` style so the tables
    flow horizontally instead of stacking.

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames).
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening tag: a blanket replace of 'table' would also
    # rewrite the closing tags and any cell text containing the word "table".
    html_str = html_str.replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)
    
from IPython.display import display, HTML

CSS_Flex = """
.output {
    flex-direction: row;
}
"""

CSS_Orig = """
.output {
    flex-direction: column;
}
"""
def row_wise():
    """Lay cell outputs out horizontally by injecting the ``CSS_Flex`` rule."""
    # The HTML object has to be handed to display(); merely constructing it
    # inside a function renders nothing, because only a cell's last
    # expression is auto-displayed.
    display(HTML('<style>{}</style>'.format(CSS_Flex)))
    
def column_wise():
    """Restore vertical stacking of cell outputs by injecting ``CSS_Orig``."""
    # The HTML object has to be handed to display(); merely constructing it
    # inside a function renders nothing, because only a cell's last
    # expression is auto-displayed.
    display(HTML('<style>{}</style>'.format(CSS_Orig)))

# Import Tasks and Label Data

In [None]:
# Load the task map and derive lookup tables from it.
# Every taskName has the form "<object>:<attribute>".
with open(config.TASK_MAP_FILE) as task_map_file:  # close the handle deterministically
    tasks_json = json.load(task_map_file)

tasks = defaultdict(dict)            # object -> {attribute -> taskId}
task_attr_sets = defaultdict(set)    # object -> set of attribute names
task_label_sets = defaultdict(set)   # object -> set of taskIds
id_to_task_dict = {}                 # taskId -> full taskName
for task in tasks_json['taskInfo']:
    obj, attr = task['taskName'].split(':')
    id_to_task_dict[task['taskId']] = task['taskName']
    tasks[obj][attr] = task['taskId']
    task_attr_sets[obj].add(attr)
    task_label_sets[obj].add(task['taskId'])

In [None]:
# Load the label map and build a labelId -> labelName lookup.
with open(config.LABEL_MAP_FILE) as label_map_file:  # close the handle deterministically
    labels_json = json.load(label_map_file)

label_id_to_label_map = {
    label_info['labelId']: label_info['labelName']
    for label_info in labels_json['labelInfo']
}

In [None]:
# Pairwise overlap between objects: cell (a, b) counts the attribute names
# (first matrix) or the task ids (second matrix) that objects a and b share.
objects = list(tasks.keys())
attr_rows = [
    [len(task_attr_sets[first] & task_attr_sets[second]) for second in objects]
    for first in objects
]
label_rows = [
    [len(task_label_sets[first] & task_label_sets[second]) for second in objects]
    for first in objects
]
df_attr_intersection = pd.DataFrame(attr_rows, index=objects, columns=objects)
df_id_intersection = pd.DataFrame(label_rows, index=objects, columns=objects)

In [None]:
# taskId -> object name (the text before the first ':' in taskName, trimmed).
task_id_to_object_map = {
    task_info['taskId']: task_info['taskName'].split(":")[0].strip()
    for task_info in tasks_json['taskInfo']
}

In [None]:
# taskId -> attribute name (the second ':'-separated field of taskName, trimmed).
task_id_to_attr_map = {
    task_info['taskId']: task_info['taskName'].split(":")[1].strip()
    for task_info in tasks_json['taskInfo']
}

# Object And Attribute/TaskID Intersection

In [None]:
# Two heatmaps side by side: shared attribute names (left) and shared
# task ids (right) for every pair of objects.
fig, (ax_attr, ax_id) = plt.subplots(1, 2, figsize=(10, 2))
ax_attr.set_title('Task Attribute Intersection')
sns.heatmap(df_attr_intersection, annot=True, ax=ax_attr)
ax_id.set_title('Task ID Intersection')
sns.heatmap(df_id_intersection, annot=True, ax=ax_id)
plt.show()

# Import Train and Valid Data

In [None]:
def _collect_annotation_rows(annotations, image_filenames):
    """Return [imageId, labelId, taskId] rows for annotations whose image exists.

    Args:
        annotations: iterable of dicts with 'imageId', 'labelId' and 'taskId'.
        image_filenames: file names present in the image directory; an
            annotation is kept only if "<imageId>.jpg" is among them.

    Returns:
        A list of three-element lists, in annotation order.
    """
    # A set makes each membership test O(1); scanning the directory listing
    # as a list for every annotation is quadratic overall.
    available = set(image_filenames)
    return [
        [annotation['imageId'], annotation['labelId'], annotation['taskId']]
        for annotation in tqdm(annotations)
        if annotation['imageId'] + ".jpg" in available
    ]


# Training split: keep only the annotations whose image was downloaded.
with open(config.TRAIN_DATA_FILE) as train_file:  # close the handle deterministically
    train_data = json.load(train_file)
downloaded_train_data = os.listdir(config.TRAIN_IMAGES_DIR)
train_rows = _collect_annotation_rows(train_data['annotations'], downloaded_train_data)
df_train = pd.DataFrame(train_rows, columns=['imageId', 'labelId', 'taskId'])

# Validation split: same procedure.
with open(config.VALID_DATA_FILE) as valid_file:
    valid_data = json.load(valid_file)
downloaded_valid_data = os.listdir(config.VALID_IMAGES_DIR)
valid_rows = _collect_annotation_rows(valid_data['annotations'], downloaded_valid_data)
df_valid = pd.DataFrame(valid_rows, columns=['imageId', 'labelId', 'taskId'])

# Data Dump

In [None]:
# Attach human-readable columns with vectorized lookups instead of writing
# one cell at a time through iterrows()/.at, which is very slow on large frames.
df_train['object'] = df_train['taskId'].map(task_id_to_object_map)
df_train['attribute'] = df_train['taskId'].map(task_id_to_attr_map)
df_train['label'] = df_train['labelId'].map(label_id_to_label_map)
# Series.map yields NaN for ids missing from the dict; keep the fail-fast
# behaviour a plain dict lookup would have had.
if df_train[['object', 'attribute', 'label']].isna().any().any():
    raise KeyError('df_train contains a taskId or labelId with no entry in the lookup maps')

In [None]:
# Number of validation annotations kept after filtering to downloaded images.
len(df_valid)

In [None]:
# Attach human-readable columns with vectorized lookups instead of writing
# one cell at a time through iterrows()/.at, which is very slow on large frames.
df_valid['object'] = df_valid['taskId'].map(task_id_to_object_map)
df_valid['attribute'] = df_valid['taskId'].map(task_id_to_attr_map)
df_valid['label'] = df_valid['labelId'].map(label_id_to_label_map)
# Series.map yields NaN for ids missing from the dict; keep the fail-fast
# behaviour a plain dict lookup would have had.
if df_valid[['object', 'attribute', 'label']].isna().any().any():
    raise KeyError('df_valid contains a taskId or labelId with no entry in the lookup maps')

In [None]:
# Persist the enriched frames so later notebooks can skip the steps above.
# The public ``pickle`` module is used rather than the private ``_pickle``
# accelerator; it exposes the same dump/load functions.
import pickle as pkl

os.makedirs('./data', exist_ok=True)  # avoid failing when the folder is missing
with open('./data/df_train.dump.pkl', 'wb') as train_dump:
    pkl.dump(df_train, train_dump)
with open('./data/df_valid.dump.pkl', 'wb') as valid_dump:
    pkl.dump(df_valid, valid_dump)

# Task Distribution

In [None]:
# Bar charts of annotation counts per taskId: training on the left,
# validation on the right.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
panels = (
    (df_train, 'Task Distribution in Training Data'),
    (df_valid, 'Task Distribution in Validation Data'),
)
for ax, (frame, panel_title) in zip(axes, panels):
    frame.taskId.value_counts().sort_index().plot(kind='bar', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('taskId')
    ax.set_ylabel('#instances')
plt.show()

# Task Label Intersection

In [None]:
# Count, for every pair of tasks, how many label ids they have in common.
task_ids = df_train.taskId.unique().tolist()
# Collect each task's label set once; filtering the frame inside the nested
# loop would rescan it twice for every (i, j) pair.
labels_by_task = {
    task_id: set(df_train.loc[df_train.taskId == task_id, 'labelId'].unique())
    for task_id in task_ids
}
task_label_rows = [
    [len(labels_by_task[i] & labels_by_task[j]) for j in task_ids]
    for i in task_ids
]

In [None]:
# Label the matrix with full task names. A dedicated name is used because
# `tasks` already holds the object -> attribute -> taskId mapping; rebinding
# it to a list would break the earlier cell that calls tasks.keys() on re-run.
task_names = [id_to_task_dict[task_id] for task_id in task_ids]
df_task_label_intersection = pd.DataFrame(task_label_rows, index=task_names, columns=task_names)
plt.figure(figsize=(11, 10))
plt.title('Task Label Intersection')
sns.heatmap(df_task_label_intersection, annot=False)
plt.show()

# Task Label Distribution

In [None]:
# One figure per task id: label counts in training (left) and validation (right).
splits = (
    (df_train, 'Label Distribution in Training Data for Task: '),
    (df_valid, 'Label Distribution in Validation Data for Task: '),
)
for task_id in df_train.taskId.sort_values().unique().tolist():
    fig = plt.figure(figsize=(20, 2))
    for position, (frame, title_prefix) in enumerate(splits, start=1):
        ax = fig.add_subplot(1, 2, position)
        label_counts = frame[frame.taskId == task_id].labelId.value_counts().sort_index()
        label_counts.plot(kind='bar', ax=ax)
        ax.set_title(title_prefix + str(task_id) + ' (' + id_to_task_dict[task_id] + ')')
        ax.set_xlabel('labelId')
        ax.set_ylabel('#instances')
    plt.show()

# Image Dimension Distribution

In [None]:
# Record height, width and channel count of every distinct training image.
width = []
height = []
channels = []
for image_id in tqdm(df_train.imageId.unique().tolist()):
    # df_train keeps annotations whose "<imageId>.jpg" appears in the listing
    # of TRAIN_IMAGES_DIR, so a bare id must be resolved against that folder.
    # A value that already points at a readable file is used unchanged.
    image_path = image_id
    if not os.path.isfile(image_path):
        image_path = os.path.join(config.TRAIN_IMAGES_DIR, image_id + ".jpg")
    img = cv2.imread(image_path)
    if img is None:
        # imread signals an unreadable or missing file with None; skip it.
        continue
    rows, cols, depth = img.shape
    width.append(cols)
    height.append(rows)
    channels.append(depth)

In [None]:
# Record height, width and channel count of the resized copy of every
# distinct training image.
width_new = []
height_new = []
channels_new = []
for image_id in tqdm(df_train.imageId.unique().tolist()):
    # If imageId is a path under TRAIN_IMAGES_DIR, swapping the directory
    # prefix yields the resized file. On a bare id the replace is a no-op,
    # so fall back to "<RESIZED_TRAIN_DIR>/<imageId>.jpg".
    # NOTE(review): assumes resized copies keep the original file name - confirm.
    image_path = image_id.replace(config.TRAIN_IMAGES_DIR, config.RESIZED_TRAIN_DIR)
    if not os.path.isfile(image_path):
        image_path = os.path.join(
            config.RESIZED_TRAIN_DIR, os.path.basename(image_id) + ".jpg"
        )
    img = cv2.imread(image_path)
    if img is None:
        # imread signals an unreadable or missing file with None; skip it.
        continue
    rows, cols, depth = img.shape
    width_new.append(cols)
    height_new.append(rows)
    channels_new.append(depth)

In [None]:
# Three panels (height, width, channels); each overlays the original images
# in blue and the resized images in red, plotted against instance index.
plt.figure(figsize=(20, 5))
dimension_panels = (
    ("Height Distribution", 'Heights', height, height_new),
    ("Width Distribution", 'Width', width, width_new),
    ("Channel Distribution", 'Channels', channels, channels_new),
)
for position, (panel_title, y_label, original, resized) in enumerate(dimension_panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Instance #')
    original_x = list(range(len(original)))
    resized_x = list(range(len(resized)))
    plt.plot(original_x, original, 'b', resized_x, resized, 'r')
plt.show()

# Image Dimension Statistics

In [None]:
# Tabulate the collected dimensions: one frame for the original images and
# one for the resized images, with identical column names.
df_image_stats = pd.DataFrame(
    {'height': height, 'width': width, 'channel': channels}
)
df_resized_stats = pd.DataFrame(
    {'height': height_new, 'width': width_new, 'channel': channels_new}
)

In [None]:
# Print max/min/mean/median/mode of every dimension column, first for the
# original images and then for the resized ones.
cols = ['height', 'width', 'channel']
summaries = (
    ("Max", lambda series: series.max()),
    ("Min", lambda series: series.min()),
    ("Mean", lambda series: series.mean()),
    ("Median", lambda series: series.median()),
    ("Mode", lambda series: series.mode()[0]),  # first of possibly several modes
)
for df_name, stats_frame in (('Original', df_image_stats), ('Resized', df_resized_stats)):
    print(df_name)
    for column_name in cols:
        print("\t" + column_name)
        column = stats_frame[column_name]
        for stat_label, summarize in summaries:
            print("\t\t" + stat_label, summarize(column))

# Conclusions

- There are a total of 4 objects - **['dress', 'outerwear', 'pants', 'shoe']** over which the attributes are to be predicted.
- **Attributes intersect** between two given objects but the **task IDs assigned to these attributes are exclusive**. Because of this it would be possible to train a classifier at object level.
- Labels are practically exclusive (with **limited intersections**) since the values on-diagonal are much greater than those off-diagonal
- The **validation split is stratified**, as the distributions of **task IDs over the training and validation images** and of **labels over task IDs** are similar.
- Image dimensions suggest that **resizing** would be required, as the input to pretrained models in Keras is around **299×299**, and training on larger images would also require a higher system configuration.

# Model Recommendations

- Since the attributes are related to objects, **use pretrained Image-VGG model weights** to re-train the 4 class object classifier.
- Better to use **Xception Model** with the current system configuration and time constraints.
- Based on the reported accuracies **InceptionResNetV2** would be the ideal choice.
- As a next step, there has to be an **individual classifier for most of the tasks**.
- Keep an **N/A class for each classifier to assign negative samples**.
- Consider the final probability as the **conditional probability** of an attribute given the probability of an object
- **Note**: The image URLs can be used as a **leaky feature** for the attribute classification task.