In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Size of each split = number of directory entries under ../input/<split>.
# os.listdir returns every entry, so this assumes the folders hold only images.
test_dir_entries = os.listdir("../input/test")
train_dir_entries = os.listdir("../input/train")

num_test_images = len(test_dir_entries)
num_train_images = len(train_dir_entries)

test_summary = "Number of images in test set: {}".format(num_test_images)
train_summary = "Number of images in train set: {}".format(num_train_images)
print(test_summary)
print(train_summary)

Number of images in test set: 3200
Number of images in train set: 45625


In [3]:
# Load the annotation table (one row per annotated mask).
# ClassId is pinned to str because later cells call str.split("_") on every
# value. With dtype inference, chunked parsing can turn all-integer chunks
# into ints and leave the column with mixed int/str values, which would make
# those .split calls fail.
train_df = pd.read_csv("../input/train.csv", dtype={"ClassId": str})
train_df.head()

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32


In [5]:
# Mean number of annotation rows per training image file.
annotation_row_total = train_df.shape[0]
avg_class_per_image = np.round(annotation_row_total / num_train_images, 2)
print("Average number of classes per image: {}".format(avg_class_per_image))

# Sanity check: the number of distinct ImageIds in the CSV must equal the
# number of files found in the train directory.
rows_per_image = train_df["ImageId"].value_counts()
assert len(rows_per_image) == num_train_images
print("Every image has at least 1 class")

Average number of classes per image: 7.31
Every image has at least 1 class


In [8]:
# Derive two per-row columns from ClassId by splitting on "_":
#   fine_grained -> True when the id has more than one "_"-separated token
#   main_class   -> the first token
# Vectorized .str accessors replace the per-row lambdas. The column is coerced
# to str first so ids parsed as integers are handled the same as string ids.
# Missing ids are rejected explicitly; they would otherwise become the
# literal string "nan" after the cast.
assert train_df["ClassId"].notna().all(), "ClassId contains missing values"
class_id_str = train_df["ClassId"].astype(str)

# regex=False: "_" is matched as a literal character.
train_df["fine_grained"] = class_id_str.str.contains("_", regex=False)
# Only the first token is needed, so stop splitting after one separator.
train_df["main_class"] = class_id_str.str.split("_", n=1).str[0]

fine_grained_obj_perc = np.round(train_df["fine_grained"].mean()*100, 1)
print("{}% of the objects are fine-grained.".format(fine_grained_obj_perc))

3.5% of the objects are fine-grained.


In [9]:
# Share of images that contain at least one fine-grained row.
fine_grained_rows_per_image = train_df.groupby("ImageId")["fine_grained"].sum()
image_has_fine_grained = fine_grained_rows_per_image > 0
fine_grained_img_perc = np.round(image_has_fine_grained.mean()*100, 1)

print("{}% of the images have at least one fine-grained object.".format(fine_grained_img_perc))

14.7% of the images have at least one fine-grained object.


In [10]:
# One row per main class:
#   fine_grained -> fraction of that class's rows flagged fine-grained
#   img_count    -> number of non-null ImageId entries for the class, i.e. a
#                   count of annotation rows, not of unique images
rows_by_main_class = train_df.groupby("main_class")
class_df = (
    rows_by_main_class
    .agg({"fine_grained": "mean", "ImageId": "count"})
    .rename(columns={"ImageId": "img_count"})
    .reset_index()
)
print("Number of classes: {}".format(len(class_df)))

Number of classes: 46


In [11]:
# A class is "never fine-grained" when its fine-grained fraction is exactly 0.
never_fine_grained_mask = class_df["fine_grained"] == 0
print("{} of the classes are never fine-grained.".format(never_fine_grained_mask.sum()))

30 of the classes are never fine-grained.


In [13]:
# Percentage of all annotation rows that belong to classes whose fine-grained
# fraction is exactly 0.
plain_class_mask = class_df["fine_grained"] == 0
plain_class_row_total = class_df.loc[plain_class_mask, "img_count"].sum()
perc = np.round(100 * plain_class_row_total / len(train_df), 1)
print("{}% of the objects are from non-fine-grained classes.".format(perc))

63.2% of the objects are from non-fine-grained classes.
