In [2]:
import numpy as np
import pandas as pd

# Load data

In [3]:
tab = pd.read_csv("./data/oxford/annotations/list.txt", sep=' ', skiprows=[0, 1, 2, 3, 4, 5], names=['ImageName', 'ClassID', 'Species', 'BreedID'])

In [4]:
tab.head()

Unnamed: 0,ImageName,ClassID,Species,BreedID
0,Abyssinian_100,1,1,1
1,Abyssinian_101,1,1,1
2,Abyssinian_102,1,1,1
3,Abyssinian_103,1,1,1
4,Abyssinian_104,1,1,1


In [5]:
tab_cat = tab.loc[tab['Species'] == 1, :]

In [6]:
tab_cat.loc[:, 'BreedName'] = ['_'.join(imn.split('_')[:-1]) for imn in tab_cat['ImageName']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:
tab_cat['BreedName'].unique()

array(['Abyssinian', 'Bengal', 'Birman', 'Bombay', 'British_Shorthair',
       'Egyptian_Mau', 'Maine_Coon', 'Persian', 'Ragdoll', 'Russian_Blue',
       'Siamese', 'Sphynx'], dtype=object)

# Generate condition matrix

In [8]:
breed_list = tab_cat['BreedName'].unique()

In [9]:
df_list = []

for imn, breed_id, breed0 in zip(tab_cat['ImageName'].values, tab_cat['BreedID'].values, tab_cat['BreedName'].values):
    df_dict = {'ImageName' : [imn], 'BreedID' : [breed_id], 'BreedName' : [breed0]}
    
    for breed in breed_list:
        digit = 1 if breed == breed0 else 0
        df_dict.update({breed : [digit]})
    df = pd.DataFrame(df_dict)
    df_list.append(df)

tab_cat_2 = pd.concat(df_list)

In [10]:
tab_cat_2.head()

Unnamed: 0,ImageName,BreedID,BreedName,Abyssinian,Bengal,Birman,Bombay,British_Shorthair,Egyptian_Mau,Maine_Coon,Persian,Ragdoll,Russian_Blue,Siamese,Sphynx
0,Abyssinian_100,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_101,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_102,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_103,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_104,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0


# Correspond to images cropped by YOLOv3

In [11]:
import os

In [12]:
df_list = []

for img in tab_cat_2['ImageName'].values:
    df = tab_cat_2.loc[tab_cat_2['ImageName'] == img, :]
    
    crop_img_path = "./data/oxford/cropped_cat/{0:s}_detected_0_cat.png".format(img)
    
    if os.path.exists(crop_img_path):
        df.loc[:, 'ImagePath'] = crop_img_path
        df_list.append(df)

tab_cat_3 = pd.concat(df_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


# Split to training and test data

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
kcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

pseudoX = np.zeros((tab_cat_3.shape[0], 2))
y = tab_cat_3['BreedID'].values - 1

idx_train, idx_test = kcv.split(pseudoX, y).__next__()

In [16]:
tab_cat_train = tab_cat_3.iloc[idx_train, :]
tab_cat_test = tab_cat_3.iloc[idx_test, :]

In [17]:
tab_cat_train['BreedName'].value_counts()

Russian_Blue    179
Bengal          179
Ragdoll         175
Siamese         174
Abyssinian      173
Maine_Coon      173
Birman          169
Bombay          161
Persian         146
Sphynx          111
Name: BreedName, dtype: int64

In [18]:
tab_cat_test['BreedName'].value_counts()

Bengal          20
Ragdoll         20
Russian_Blue    20
Abyssinian      20
Siamese         20
Maine_Coon      20
Birman          19
Bombay          18
Persian         17
Sphynx          13
Name: BreedName, dtype: int64

In [19]:
tab_cat_train.to_csv("./data/oxford/annotations/cond_tab_training.csv", index=False)
tab_cat_test.to_csv("./data/oxford/annotations/cond_tab_test.csv", index=False)