In [1]:
import numpy as np
import pandas as pd

# Load data

In [4]:
# Read the space-separated annotation list. The first six lines are skipped
# (presumably the file's header comment -- confirm against list.txt) and the
# four columns are named explicitly because the file carries no header row.
tab = pd.read_csv(
    "./data/oxford/annotations/list.txt",
    sep=' ',
    skiprows=6,
    names=['ImageName', 'ClassID', 'Species', 'BreedID'],
)

In [5]:
# Preview the first five parsed rows as a sanity check on the column layout.
tab.head(n=5)

Unnamed: 0,ImageName,ClassID,Species,BreedID
0,Abyssinian_100,1,1,1
1,Abyssinian_101,1,1,1
2,Abyssinian_102,1,1,1
3,Abyssinian_103,1,1,1
4,Abyssinian_104,1,1,1


In [6]:
# Keep only the rows whose Species code is 1 (the cat rows, judging by the
# breed names printed further down). .copy() makes tab_cat an independent
# frame, so the BreedName column added in the next cell is not written into a
# slice of `tab` (which would trigger pandas' SettingWithCopyWarning).
tab_cat = tab.loc[tab['Species'] == 1, :].copy()

In [8]:
# The breed name is everything before the last underscore of the image name
# (e.g. "British_Shorthair_10" -> "British_Shorthair"); a name without any
# underscore maps to the empty string.
tab_cat.loc[:, 'BreedName'] = [
    image_name.rpartition('_')[0] for image_name in tab_cat['ImageName']
]

In [9]:
# List the distinct breed names, in order of first appearance.
tab_cat.loc[:, 'BreedName'].unique()

array(['Abyssinian', 'Bengal', 'Birman', 'Bombay', 'British_Shorthair',
       'Egyptian_Mau', 'Maine_Coon', 'Persian', 'Ragdoll', 'Russian_Blue',
       'Siamese', 'Sphynx'], dtype=object)

# Generate condition matrix

In [10]:
# Distinct breed names in order of first appearance; this order becomes the
# column order of the one-hot condition matrix built below.
breed_list = tab_cat.loc[:, 'BreedName'].unique()

In [13]:
# Build the condition matrix: one row per cat image holding its identifiers
# (ImageName, BreedID, BreedName) followed by one 0/1 indicator column per
# breed, in the order given by `breed_list`.
#
# This is done in a single vectorised step instead of creating one single-row
# DataFrame per image and concatenating them all: it is far faster, gives the
# result a unique 0..n-1 index (previously every row was labelled 0), and also
# works when there are no rows at all.
breed_names = tab_cat['BreedName'].to_numpy(dtype=object)
breed_columns = np.asarray(breed_list, dtype=object)

# Broadcasting (n_images, 1) against (1, n_breeds) yields a boolean matrix
# that is True exactly where the row's breed equals the column's breed.
one_hot = pd.DataFrame(
    (breed_names[:, None] == breed_columns[None, :]).astype(np.int64),
    columns=breed_columns,
)

# reset_index so the identifier columns line up positionally with `one_hot`.
tab_cat_2 = pd.concat(
    [
        tab_cat[['ImageName', 'BreedID', 'BreedName']].reset_index(drop=True),
        one_hot,
    ],
    axis=1,
)

In [14]:
# Preview the first five rows of the condition matrix.
tab_cat_2.head(n=5)

Unnamed: 0,ImageName,BreedID,BreedName,Abyssinian,Bengal,Birman,Bombay,British_Shorthair,Egyptian_Mau,Maine_Coon,Persian,Ragdoll,Russian_Blue,Siamese,Sphynx
0,Abyssinian_100,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_101,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_102,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_103,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0
0,Abyssinian_104,1,Abyssinian,1,0,0,0,0,0,0,0,0,0,0,0


# Split into training and test data

In [15]:
from sklearn.model_selection import StratifiedKFold

In [20]:
# Stratified 10-fold splitter with a fixed seed so the split is reproducible.
kcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# StratifiedKFold only needs the labels to build its folds, so X is a
# zero-filled placeholder with one row per sample.
pseudoX = np.zeros((tab_cat_2.shape[0], 2))
# Zero-based class labels derived from the BreedID column.
y = tab_cat_2['BreedID'].values - 1

# Take only the first fold: roughly 90% of the rows for training and 10% for
# testing, with per-breed proportions preserved. Use the builtin next() rather
# than calling the generator's __next__ method directly.
idx_train, idx_test = next(kcv.split(pseudoX, y))

In [21]:
# Materialise the two splits by positional row selection (the fold indices
# are positions, not index labels).
tab_cat_train, tab_cat_test = (
    tab_cat_2.iloc[rows] for rows in (idx_train, idx_test)
)

In [22]:
# Images per breed in the training split, to check the stratification.
tab_cat_train.loc[:, 'BreedName'].value_counts()

Ragdoll              180
Persian              180
Sphynx               180
Maine_Coon           180
Russian_Blue         180
British_Shorthair    180
Bengal               180
Birman               180
Siamese              179
Abyssinian           178
Egyptian_Mau         171
Bombay               165
Name: BreedName, dtype: int64

In [23]:
# Images per breed in the test split, to check the stratification.
tab_cat_test.loc[:, 'BreedName'].value_counts()

Maine_Coon           20
Ragdoll              20
Persian              20
Abyssinian           20
British_Shorthair    20
Birman               20
Sphynx               20
Russian_Blue         20
Bengal               20
Siamese              20
Egyptian_Mau         19
Bombay               19
Name: BreedName, dtype: int64

In [24]:
# Write both splits as CSV files in the annotations directory; the row index
# is not written.
for split_frame, csv_path in (
    (tab_cat_train, "./data/oxford/annotations/cond_tab_training.csv"),
    (tab_cat_test, "./data/oxford/annotations/cond_tab_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)