In [33]:
#import necessary libraries 
import pandas as pd 
import os
import cv2 as cv
import numpy as np

In [34]:
#load attributes csv in pandas dataframe
attributes = pd.read_csv('attributes.csv')

In [35]:
attributes

Unnamed: 0,filename,neck,sleeve_length,pattern
0,cdc8cd2a-0938-4970-a3b5-f5ed9595222c1527925869...,6.0,,4.0
1,11469770662809-Metersbonwe-Navy-T-shirt-485146...,5.0,3.0,9.0
2,11479107741104-Tommy-Hilfiger-Men-Navy-Blue-St...,6.0,1.0,9.0
3,f7ad67ab-eeb1-4449-8f63-7b580d2797e71532342804...,,0.0,9.0
4,11516770810185-Splash-Men-Tshirts-767151677081...,6.0,3.0,9.0
...,...,...,...,...
2233,11507360109998-Nautica-Men-Blue-Striped-V-Neck...,5.0,3.0,9.0
2234,8f236dcd-4a2b-49ff-9229-981e7db946f91537273774...,6.0,,5.0
2235,15cb873b-993c-422c-b9c1-96d59580fbae1535360301...,6.0,3.0,3.0
2236,bf72a615-0db6-4400-903b-6c2aa1e5831d1530608800...,6.0,3.0,1.0


In [36]:
#check for number of classes in each attributes
attributes.max()

filename         ffb8b81b-e9a3-44d7-ac7a-1699d43be4f11530878498...
neck                                                             6
sleeve_length                                                    3
pattern                                                          9
dtype: object

In [37]:
#check for NaN values
attributes.isnull().sum()

filename           0
neck             455
sleeve_length    452
pattern          447
dtype: int64

In [38]:
#since a large number of NaN values are present hence it is not possible drop them, also can't fill with most frequent class
#as that will increase bias against minority classes, randomly fill NaN values with value(except majority class to decrease bias)
attributes['neck'].fillna(value = np.random.randint(0,5), inplace = True)
attributes['sleeve_length'].fillna(value = np.random.randint(0,2), inplace = True)
attributes['pattern'].fillna(value = np.random.randint(0,8), inplace =True)

In [39]:
attributes.isna().sum()

filename         0
neck             0
sleeve_length    0
pattern          0
dtype: int64

In [40]:
#number of sample of each class for each attribute
attributes['neck'].value_counts(), attributes['sleeve_length'].value_counts(), attributes['pattern'].value_counts()

(6.0    1016
 1.0     591
 4.0     134
 2.0     132
 0.0     125
 5.0     125
 3.0     115
 Name: neck, dtype: int64,
 3.0    1369
 1.0     592
 2.0     148
 0.0     129
 Name: sleeve_length, dtype: int64,
 9.0    1467
 1.0     484
 6.0      52
 3.0      46
 5.0      38
 4.0      37
 7.0      32
 8.0      32
 2.0      29
 0.0      21
 Name: pattern, dtype: int64)

In [41]:
#first 2 classes  in each attribute are creating bias, drop rows with any one of these values for each attribute
#to reduce bias
for index in attributes.index:
    if (attributes['neck'][index] == 6.0 or attributes['neck'][index] == 1.0)  and (attributes['sleeve_length'][index] == 3.0 or attributes['sleeve_length'][index] == 1.0) and (attributes['pattern'][index] == 9.0 or attributes['pattern'][index] == 1.0):
        attributes.drop(index, axis = 0, inplace = True)

In [42]:
attributes['neck'].value_counts(), attributes['sleeve_length'].value_counts(), attributes['pattern'].value_counts()

(6.0    420
 1.0    144
 4.0    134
 2.0    132
 0.0    125
 5.0    125
 3.0    115
 Name: neck, dtype: int64,
 3.0    720
 1.0    198
 2.0    148
 0.0    129
 Name: sleeve_length, dtype: int64,
 9.0    722
 1.0    186
 6.0     52
 3.0     46
 5.0     38
 4.0     37
 7.0     32
 8.0     32
 2.0     29
 0.0     21
 Name: pattern, dtype: int64)

In [43]:
#1195 rows left out of which, images aren't available for all
attributes

Unnamed: 0,filename,neck,sleeve_length,pattern
0,cdc8cd2a-0938-4970-a3b5-f5ed9595222c1527925869...,6.0,1.0,4.0
1,11469770662809-Metersbonwe-Navy-T-shirt-485146...,5.0,3.0,9.0
3,f7ad67ab-eeb1-4449-8f63-7b580d2797e71532342804...,1.0,0.0,9.0
6,11519111005679-United-Colors-of-Benetton-Men-B...,5.0,3.0,9.0
7,11461827372049-US-Polo-Assn-Navy-T-shirt-20714...,5.0,3.0,9.0
...,...,...,...,...
2232,11495782729779-Roadster-Men-Black-Printed-V-Ne...,5.0,3.0,9.0
2233,11507360109998-Nautica-Men-Blue-Striped-V-Neck...,5.0,3.0,9.0
2234,8f236dcd-4a2b-49ff-9229-981e7db946f91537273774...,6.0,1.0,5.0
2235,15cb873b-993c-422c-b9c1-96d59580fbae1535360301...,6.0,3.0,3.0


In [44]:
#split into train, test, val
x_train = []
x_test = []
x_val =[]
train_attributes = pd.DataFrame(columns = attributes.columns)
test_attributes = pd.DataFrame(columns = attributes.columns)
val_attributes = pd.DataFrame(columns = attributes.columns)

In [45]:
#split into individual folders of train test and val, and CSVs of train test and val
train_count = 1
test_count = 1
val_count = 1
path_train = './split_data/train'
path_val = './split_data/val'
path_test = './split_data/test'
for img in os.listdir('./images'):
    img_ = cv.imread(os.path.join('./images', img), 1)
    temp = attributes[attributes['filename'] == img].index
    if len(temp) == 0:
        continue
    if len(temp) > 1:
        temp = temp[0:1]
    temp1 = attributes.loc[temp]

#save 100 images for validation
    if val_count<=100:
        x_val.append(img_)
        val_attributes = val_attributes.append(temp1, ignore_index = True)
        cv.imwrite(os.path.join(path_val, img+'.jpg'), img_)
        val_count = val_count + 1
#save 100 images for testing
    elif test_count<=100:
        x_test.append(img_)
        test_attributes = test_attributes.append(temp1, ignore_index = True)
        cv.imwrite(os.path.join(path_test, img + '.jpg'), img_)
        test_count = test_count + 1
#save rest for training
    else:
        x_train.append(img_)
        train_attributes = train_attributes.append(temp1, ignore_index = True)
        cv.imwrite(os.path.join(path_train, img+'.jpg'), img_)
        train_count = train_count + 1

In [46]:
x_train = np.array(x_train)
x_test = np.array(x_test)
x_val = np.array(x_val)

In [47]:
x_train.shape, x_test.shape, x_val.shape

((748, 300, 225, 3), (100, 300, 225, 3), (100, 300, 225, 3))

In [48]:
train_attributes['neck'].value_counts(), train_attributes['sleeve_length'].value_counts(),train_attributes['pattern'].value_counts()

(6.0    284
 1.0     89
 4.0     82
 2.0     76
 5.0     74
 0.0     72
 3.0     71
 Name: neck, dtype: int64,
 3.0    452
 1.0    125
 2.0     97
 0.0     74
 Name: sleeve_length, dtype: int64,
 9.0    432
 1.0    114
 6.0     35
 3.0     35
 5.0     31
 4.0     28
 7.0     21
 8.0     20
 2.0     17
 0.0     15
 Name: pattern, dtype: int64)

In [49]:
#save train, test, val split attributes csv
train_attributes.to_csv('./split_data/'+'train_attributes.csv', index = False)
test_attributes.to_csv('./split_data/'+'test_attributes.csv', index = False)
val_attributes.to_csv('./split_data/'+'val_attributes.csv', index = False)