In [1]:
import os
import shutil
import pandas as pd

from sklearn.model_selection import StratifiedKFold

In [2]:
# Load the labelled training manifest and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=5)

Unnamed: 0,Image,Class
0,image3476.jpg,Miscellaneous
1,image5198.jpg,Candle
2,image4183.jpg,Snowman
3,image1806.jpg,Miscellaneous
4,image7831.jpg,Miscellaneous


In [3]:
# Number of images per class, most frequent class first.
train_df['Class'].value_counts(sort=True, ascending=False)

Miscellaneous     2801
Christmas_Tree    1539
Jacket             640
Candle             593
Airplane           535
Snowman            361
Name: Class, dtype: int64

In [4]:
# One row per distinct label, in order of first appearance in train_df.
classes = pd.DataFrame({'class_of_image': train_df['Class'].unique()})
classes

Unnamed: 0,class_of_image
0,Miscellaneous
1,Candle
2,Snowman
3,Airplane
4,Christmas_Tree
5,Jacket


In [5]:
# Add a 'kfold' column filled with the placeholder -1; a later cell
# overwrites it with the real fold number of each row.
train_df.loc[:, 'kfold'] = -1
train_df.head(n=5)

Unnamed: 0,Image,Class,kfold
0,image3476.jpg,Miscellaneous,-1
1,image5198.jpg,Candle,-1
2,image4183.jpg,Snowman,-1
3,image1806.jpg,Miscellaneous,-1
4,image7831.jpg,Miscellaneous,-1


In [6]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# random_state pins the permutation so that a fresh "Restart & Run All"
# always produces the same fold assignment. Without a seed, each run sends
# different images to the Training/ and Validation/ directories, and a
# re-run on top of existing directories can leave the same image in both.
# NOTE: the cell rebinds train_df to a shuffled copy of itself, so running
# it more than once in the same kernel session reshuffles the data again.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df.tail()

Unnamed: 0,Image,Class,kfold
6464,image8954.jpg,Miscellaneous,-1
6465,image8969.jpg,Miscellaneous,-1
6466,image1390.jpg,Miscellaneous,-1
6467,image3437.jpg,Miscellaneous,-1
6468,image5742.jpg,Christmas_Tree,-1


In [7]:
# Labels used to stratify the folds.
y = train_df.loc[:, 'Class']

In [8]:
# Five stratified folds. Shuffling inside the splitter stays off (the
# default); the rows were already shuffled with DataFrame.sample above.
kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

In [9]:
# Tag every row with the number of the fold in which it is held out.
# Only the second (held-out) index array of each split is used; the first
# one is bound to a throwaway name.
for fold_id, (_train_idx, holdout_idx) in enumerate(kf.split(train_df, y)):
    train_df.loc[holdout_idx, 'kfold'] = fold_id
train_df.head(n=5)

Unnamed: 0,Image,Class,kfold
0,image2779.jpg,Miscellaneous,0
1,image4970.jpg,Christmas_Tree,0
2,image4351.jpg,Christmas_Tree,0
3,image8563.jpg,Miscellaneous,0
4,image8553.jpg,Christmas_Tree,0


In [10]:
# Fold 4 becomes the validation set; folds 0-3 together form the training set.
# Both frames get a fresh 0..n-1 index.
train = train_df.loc[train_df['kfold'].ne(4)].reset_index(drop=True)
valid = train_df.loc[train_df['kfold'].eq(4)].reset_index(drop=True)

In [11]:
# Preview the first rows of the training split.
train.head(n=5)

Unnamed: 0,Image,Class,kfold
0,image2779.jpg,Miscellaneous,0
1,image4970.jpg,Christmas_Tree,0
2,image4351.jpg,Christmas_Tree,0
3,image8563.jpg,Miscellaneous,0
4,image8553.jpg,Christmas_Tree,0


In [12]:
# Preview the first rows of the validation split.
valid.head(n=5)

Unnamed: 0,Image,Class,kfold
0,image3589.jpg,Snowman,4
1,image5114.jpg,Snowman,4
2,image6093.jpg,Jacket,4
3,image4791.jpg,Snowman,4
4,image5092.jpg,Snowman,4


In [13]:
# Per-class image counts in the validation split, most frequent first.
valid['Class'].value_counts(sort=True, ascending=False)

Miscellaneous     560
Christmas_Tree    308
Jacket            128
Candle            118
Airplane          107
Snowman            72
Name: Class, dtype: int64

In [14]:
# Create one sub-directory per class under Training/.
# exist_ok=True means directories that already exist are left as they are.
for class_name in classes['class_of_image']:
    os.makedirs(f'Training/{class_name}', exist_ok=True)

In [15]:
# Create one sub-directory per class under Validation/.
# exist_ok=True means directories that already exist are left as they are.
for class_name in classes['class_of_image']:
    os.makedirs(f'Validation/{class_name}', exist_ok=True)

In [16]:
# Copy every training image from train/ into the Training/<class>/ directory
# that matches its label.
# The two columns are iterated in lockstep instead of calling
# train.loc[i][...] twice per row, which built a full row Series on each
# lookup. Source and destination paths are the same strings as before; the
# trailing '/' keeps the destination an explicit directory.
for img, cls in zip(train['Image'], train['Class']):
    shutil.copy(f'train/{img}', f'Training/{cls}/')

In [17]:
# Copy every validation image from train/ into the Validation/<class>/
# directory that matches its label.
# The two columns are iterated in lockstep instead of calling
# valid.loc[i][...] twice per row, which built a full row Series on each
# lookup. Source and destination paths are the same strings as before; the
# trailing '/' keeps the destination an explicit directory.
for img, cls in zip(valid['Image'], valid['Class']):
    shutil.copy(f'train/{img}', f'Validation/{cls}/')

In [20]:
# Collect the names of all entries in test/.
# os.listdir returns entries in arbitrary order, so the list is sorted to
# make the row order of the test manifest identical on every run and platform.
test_img = sorted(os.listdir(path='test/'))

In [21]:
# Single-column frame holding the test image file names.
test_df = pd.DataFrame(data=test_img, columns=['Image'])

In [22]:
# Preview the first rows of the test manifest.
test_df.head(n=5)

Unnamed: 0,Image
0,image10.jpg
1,image100.jpg
2,image1013.jpg
3,image1014.jpg
4,image1018.jpg


In [24]:
# Write the test manifest to test.csv without the index column.
test_df.to_csv(path_or_buf='test.csv', index=False)