## Goal:
- Sample 20K images from Training set
- Sample 2K images from Test set 

## Problems: 
1. filenames in train set start from 0 to 109998
2. filenames in the test set start from 10 to 109999

In [1]:
#!pip install pillow

In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image

In [3]:
labels_train = pd.read_csv('./labelsHonda100k_train.csv', header = None)
labels_test = pd.read_csv('./labelsHonda100k_val.csv', header = None)

print(labels_train.head())
print(labels_test.head())

       0   1   2         3
0  0.jpg NaN NaN -0.252324
1  1.jpg NaN NaN -0.252324
2  2.jpg NaN NaN -0.205843
3  3.jpg NaN NaN -0.146082
4  4.jpg NaN NaN -0.139442
        0   1   2         3
0  10.jpg NaN NaN -0.139442
1  21.jpg NaN NaN  0.132802
2  32.jpg NaN NaN -0.212483
3  43.jpg NaN NaN -0.159363
4  54.jpg NaN NaN -0.006640


In [4]:
columns = [0,3]
data_train = labels_train[columns]
data_test = labels_test[columns]

print(data_train.head())
print(data_test.head())

       0         3
0  0.jpg -0.252324
1  1.jpg -0.252324
2  2.jpg -0.205843
3  3.jpg -0.146082
4  4.jpg -0.139442
        0         3
0  10.jpg -0.139442
1  21.jpg  0.132802
2  32.jpg -0.212483
3  43.jpg -0.159363
4  54.jpg -0.006640


In [5]:
#There are some missing image names
#Randomly sample 20,000 for train and 2,000 from val

train_size = 10000
test_size = 1000

permutations_train = np.random.randint(0, len(data_train), train_size)
permutations_test = np.random.randint(0, len(data_test), train_size)

print(f"{permutations_train}\n{permutations_test}")

dir_path = './small_data_honda'
if not os.path.exists(dir_path):
    os.mkdir(dir_path)

[95534  5169 82830 ... 96256 64551 26469]
[9565 1283 3070 ... 4470 9616  363]


In [6]:
# First get the columns, then based on that get the files
#train_images = []
train_labels = []

train_path = './small_data_honda/train_images'
if not os.path.exists(train_path):
    os.mkdir(train_path)
    

for i in range(len(permutations_train)):
    #print(i)
    name = data_train[0][permutations_train[i]]
    img = Image.open(f'./trainHonda100k/{name}')
    img.save(f'./small_data_honda/train_images/{i}.jpg')
    train_labels.append([f"{i}.jpg", data_train[3][permutations_train[i]]])
    
train_labels = np.array(train_labels)
print(train_labels.shape)

(10000, 2)


In [7]:
test_images = []
test_labels = []

test_path = './small_data_honda/test_images'
if not os.path.exists(test_path):
    os.mkdir(test_path)
    
for i in permutations_test:
    #print(i)
    name = data_test[0][permutations_test[i]]
    img = Image.open(f'./valHonda100k/{name}')
    img.save(f'./small_data_honda/test_images/{i}.jpg')
    test_labels.append([f"{i}.jpg", data_test[3][permutations_test[i]]])
    
test_labels = np.array(test_labels)
print(test_labels.shape)

(10000, 2)


In [9]:
np.save("./small_data_honda/train_labels.npy", train_labels)
np.save("./small_data_honda/test_labels.npy", test_labels)