# 데이터 추출하기

### 전체 train 이미지 데이터에서 train용 이미지 50000개와 test용 이미지 10000개 추출 (image_id별 질문 수가 많은 순)

In [1]:
import pandas as pd

# Load the train question/answer table and preview its first five rows.
text_train = pd.read_csv('./data/train.csv')
text_train.head(n=5)

Unnamed: 0,ID,image_id,question,answer
0,TRAIN_000000,train_000000,Is this in the wild or zoo?,zoo
1,TRAIN_000001,train_000001,Is this car a recent model?,yes
2,TRAIN_000002,train_000002,Is the man being safe?,yes
3,TRAIN_000003,train_000003,What are the walls made from?,drywall
4,TRAIN_000004,train_000004,How many players are sitting on the bench?,8


In [2]:
# Count the rows (questions) per image once. value_counts() orders the
# image_ids from most to fewest rows, and both subsets are sliced from this
# single ranking instead of recomputing it for each slice.
image_counts = text_train.image_id.value_counts()

# .iloc makes the positional slicing explicit: the first 50000 image_ids form
# the train subset and the next 10000 form the test subset, so the two lists
# never share an image.
train_path_50000 = image_counts.iloc[:50000].index.to_list()
test_path_10000 = image_counts.iloc[50000:60000].index.to_list()

### train 이미지 50000개 / test 이미지 10000개 폴더 생성 및 이미지 파일 복사

In [None]:
import shutil
import os
from tqdm import tqdm

# Train folder: copy the 50000 selected train images into their own directory.
src = './data/image/train/'
dst = './data/image/train_50000/'

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check beforehand.
os.makedirs(dst, exist_ok=True)

# Each image is stored as '<image_id>.jpg' in src and keeps that name in dst.
for image in tqdm(train_path_50000):
    filename = image + '.jpg'
    source = os.path.join(src, filename)
    destination = os.path.join(dst, filename)

    shutil.copyfile(source, destination)

In [None]:
# Test folder: the test subset is carved out of the train data, so its images
# are copied from the train image directory as well.
src = './data/image/train/'
dst = './data/image/test_10000/'

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check beforehand.
os.makedirs(dst, exist_ok=True)

# Each image is stored as '<image_id>.jpg' in src and keeps that name in dst.
for image in tqdm(test_path_10000):
    filename = image + '.jpg'
    source = os.path.join(src, filename)
    destination = os.path.join(dst, filename)

    shutil.copyfile(source, destination)

### CSV 파일 추출

In [4]:
# Export the train text rows: every row of text_train whose image_id is in
# the 50000-image train subset, with the index renumbered from 0.
in_train_50000 = text_train['image_id'].isin(train_path_50000)
train_50000_text = text_train.loc[in_train_50000].reset_index(drop=True)
train_50000_text.to_csv('./data/train_50000.csv', index=False)
train_50000_text.head()

Unnamed: 0,ID,image_id,question,answer
0,TRAIN_000000,train_000000,Is this in the wild or zoo?,zoo
1,TRAIN_000001,train_000001,Is this car a recent model?,yes
2,TRAIN_000002,train_000002,Is the man being safe?,yes
3,TRAIN_000003,train_000003,What are the walls made from?,drywall
4,TRAIN_000004,train_000004,How many players are sitting on the bench?,8


In [7]:
# Rows of text_train whose image_id is in the 10000-image test subset, with
# the index renumbered from 0.
in_test_10000 = text_train['image_id'].isin(test_path_10000)
test_10000_text = text_train.loc[in_test_10000].reset_index(drop=True)
# NOTE(review): the CSV export is disabled here, while the other subsets in
# this notebook are written to disk. Confirm whether ./data/test_10000.csv
# should be (re)written before relying on it.
# test_10000_text.to_csv('./data/test_10000.csv', index=False)
test_10000_text.tail()

Unnamed: 0,ID,image_id,question,answer
25625,TRAIN_359439,train_071559,Is the scene set in South America?,no
25626,TRAIN_359444,train_015134,Is the airplane passenger over land or sea?,sea
25627,TRAIN_359454,train_105545,What is the cat doing?,staring
25628,TRAIN_359467,train_106322,What are they standing by?,fire hydrant
25629,TRAIN_359468,train_105397,Does this giraffe live in the savanna?,yes


---

# 데이터 추출하기 2

### yes/no 답변 이미지 대상 train 10000개 & test 2000개 추출

In [8]:
# Keep only the rows answered exactly 'yes' or 'no' and count them per image
# once. Both subsets are sliced from this single ranking (most rows first)
# instead of repeating the filter and the count for each slice.
yes_no_counts = text_train[text_train.answer.isin(['yes', 'no'])].image_id.value_counts()

# .iloc makes the positional slicing explicit: the first 10000 image_ids form
# the train subset and the next 2000 form the test subset, so the two lists
# never share an image.
train_path_10000 = yes_no_counts.iloc[:10000].index.to_list()
test_path_2000 = yes_no_counts.iloc[10000:12000].index.to_list()

### Train 이미지 10000개 / Test 이미지 2000개 폴더 생성 및 파일 복사

In [None]:
# Train folder 2: copy the 10000 selected yes/no train images into their own
# directory.
src = './data/image/train/'
dst = './data/image/train_10000/'

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check beforehand.
os.makedirs(dst, exist_ok=True)

# Each image is stored as '<image_id>.jpg' in src and keeps that name in dst.
for image in tqdm(train_path_10000):
    filename = image + '.jpg'
    source = os.path.join(src, filename)
    destination = os.path.join(dst, filename)

    shutil.copyfile(source, destination)

In [None]:
# Test folder 2: the yes/no test subset is carved out of the train data, so
# its images are copied from the train image directory as well.
src = './data/image/train/'
dst = './data/image/test_2000/'

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check beforehand.
os.makedirs(dst, exist_ok=True)

# Each image is stored as '<image_id>.jpg' in src and keeps that name in dst.
for image in tqdm(test_path_2000):
    filename = image + '.jpg'
    source = os.path.join(src, filename)
    destination = os.path.join(dst, filename)

    shutil.copyfile(source, destination)

### CSV 파일 추출

In [9]:
# Keep the rows that are both answered 'yes'/'no' and attached to an image of
# the 10000-image train subset, renumber the index from 0, and export them.
is_yes_no = text_train['answer'].isin(['yes', 'no'])
in_train_10000 = text_train['image_id'].isin(train_path_10000)
train_10000_text = text_train.loc[is_yes_no & in_train_10000].reset_index(drop=True)
train_10000_text.to_csv('./data/train_10000.csv', index=False)
train_10000_text

Unnamed: 0,ID,image_id,question,answer
0,TRAIN_000001,train_000001,Is this car a recent model?,yes
1,TRAIN_000002,train_000002,Is the man being safe?,yes
2,TRAIN_000012,train_000012,Is the suitcase closed?,no
3,TRAIN_000022,train_000022,Are any cars visible?,no
4,TRAIN_000040,train_000040,Is it nighttime?,yes
...,...,...,...,...
46377,TRAIN_359495,train_001974,Is a skateboarder wearing striped socks?,no
46378,TRAIN_359506,train_020730,Is there a bench in this photo?,yes
46379,TRAIN_359515,train_025786,Is this a sunny day?,yes
46380,TRAIN_359516,train_051740,Are those cupcakes?,no


In [10]:
# Keep the rows that are both answered 'yes'/'no' and attached to an image of
# the 2000-image test subset, renumber the index from 0, and export them.
is_yes_no = text_train['answer'].isin(['yes', 'no'])
in_test_2000 = text_train['image_id'].isin(test_path_2000)
test_2000_text = text_train.loc[is_yes_no & in_test_2000].reset_index(drop=True)
test_2000_text.to_csv('./data/test_2000.csv', index=False)
test_2000_text

Unnamed: 0,ID,image_id,question,answer
0,TRAIN_000125,train_000125,Are the cars in motion?,yes
1,TRAIN_000175,train_000175,Is the person smiling?,no
2,TRAIN_000178,train_000178,Is this creature curious about the thing looki...,yes
3,TRAIN_000195,train_000195,Is the screen on?,yes
4,TRAIN_000224,train_000223,Will the cat be eating later?,yes
...,...,...,...,...
5995,TRAIN_359083,train_021061,Is the horse moving?,yes
5996,TRAIN_359094,train_014637,Do you see a beach?,yes
5997,TRAIN_359265,train_074277,Is the train on a mountain?,yes
5998,TRAIN_359335,train_006738,Does this animal appear to be in the wild?,no
