In [1]:
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch import optim
from torch import nn

from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torchvision import transforms

import random
from glob import glob
import pandas as pd
import numpy as np
from PIL import Image

In [2]:
from glob import glob

import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torchvision import transforms

def extract_day(images):
    day = int(images.split('.')[-2][-2:])
    return day

def make_day_array(images):
    day_array = np.array([extract_day(x) for x in images])
    return day_array

def make_combination(length, species, data_frame, direct_name):
    before_file_path = []
    after_file_path = []
    time_delta = []

    for i in range(length):
        
        # 하위 폴더 중에서 랜덤하게 선택을 한다.
        direct = random.randrange(0,len(direct_name))
        # 위에서 결정된 폴더를 선택한다. 
        temp = data_frame[data_frame['version'] == direct_name[direct]]
    
        # 밑은 기존의 코드와 동일합니다.
        sample = temp[temp['species'] == species].sample(2)
        after = sample[sample['day'] == max(sample['day'])].reset_index(drop=True)
        before = sample[sample['day'] == min(sample['day'])].reset_index(drop=True)

        before_file_path.append(before.iloc[0]['file_name'])
        after_file_path.append(after.iloc[0]['file_name'])
        delta = int(after.iloc[0]['day'] - before.iloc[0]['day'])
        time_delta.append(delta)

    combination_df = pd.DataFrame({
        'before_file_path': before_file_path,
        'after_file_path': after_file_path,
        'time_delta': time_delta,
    })

    combination_df['species'] = species

    return combination_df

class KistDataset(Dataset):
    def __init__(self, combination_df, is_test=None):
        self.combination_df = combination_df
        self.transform = transforms.Compose([
            transforms.ToTensor()
        ])
        self.is_test = is_test

    def __getitem__(self, idx):
        before_image = Image.open(self.combination_df.iloc[idx]['before_file_path'])
        after_image = Image.open(self.combination_df.iloc[idx]['after_file_path'])

        before_image = self.transform(before_image)
        after_image = self.transform(after_image)
        if self.is_test:
            return before_image, after_image
        time_delta = self.combination_df.iloc[idx]['time_delta']
        return before_image, after_image, time_delta

    def __len__(self):
        return len(self.combination_df)

In [3]:
import torch
from torch import nn
from torchvision.models import mobilenet_v2


class CompareCNN(nn.Module):

    def __init__(self):
        super(CompareCNN, self).__init__()
        self.mobile_net = mobilenet_v2(pretrained=True)
        self.fc_layer = nn.Linear(1000, 1)

    def forward(self, input):
        x = self.mobile_net(input)
        output = self.fc_layer(x)
        return output


class CompareNet(nn.Module):

    def __init__(self):
        super(CompareNet, self).__init__()
        self.before_net = CompareCNN()
        self.after_net = CompareCNN()

    def forward(self, before_input, after_input):
        before = self.before_net(before_input)
        after = self.after_net(after_input)
        delta = before - after
        return delta

In [4]:
def seed_everything(seed): # seed 고정
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)


seed_everything(2048)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
lr = 1e-5
epochs = 10
batch_size = 64
valid_batch_size = 50

model = CompareNet().to(device)

# 학습 데이터가 있는 폴더 위치
root_path = './drive/MyDrive/open_224/train_dataset/'

# BC 폴더와 LT 폴더에 있는 하위 폴더를 저장한다.
bc_direct = glob(root_path + '/BC/*')
bc_direct_name = [x[-5:] for x in bc_direct]
lt_direct = glob(root_path + '/LT/*')
lt_direct_name = [x[-5:] for x in lt_direct]

# 하위 폴더에 있는 이미지들을 하위 폴더 이름과 매칭시켜서 저장한다.
bc_images = {key : glob(name + '/*.png') for key,name in zip(bc_direct_name, bc_direct)}
lt_images = {key : glob(name + '/*.png') for key,name in zip(lt_direct_name, lt_direct)}

# 하위 폴더에 있는 이미지들에서 날짜 정보만 따로 저장한다.
bc_dayes = {key : make_day_array(bc_images[key]) for key in bc_direct_name}
lt_dayes = {key : make_day_array(lt_images[key]) for key in lt_direct_name}

bc_dfs = []
for i in bc_direct_name:
    bc_df = pd.DataFrame({
        'file_name':bc_images[i],
        'day':bc_dayes[i],
        'species':'bc',
        'version':i
    })
    bc_dfs.append(bc_df)
    
lt_dfs = []
for i in lt_direct_name:
    lt_df = pd.DataFrame({
        'file_name':lt_images[i],
        'day':lt_dayes[i],
        'species':'lt',
        'version':i
    })
    lt_dfs.append(lt_df)

bc_dataframe = pd.concat(bc_dfs).reset_index(drop=True)
lt_dataframe = pd.concat(lt_dfs).reset_index(drop=True)
total_dataframe = pd.concat([bc_dataframe, lt_dataframe]).reset_index(drop=True)

bc_combination = make_combination(5000, 'bc', total_dataframe, bc_direct_name)
lt_combination = make_combination(5000, 'lt', total_dataframe, lt_direct_name)

bc_train = bc_combination.iloc[:4500]
bc_valid = bc_combination.iloc[4500:]

lt_train = lt_combination.iloc[:4500]
lt_valid = lt_combination.iloc[4500:]

train_set = pd.concat([bc_train, lt_train])
valid_set = pd.concat([bc_valid, lt_valid])



train_dataset = KistDataset(train_set)
valid_dataset = KistDataset(valid_set)

optimizer = optim.Adam(model.parameters(), lr=lr)

train_data_loader = DataLoader(train_dataset,
                               batch_size=batch_size,
                               shuffle=True)

valid_data_loader = DataLoader(valid_dataset,
                               batch_size=valid_batch_size)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [5]:
for epoch in tqdm(range(epochs)):
    for step, (before_image, after_image, time_delta) in tqdm(enumerate(train_data_loader)):
        before_image = before_image.to(device)
        after_image = after_image.to(device)
        time_delta = time_delta.to(device)

        optimizer.zero_grad()
        logit = model(before_image, after_image)
        train_loss = (torch.sum(torch.abs(logit.squeeze(1).float() - time_delta.float())) /
                      torch.LongTensor([batch_size]).squeeze(0).to(device))
        train_loss.backward()
        optimizer.step()

        if step % 15 == 0:
            print('\n=====================loss=======================')
            print(f'\n=====================EPOCH: {epoch}=======================')
            print(f'\n=====================step: {step}=======================')
            print('MAE_loss : ', train_loss.detach().cpu().numpy())

    valid_losses = []
    with torch.no_grad():
        for valid_before, valid_after, time_delta in tqdm(valid_data_loader):
            valid_before = valid_before.to(device)
            valid_after = valid_after.to(device)
            valid_time_delta = time_delta.to(device)


            logit = model(valid_before, valid_after)
            valid_loss = (torch.sum(torch.abs(logit.squeeze(1).float() - valid_time_delta.float())) /
                          torch.LongTensor([valid_batch_size]).squeeze(0).to(device))
            valid_losses.append(valid_loss.detach().cpu())


    print(f'VALIDATION_LOSS MAE : {sum(valid_losses)/len(valid_losses)}')
    checkpoint = {
        'model': model.state_dict(),

    }

    torch.save(checkpoint, 'baseline_224_v2.pt')

  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]




MAE_loss :  11.371799



MAE_loss :  7.75923



MAE_loss :  6.7372313



MAE_loss :  5.009716



MAE_loss :  3.8559432



MAE_loss :  3.5107956



MAE_loss :  2.6667557



MAE_loss :  2.1462426



MAE_loss :  2.0172544



MAE_loss :  2.0356688


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.2665622234344482


0it [00:00, ?it/s]




MAE_loss :  1.3228204



MAE_loss :  1.3841915



MAE_loss :  3.0957398



MAE_loss :  1.4582477



MAE_loss :  1.8226964



MAE_loss :  1.8060827



MAE_loss :  1.1020799



MAE_loss :  1.3985319



MAE_loss :  1.5646075



MAE_loss :  1.4116026


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.8870331048965454


0it [00:00, ?it/s]




MAE_loss :  1.2352881



MAE_loss :  1.6094171



MAE_loss :  1.157188



MAE_loss :  1.1775725



MAE_loss :  1.323187



MAE_loss :  1.3017573



MAE_loss :  1.1107258



MAE_loss :  0.99284244



MAE_loss :  1.1490357



MAE_loss :  2.1449003


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.8864469528198242


0it [00:00, ?it/s]




MAE_loss :  1.2565868



MAE_loss :  1.7183034



MAE_loss :  1.188488



MAE_loss :  2.4598422



MAE_loss :  1.4860849



MAE_loss :  1.8955243



MAE_loss :  1.1769825



MAE_loss :  1.1347753



MAE_loss :  1.4782833



MAE_loss :  1.4405215


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.7350174188613892


0it [00:00, ?it/s]




MAE_loss :  0.9790384



MAE_loss :  2.154814



MAE_loss :  1.8565398



MAE_loss :  3.0860956



MAE_loss :  2.0725493



MAE_loss :  0.92880845



MAE_loss :  1.1011444



MAE_loss :  0.87128687



MAE_loss :  0.78401875



MAE_loss :  0.78088254


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.6513583660125732


0it [00:00, ?it/s]




MAE_loss :  3.2978644



MAE_loss :  1.5080223



MAE_loss :  0.87967825



MAE_loss :  0.8032442



MAE_loss :  1.9589887



MAE_loss :  2.0689054



MAE_loss :  0.9425156



MAE_loss :  0.97126263



MAE_loss :  0.87072164



MAE_loss :  1.2588458


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.662126898765564


0it [00:00, ?it/s]




MAE_loss :  0.8977678



MAE_loss :  0.7673399



MAE_loss :  3.87243



MAE_loss :  1.3951871



MAE_loss :  1.0580425



MAE_loss :  1.0015527



MAE_loss :  1.1818097



MAE_loss :  0.7887384



MAE_loss :  0.8195212



MAE_loss :  0.9200369


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.590947151184082


0it [00:00, ?it/s]




MAE_loss :  1.0317024



MAE_loss :  0.64632225



MAE_loss :  0.91674155



MAE_loss :  1.1381388



MAE_loss :  0.92656887



MAE_loss :  1.8979282



MAE_loss :  1.130152



MAE_loss :  0.8331032



MAE_loss :  1.6856294



MAE_loss :  2.6215806


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.709158182144165


0it [00:00, ?it/s]




MAE_loss :  1.132261



MAE_loss :  0.9268568



MAE_loss :  1.0691562



MAE_loss :  1.5205935



MAE_loss :  0.7395958



MAE_loss :  2.6691656



MAE_loss :  1.3963048



MAE_loss :  0.8274231



MAE_loss :  1.5689931



MAE_loss :  1.2344961


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.67898428440094


0it [00:00, ?it/s]




MAE_loss :  2.6098223



MAE_loss :  0.97541964



MAE_loss :  2.3729603



MAE_loss :  1.9074953



MAE_loss :  2.727512



MAE_loss :  1.3314059



MAE_loss :  3.330239



MAE_loss :  0.921199



MAE_loss :  1.7338966



MAE_loss :  1.5587275


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 1.6335718631744385


In [6]:
test_set = pd.read_csv('./drive/MyDrive/open_224/test_dataset/test_data.csv')
test_set['l_root'] = test_set['before_file_path'].map(lambda x: './drive/MyDrive/open_224/test_dataset/' + x.split('_')[1] + '/' + x.split('_')[2])
test_set['r_root'] = test_set['after_file_path'].map(lambda x: './drive/MyDrive/open_224/test_dataset/' + x.split('_')[1] + '/' + x.split('_')[2])
test_set['before_file_path'] = test_set['l_root'] + '/' + test_set['before_file_path'] + '.png'
test_set['after_file_path'] = test_set['r_root'] + '/' + test_set['after_file_path'] + '.png'

test_dataset = KistDataset(test_set, is_test=True)
test_data_loader = DataLoader(test_dataset,
                               batch_size=64)

In [7]:
test_value = []
with torch.no_grad():
    for test_before, test_after in tqdm(test_data_loader):
        test_before = test_before.to(device)
        test_after = test_after.to(device)
        logit = model(test_before, test_after)
        value = logit.squeeze(1).detach().cpu().float()
        
        test_value.extend(value)

  0%|          | 0/62 [00:00<?, ?it/s]

In [8]:
# submission 형식을 불러온다.
submission = pd.read_csv('./drive/MyDrive/open_224/sample_submission.csv')

# 예측한 값들은 텐서 형태로 변환 시켜준다.
predict = torch.FloatTensor(test_value)

# 음수의 값을 갖는 모든 값들을 1 Day 차이가 발생하도록 바꿔줌
temp_predict = predict.numpy()
temp_predict[np.where(temp_predict<1)] = 1

# 모델의 예측 값을 저장함
submission['time_delta'] = temp_predict
submission.to_csv('baseline_224_v2.csv', index=False)