In [None]:
!wget https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/EXVy7_7pF5FIsPp6WfXXfWgBNfUKx8N1VrTisN8FbGYG9w?download=1 -O Flickr8k_dataset.zip

In [None]:
!unzip Flickr8k_dataset.zip -d ./Flickr8k_dataset

In [None]:
from google.colab import files
# Open Colab's file-upload prompt and keep the result in `myfile`.
# NOTE(review): `myfile` is not referenced by the cells visible here — confirm
# which file is expected to be uploaded and where it is used.
myfile=files.upload()

In [None]:
import os
import cv2

# Folder holding the images of the extracted dataset
images_path = "./Flickr8k_dataset/Images"
# Output folders for the train / val / test images (Google Drive)
# NOTE(review): presumably Google Drive must be mounted at /content/gdrive
# before these paths are written to — confirm.
images_train = '/content/gdrive/MyDrive/data/cv_test/train'
images_val = '/content/gdrive/MyDrive/data/cv_test/val'
images_test = '/content/gdrive/MyDrive/data/cv_test/test'
# Target size passed to resizeImage for every image
size=[256, 256]

# Resize an image
def resizeImage(image, size):
  """Return `image` resized to `size` using cv2.resize.

  Args:
    image: image array accepted by cv2.resize (e.g. the result of cv2.imread).
    size: target size as a two-element sequence; a list or a tuple is accepted.

  Returns:
    The resized image as returned by cv2.resize.
  """
  # Some OpenCV builds only accept a tuple for the target size, and this
  # notebook defines `size` as a list, so normalize it before the call.
  return cv2.resize(image, tuple(size))

# Resize the images and save them into the train, val and test folders.
# Create the output folders first (no-op when they already exist).
for split_dir in (images_train, images_val, images_test):
  os.makedirs(split_dir, exist_ok=True)

# os.listdir() returns names in arbitrary order. Sort them so the split is
# reproducible and follows the same file-name order as the caption split,
# which sorts its lines before dividing them.
images = sorted(os.listdir(images_path))
imageNum = len(images)
train_imageNum = 6000
val_imageNum = 1000

# Split into train / val / test and save each resized image.
for i, image in enumerate(images):
  # The branches must be exclusive (if / elif / else); otherwise the train
  # choice is immediately overwritten by the val/test decision.
  if (i + 1) <= train_imageNum:
    output = images_train
  elif (i + 1) <= train_imageNum + val_imageNum:
    output = images_val
  else:
    output = images_test

  img = cv2.imread(os.path.join(images_path, image))
  if img is None:
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    print("Skipping unreadable image:", image)
    continue
  # Pass the size as a tuple so the call works on every OpenCV build.
  img = resizeImage(img, tuple(size))
  # The output format is chosen by cv2.imwrite from the file extension.
  cv2.imwrite(os.path.join(output, image), img)

In [None]:
import pickle
import nltk
from collections import Counter

# Tokenizer data used by nltk.tokenize.word_tokenize. Recent NLTK releases load
# the 'punkt_tab' resource instead of 'punkt', so request both; on releases
# that do not know 'punkt_tab' the second call only reports that it is missing.
nltk.download('punkt')
nltk.download('punkt_tab')

# Input captions file and output path of the pickled vocabulary
caption_path = "./Flickr8k_dataset/captions.txt"
vocab_path = "./vocab.pkl"

# Minimum number of occurrences a word needs to enter the vocabulary
word_threshold = 5
# Per-split caption files written while splitting the captions
train_caption_path = "./resized_train/captions.txt"
val_caption_path = "./resized_val/captions.txt"
test_caption_path = "./resized_test/captions.txt"


class Vocabulary(object):
    """Two-way mapping between words and consecutive integer indices.

    Attributes:
        word: dict mapping each word to its index.
        idx: dict mapping each index back to its word.
    """

    def __init__(self):
        self.word = {}  # word -> index
        self.idx = {}  # index -> word

    def add_word(self, word):
        """Register `word` under the next free index; known words are left unchanged."""
        if word not in self.word:
            # Indices are handed out consecutively from 0, so the next free
            # index always equals the number of words stored so far. This
            # avoids a separate counter that would clash with the `idx` dict.
            new_index = len(self.word)
            self.word[word] = new_index
            self.idx[new_index] = word

    def __call__(self, word):
        """Return the index of `word`, falling back to the index of '<unk>'.

        Raises:
            KeyError: if `word` is unknown and '<unk>' was never added.
        """
        if word not in self.word:
            return self.word['<unk>']  # unknown word
        return self.word[word]

    def __len__(self):
        """Return the number of words stored."""
        return len(self.word)

import os

counter = Counter()

# Make sure the folder of every split file exists before writing to it.
for split_path in (train_caption_path, val_caption_path, test_caption_path):
    os.makedirs(os.path.dirname(split_path) or ".", exist_ok=True)

# Each split file is opened once, in write mode, so re-running the cell
# rewrites the files instead of appending duplicate captions.
with open(caption_path, "r") as f, \
        open(train_caption_path, "w") as train_f, \
        open(val_caption_path, "w") as val_f, \
        open(test_caption_path, "w") as test_f:
    # Skip the header line and drop blank lines (they would sort first and
    # shift the split boundaries), then sort so the captions of one image are
    # adjacent and ordered by file name.
    lines = sorted(line for line in f.readlines()[1:] if line.strip())
    for i, line in enumerate(lines):
        if not line.endswith("\n"):
            # The file's last line may lack a newline; after sorting it could
            # otherwise be glued to the line written after it.
            line += "\n"
        if (i + 1) <= train_imageNum * 5:  # 5 captions per image
            output_caption_f = train_f
        elif (i + 1) <= (train_imageNum + val_imageNum) * 5:
            output_caption_f = val_f
        else:
            output_caption_f = test_f
        index = line.find(",")  # the caption starts after the first comma
        caption = line[index + 1:]  # caption string
        tokens = nltk.tokenize.word_tokenize(caption.lower())  # tokenize
        counter.update(tokens)  # count token occurrences
        output_caption_f.write(line)

# Keep only the words that occur at least `word_threshold` times
words = [word for word, cnt in counter.items() if cnt >= word_threshold]

# The special tokens are added first, so they receive the lowest indices.
vocab = Vocabulary()
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

for word in words:
    vocab.add_word(word)

with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)

In [None]:
import torch.utils.data as data


# Flickr8k dataset
class Flickr8kDataset(data.Dataset):
    """Dataset of (image, caption) pairs listed in a captions file.

    Each line of the captions file has the form `<image file name>,<caption>`;
    the text before the first comma is the file name, the rest is the caption.

    Args:
        root: folder containing the image files.
        captions: path of the captions file (no header line).
        vocab: callable mapping a token to its integer index.
        transform: optional callable applied to every loaded image.
    """

    def __init__(self, root, captions, vocab, transform=None):
        self.root = root
        self.captions = []
        with open(captions, "r") as f:
            for line in f:  # caption entries start at the very first line
                index = line.find(",")  # the caption starts after the first comma
                if index < 0:
                    # Blank or malformed line without a "name,caption" separator.
                    continue
                path = line[:index]  # image file name
                caption = line[index + 1:]  # caption string
                self.captions.append((path, caption))
        self.vocab = vocab
        self.transform = transform

    # Fetch one image together with its caption
    def __getitem__(self, index):
        """Return `(image, target)` for entry `index`.

        `image` is the loaded image (after `transform`, when one was given).
        `target` is built with torch.Tensor, i.e. a floating-point tensor,
        holding the indices of `<start>`, the caption tokens and `<end>`.
        """
        # Imported here so torchvision is only required when an image is loaded.
        from torchvision.datasets.folder import default_loader

        vocab = self.vocab
        path, caption = self.captions[index]

        # The built-in open() returns a file object without .convert();
        # torchvision's loader opens the image and (with its default PIL
        # backend) converts it to RGB.
        image = default_loader(os.path.join(self.root, path))
        if self.transform is not None:
            image = self.transform(image)

        # Convert the caption string into a list of token indices
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        """Return the number of (image, caption) entries."""
        return len(self.captions)

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    """Image encoder: pre-trained ResNet-101 without its FC layer, followed by
    a linear projection to `embed_size` and 1-D batch normalization."""

    def __init__(self, embed_size):
        super().__init__()
        # Load the pre-trained ResNet-101 and keep every child module except
        # the last one (the FC layer).
        resnet = models.resnet101(pretrained=True)
        backbone_layers = [*resnet.children()][:-1]
        self.resnet = nn.Sequential(*backbone_layers)
        # Map the backbone output to the embedding dimension.
        fc_in_features = resnet.fc.in_features
        self.linear = nn.Linear(fc_in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return the feature vectors extracted from the input images."""
        # The backbone runs without gradient tracking so that this part of
        # the network is not changed by training.
        with torch.no_grad():
            backbone_out = self.resnet(images)
        # Flatten everything after the batch dimension.
        flat = backbone_out.reshape(backbone_out.size(0), -1)
        projected = self.linear(flat)
        return self.bn(projected)


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Create the decoder's layers.

        Args:
            embed_size: size of each embedding vector; also the LSTM input size.
            hidden_size: LSTM hidden size; also the input size of the linear layer.
            vocab_size: number of embedding rows and output size of the linear layer.
            num_layers: number of LSTM layers.
            max_seq_length: stored as `self.max_seg_length` (note the spelling);
                presumably caps the generated caption length — confirm where it is used.
        """
        # Set the hyper-parameters and create the layers
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length