In [1]:
import torch.nn as nn
import numpy as np
from torchvision import models, transforms
from sklearn import preprocessing
import os
import torch
from text_cnn_model import TextClassification
import joblib
import time
import copy
import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch.optim import lr_scheduler


In [21]:
# LOAD THE IMAGE MODEL

# Build the ResNet-50 architecture only. The strict checkpoint load below
# overwrites every parameter and buffer, so fetching the ImageNet weights
# (pretrained=True) would only add a network download that is thrown away.
model_img = models.resnet50(pretrained=False)

# Recreate the classification head used when the checkpoint was saved.
num_ftrs = model_img.fc.in_features
model_img.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Linear(256, 13),
)

# Strict load: raises if checkpoint and architecture do not match key-for-key.
# map_location='cpu' allows a checkpoint written on a GPU to be read on a
# CPU-only machine; the combined model is moved to the target device later.
save_dir_path = os.path.join('/notebooks', 'models', 'best_image_cnn_model.pt')
model_img.load_state_dict(torch.load(save_dir_path, map_location='cpu'))

# Turn the classifier into a 256-d feature extractor by dropping the last
# linear layer. Slicing the already-loaded head keeps the trained
# Linear(num_ftrs, 256) weights, so the checkpoint does not have to be read a
# second time with strict=False.
model_img.fc = nn.Sequential(*list(model_img.fc.children())[:-1])

# freeze all params
for param in model_img.parameters():
    param.requires_grad = False

In [22]:
# LOAD THE TEXT MODEL
EMBD_DIM = 128
# Build the full model exactly as it was defined when the checkpoint was saved.
model_txt = TextClassification(
    word_embd_dim=EMBD_DIM, num_classes=13, word_kernel_size=3)

# Take the classification layers (the last child module of the model) ...
# NOTE(review): assumes this last child is registered under the attribute name
# `linear_layer` - confirm against text_cnn_model.py.
linear_layers = list(model_txt.children())[-1]
# ... drop the final linear layer so the model emits features, not class scores
new_linear_layers = nn.Sequential(*list(linear_layers.children())[:-1])
# ... and put the truncated head back on the model.
model_txt.linear_layer = new_linear_layers

# Load the trained weights. strict=False is needed because the checkpoint still
# holds the weights of the layer that was just removed (unexpected keys).
# map_location='cpu' allows a checkpoint written on a GPU to be read on a
# CPU-only machine.
save_dir_path = os.path.join('/notebooks', 'models', 'best_text_cnn_model.pt')
load_result = model_txt.load_state_dict(
    torch.load(save_dir_path, map_location='cpu'), strict=False)
# strict=False also silences *missing* keys, which would leave frozen layers
# randomly initialised; fail loudly in that case instead.
if load_result.missing_keys:
    raise RuntimeError(
        'Text checkpoint is missing weights for: '
        + ', '.join(load_result.missing_keys))

# freeze all params
for param in model_txt.parameters():
    param.requires_grad = False

In [32]:
class CombinedImageTextModel(nn.Module):
    """Late-fusion classifier over an image model and a text model.

    Both sub-models are run on their own input, their outputs are concatenated
    along the feature dimension (dim=1) and passed through dropout and a single
    linear layer that produces the class scores.

    Args:
        image_model: module mapping an image batch to a 2-D feature batch.
        text_model: module mapping a text batch to a 2-D feature batch.
        num_classes: number of output classes.
        combined_features: width of the concatenated feature vector, i.e. the
            sum of the two sub-model output widths. Defaults to 512, the value
            that was previously hard-coded.
    """

    def __init__(self, image_model, text_model, num_classes, combined_features=512):
        super().__init__()
        self.image_model = image_model
        self.text_model = text_model
        self.num_classes = num_classes
        self.combined_features = combined_features
        self.combined_layer = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_features=combined_features,
                      out_features=self.num_classes),
        )

    def forward(self, ip_img, ip_txt):
        """Return class scores of shape (batch, num_classes)."""
        op_img = self.image_model(ip_img)
        op_txt = self.text_model(ip_txt)
        # Fuse the two feature vectors side by side for every sample.
        op = torch.cat((op_img, op_txt), dim=1)
        return self.combined_layer(op)

In [6]:
# prepare the target data

# Pickled label lists for the train and validation splits.
train_pklname = os.path.join('/notebooks/data', 'train_txt_labels.pkl')
val_pklname = os.path.join('/notebooks/data', 'val_txt_labels.pkl')
train_txt_labels = joblib.load(train_pklname)
val_txt_labels = joblib.load(val_pklname)

# Per-description index sequences; a later cell uses their lengths to find the
# longest description.
train_descs_idx = joblib.load(
    os.path.join('/notebooks/data', 'train_descs_idx.pkl'))
val_descs_idx = joblib.load(
    os.path.join('/notebooks/data', 'val_descs_idx.pkl'))

# OrdinalEncoder expects a 2-D (n_samples, 1) column.
label_train = np.array(train_txt_labels).reshape(-1, 1)
label_val = np.array(val_txt_labels).reshape(-1, 1)

# Fit the encoder on the training labels only, then reuse the same mapping for
# the validation labels. The encoded targets are flattened back to 1-D.
target_enc = preprocessing.OrdinalEncoder()
target_train = target_enc.fit_transform(label_train).ravel()
target_val = target_enc.transform(label_val).ravel()


In [7]:
# prepare the image input data

# Common prefix of the pickles written by prepare_image_data.py.
data_path = os.path.join('/notebooks/data/', 'img_prepared')
train_pklname, val_pklname = data_path + '_train.pkl', data_path + '_val.pkl'

train_data, val_data = joblib.load(train_pklname), joblib.load(val_pklname)

# Image arrays for each split, taken from the 'data' entry of the pickles.
# NOTE(review): the recorded output of this cell echoes these two np.array
# lines, which looks like a NumPy warning - confirm the images all share one
# shape.
train_img = np.array(train_data['data'])
val_img = np.array(val_data['data'])

# Per-sample image pipeline: convert to a tensor, then normalise each channel
# with the given mean / std.
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(
    [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
img_transform = transforms.Compose([_to_tensor, _normalize])

# prepare the text input data: pre-computed word embeddings per split
train_words_emb = joblib.load(
    os.path.join('/notebooks/data/', 'train_words_emb.pkl'))
val_words_emb = joblib.load(
    os.path.join('/notebooks/data/', 'val_words_emb.pkl'))




  train_img = np.array(train_data['data'])
  train_img = np.array(train_data['data'])
  val_img = np.array(val_data['data'])
  val_img = np.array(val_data['data'])


In [8]:
class CombinedDataset(torch.utils.data.Dataset):
    """Dataset yielding (image, padded text embedding, label) triples.

    Args:
        ip_img: indexable collection of images; each item is passed through
            `img_transform`.
        ip_txt: indexable collection of text embeddings. Each item must be a
            1-D tensor because it is concatenated with 1-D zero padding.
        labels: indexable collection of targets; its length defines the
            dataset length.
        max_word_len: maximum number of words in a description.
        embd_dim: embedding size per word. Every text item is padded to
            `max_word_len * embd_dim` elements.
        img_transform: callable applied to every image.
    """

    def __init__(self, ip_img, ip_txt, labels, max_word_len, embd_dim, img_transform):
        self.embd_dim = embd_dim
        self.ip_img = ip_img
        self.ip_txt = ip_txt
        self.labels = labels
        self.max_word_len = max_word_len
        self.img_transform = img_transform

    def __getitem__(self, index):
        """Return the transformed image, zero-padded text and label at `index`.

        Raises:
            ValueError: if the text item is longer than
                `max_word_len * embd_dim` elements.
        """
        img = self.img_transform(self.ip_img[index])

        text = self.ip_txt[index]
        target_len = self.max_word_len * self.embd_dim
        pad_len = target_len - len(text)
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and the resulting torch.zeros(negative) error would be hard to read.
        if pad_len < 0:
            raise ValueError(
                f'text at index {index} has {len(text)} elements, '
                f'more than the padded length {target_len}')

        # Centre the text; when the padding is odd the extra zero goes left.
        pad_len_r = pad_len // 2
        pad_len_l = pad_len - pad_len_r
        text_padded = torch.cat(
            [torch.zeros(pad_len_l), text, torch.zeros(pad_len_r)])

        return img, text_padded, self.labels[index]

    def __len__(self):
        return len(self.labels)


def train_model(model, criterion, optimizer, scheduler, writer, num_epochs=50, save_path=None):
    """Train `model` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase followed by a 'val' phase. Loss and
    accuracy of both phases are printed and written to `writer`. Whenever the
    validation accuracy improves, the weights are kept in memory and, if
    `save_path` is given, also written to disk.

    The function reads three notebook-level names that must exist before it is
    called: `dataloaders` and `dataset_sizes` (dicts keyed by 'train' and
    'val') and `device`.

    NOTE(review): `model.train()` also switches frozen sub-modules to training
    mode, so layers such as batch norm or dropout inside them still behave as
    in training - confirm this is intended.

    Args:
        model: module called as `model(imgs, texts)`.
        criterion: loss called as `criterion(outputs, labels)`.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training phase.
        writer: object providing `add_scalar(tag, value, step)` and `flush()`.
        num_epochs: number of epochs to run.
        save_path: directory for 'best_combined_model.pt'. When None, the best
            weights are only kept in memory.

    Returns:
        `model`, with the weights of the best validation epoch loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Resolve the checkpoint file once; with the default save_path=None the
    # os.path.join call would raise a TypeError.
    checkpoint_path = None
    if save_path is not None:
        checkpoint_path = os.path.join(save_path, 'best_combined_model.pt')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for imgs, texts, labels in dataloaders[phase]:
                imgs = imgs.to(device)
                # Insert a singleton dimension at position 1 before the text
                # batch is handed to the model.
                texts = texts.unsqueeze(1).to(device)
                # CrossEntropyLoss needs integer class indices.
                labels = labels.to(device=device, dtype=torch.long)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(imgs, texts)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is a batch mean, so weight it by the
                # batch size to get a correct per-sample average later.
                running_loss += loss.item() * imgs.size(0)
                # Keep the counter a plain int so the accuracy below does not
                # depend on tensor methods.
                running_corrects += int(torch.sum(preds == labels).item())
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            writer.add_scalar(f'Loss/{phase}', epoch_loss, epoch)
            writer.add_scalar(f'Accuracy/{phase}', epoch_acc, epoch)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                if checkpoint_path is not None:
                    torch.save(best_model_wts, checkpoint_path)

        print()
        writer.flush()

    time_elapsed = time.time() - since
    print(
        f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [9]:
# define train and val dataset

# Longest description (in words) across both splits; every text sample is
# padded to this many words.
max_train_desc_len = max(len(desc) for desc in train_descs_idx)
max_val_desc_len = max(len(desc) for desc in val_descs_idx)
max_desc_len = max(max_train_desc_len, max_val_desc_len)

# Both splits share the padding length, embedding size and image transform.
trainCombinedDataset = CombinedDataset(
    ip_img=train_img,
    ip_txt=train_words_emb,
    labels=target_train,
    max_word_len=max_desc_len,
    embd_dim=EMBD_DIM,
    img_transform=img_transform,
)
valCombinedDataset = CombinedDataset(
    ip_img=val_img,
    ip_txt=val_words_emb,
    labels=target_val,
    max_word_len=max_desc_len,
    embd_dim=EMBD_DIM,
    img_transform=img_transform,
)

# Lookup tables keyed by phase name, as consumed by the training loop.
datasets = dict(train=trainCombinedDataset, val=valCombinedDataset)
dataset_sizes = {phase: len(split) for phase, split in datasets.items()}


In [25]:
# Output locations: TensorBoard event files and saved model weights.
# Note that `save_dir_path` is rebound here to the models *directory*; the
# model-loading cells use the same name for individual checkpoint files.
log_dir_path = os.path.join('/notebooks', 'runs')
save_dir_path = os.path.join('/notebooks', 'models')

# exist_ok=True tolerates directories that are already there while still
# surfacing real failures (e.g. missing permissions), which the previous bare
# `except: pass` silently swallowed.
for dir_path in (log_dir_path, save_dir_path):
    os.makedirs(dir_path, exist_ok=True)
# %load_ext tensorboard
# %tensorboard --logdir {log_dir_path}


In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# log dir for storing tensorboard files (one timestamped sub-directory per run)
log_dir = os.path.join(
    log_dir_path, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
writer = SummaryWriter(log_dir=log_dir)

model = CombinedImageTextModel(model_img, model_txt, num_classes=13)
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Both sub-models are frozen, so only the parameters that still require
# gradients (the fusion layer) are handed to the optimizer.
# SGD settings as recorded in the experiment notes at the end of this cell.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer_ft = optim.SGD(trainable_params, lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

# Batch size as recorded in the experiment notes at the end of this cell.
BATCH_SIZE = 16

# create a dict of train and val dataloaders; only the training split is
# shuffled, since sample order does not affect the validation metrics
dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=BATCH_SIZE,
                                              shuffle=(x == 'train'), num_workers=4)
               for x in ['train', 'val']}

# train the model
model_ft = train_model(model, criterion, optimizer_ft, exp_lr_scheduler, writer,
                       num_epochs=50, save_path=save_dir_path)

# experiments
# 2022-06-21_09-29-02:
# optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# BATCH_SIZE = 16
# self.combined_layer = nn.Sequential(
#     nn.Dropout(0.5),
#     nn.Linear(in_features=512, out_features=self.num_classes),
# )

