## Use this Notebook for Inference

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import gc
import numpy as np

from models.popular_models import PopularModels
from models.lstm import LSTM
from torchvision import transforms
from tqdm import tqdm
from data.dataset import MultiLabelDataset
from tools.tools import get_data, load_data, tokenize, remove_class, count_class, calculate_pos_weights
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Load the Data

In [2]:
DIR="../dataset/" # directory containing the "data" folder and the .csv files

In [3]:
# Fixed seed so the train/validation split (and therefore the order of the
# captions handed to the tokenizer below) is identical on every
# "Restart Kernel -> Run All".
# NOTE(review): confirm this matches the split used when the checkpoint was
# trained, in case the vocabulary indices depend on caption order.
SPLIT_SEED = 42

# import data
train_data = get_data(f"{DIR}/train.csv")
test_data = get_data(f"{DIR}/test.csv")

# perform text cleaning and get the pandas' dataframe
# (the test file is loaded with has_label=False)
train_data = load_data(train_data)
test_data = load_data(test_data, has_label=False)

# remove an imbalanced class
train_data = remove_class(train_data, class_no=1)

# split into training and validating sets:
# the first two columns are treated as inputs, the remaining columns as targets
X = train_data.iloc[:, 0:2]
y = train_data.iloc[:, 2:]
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=SPLIT_SEED,
)

train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_test, y_test], axis=1)

# join the captions together in train -> val -> test order;
# the tokenized output is sliced back apart using this same order
for_nlp_data = pd.concat((train_data['caption'], val_data['caption'], test_data['caption']), ignore_index=True)

In [4]:
# shape of the training, validation and test frames, in that order
for frame in (train_data, val_data, test_data):
	print(frame.shape)

(12740, 21)
(3185, 21)
(10000, 2)


In [5]:
# report the number of rows in each split (first entry of each frame's shape)
print(f"Number of training instances: 	{train_data.shape[0]}")
print(f"Number of validation instances: {val_data.shape[0]}")
print(f"Number of testing instances:  	{test_data.shape[0]}")

Number of training instances: 	12740
Number of validation instances: 3185
Number of testing instances:  	10000


## Preprocessing for Images and Captions

In [6]:
# image preprocessing: resize, centre-crop, convert to tensor, normalise
# NOTE(review): the original comment said this follows resnet18, while the
# model loaded further down is "regnet_x_1_6gf" - confirm the intended preset.
image_steps = [
	transforms.Resize((232, 232)),
	transforms.CenterCrop(224),
	transforms.ToTensor(), # converts images to [0, 1]
	transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(image_steps)

# tokenize every caption in one pass, then cut the result back into the
# train / val / test parts using the order in which they were concatenated
final_list, vocab = tokenize(for_nlp_data)

n_train = train_data.shape[0]
n_val = val_data.shape[0]
val_end = n_train + n_val

X_train_vec = final_list[:n_train, :]
X_val_vec = final_list[n_train:val_end, :]
X_test_vec = final_list[val_end:, :]

for vectors in (final_list, X_train_vec, X_val_vec, X_test_vec):
	print(vectors.shape)

Max Sentence Length: 28
(25925, 28)
(12740, 28)
(3185, 28)
(10000, 28)


## Create Dataset and DataLoader

In [7]:
def _build_dataset(frame, caption_vec, is_test):
	"""Create a MultiLabelDataset with the settings shared by all three splits.

	Only the dataframe, the caption vectors and the `is_test` flag differ
	between the training, validation and test datasets.
	"""
	return MultiLabelDataset(
		csv_file=frame,
		root_dir='../dataset/data/',
		vectorizer=None,
		transform=transform,
		use_caption_vec=True,
		caption_vec=caption_vec,
		is_test=is_test,
	)


# one dataset per split
train_dataset = _build_dataset(train_data, X_train_vec, is_test=False)
val_dataset = _build_dataset(val_data, X_val_vec, is_test=False)
test_dataset = _build_dataset(test_data, X_test_vec, is_test=True)

BATCH_SIZE=16

# batch the datasets; only the test loader keeps the dataset order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Create the Combined Model

In [8]:
class CombinedModel(nn.Module):
	"""Classifier that fuses an image branch and a caption branch.

	The image branch is whatever `PopularModels(...).get_model()` returns and
	the caption branch is an `LSTM`. Their outputs are concatenated along
	dim 1, passed through dropout and projected to `n_classes` logits by a
	single linear layer.

	Args:
		choice: name of the CNN architecture, forwarded to `PopularModels`.
		pretrained: forwarded to `PopularModels`.
		freeze: forwarded to `PopularModels`.
		cnn_n_out: size of the CNN branch output.
		no_layers: forwarded to `LSTM`.
		vocab_size: vocabulary size; `vocab_size + 1` is what gets passed to
			`LSTM`.
		embedding_dim: forwarded to `LSTM`.
		lstm_hidden_dim: forwarded to `LSTM` as `hidden_dim`.
		lstm_n_out: size of the LSTM branch output.
		lstm_dropout: forwarded to `LSTM` as `dropout`.
		fc_dropout: dropout probability applied to the concatenated features.
		n_classes: number of output logits. Defaults to 19, the value that was
			previously hard-coded, so existing callers and checkpoints are
			unaffected.
	"""

	def __init__(
			self,
			choice: str,
			pretrained: bool,
			freeze: bool,
			cnn_n_out: int,
			no_layers: int,
			vocab_size: int,
			embedding_dim: int,
			lstm_hidden_dim: int,
			lstm_n_out: int,
			lstm_dropout: float = 0.5,
			fc_dropout: float = 0.5,
			n_classes: int = 19,
		) -> None:
		super().__init__()

		# get the CNN model for image classification
		self.cnn_model = PopularModels(
			choice=choice,
			pretrained=pretrained,
			freeze=freeze,
			n_out=cnn_n_out,
		).get_model()

		# get the LSTM model for text classification
		self.lstm_model = LSTM(
			no_layers=no_layers,
			vocab_size=vocab_size + 1,
			embedding_dim=embedding_dim,
			hidden_dim=lstm_hidden_dim,
			output_dim=lstm_n_out,
			dropout=lstm_dropout,
		)

		# final fully connected layer over the concatenated branch outputs
		self.last_layer = nn.Linear(cnn_n_out + lstm_n_out, n_classes)

		# prevent overfitting; in-place, so it overwrites the concatenated
		# feature tensor rather than allocating a new one
		self.dropout_layer = nn.Dropout(p=fc_dropout, inplace=True)

	def forward(self, x, y, hidden):
		"""Run both branches and return `(logits, hidden)`.

		Args:
			x: input for the CNN branch.
			y: input for the LSTM branch.
			hidden: hidden state handed to the LSTM branch.

		Returns:
			A tuple of the output of the final linear layer (raw logits, no
			sigmoid applied) and the hidden state returned by the LSTM branch.
		"""
		x = self.cnn_model(x)
		y, hidden = self.lstm_model(y, hidden)

		# both branch outputs are joined along dim 1 before the classifier
		output = torch.cat((x, y), dim=1)
		output = self.dropout_layer(output)
		output = self.last_layer(output)

		return output, hidden

## Define the model

In [9]:
# per-class counts on the training frame, turned into positive-class weights
per_class_counts = count_class(train_data)
class_counts = list(per_class_counts.values())
pos_weights = calculate_pos_weights(class_counts, train_data)

# keep the weights on the GPU whenever one is available
pos_weights = pos_weights.to('cuda') if torch.cuda.is_available() else pos_weights

In [10]:
EPOCHS = 20
THRESHOLD = 0.7		# sigmoid probability a class must exceed to be predicted
N_CLASSES = 19

model = CombinedModel(
	choice="regnet_x_1_6gf",
	pretrained=True, 		# if pretrained is False, then freeze should also be False
	freeze=True,
	cnn_n_out=256,
	no_layers=2,
	# NOTE(review): the original comment said this is "already added by 1",
	# yet CombinedModel adds 1 again - confirm against the checkpoint.
	vocab_size=len(vocab),
	embedding_dim=64,
	lstm_hidden_dim=256,
	lstm_n_out=128,
	lstm_dropout=0,
	fc_dropout=0,
)

# Load the weights onto the CPU first: without map_location a checkpoint saved
# from a GPU fails to load on a machine without CUDA. The model is moved to the
# GPU later when one is available.
state_dict = torch.load('../regnet_lstm_model.pth', map_location='cpu')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [11]:
# create a file for test submission
f = open('../submission_combined.csv', "w")
f.write("ImageID,Labels\n")

# utilise GPU
if torch.cuda.is_available():
	print('using GPU')
	model = model.to('cuda')

# initialize the hidden state
hidden = model.lstm_model.init_hidden(batch_size=BATCH_SIZE)

# idx = 0
model.eval()
for (image_names, images, captions) in test_dataloader:

	# creating new variables for the hidden state, otherwise
	# we'd backprop through the entire training history
	hidden = tuple([each.data for each in hidden])
	
	if torch.cuda.is_available():
		images = images.to('cuda')
		captions = captions.to('cuda')
	
	outputs, _ = model(images, captions, hidden)
	predicted = (F.sigmoid(outputs) > THRESHOLD).int()

	# NOTE: add 1 to the output of predicted!
	# write the output
	for i, predicted_label in enumerate(predicted):
		label = (predicted_label == torch.max(predicted_label)).nonzero().flatten()
		label += 1
		label = label.tolist()
		label = " ".join(str(x) for x in label)

		f.write(image_names[i].split("/")[-1] + "," + str(label) +"\n")

f.close()

using GPU
