# Intro
Inference notebook for [Hotel-ID starter - similarity - training](https://www.kaggle.com/code/michaln/hotel-id-starter-similarity-training)

This notebook uses the models and base embeddings produced by the training notebook to generate embeddings for the test images and find the most similar training images.

# Setup

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

# Imports

In [None]:
import numpy as np
import pandas as pd
import random
import os
import math

In [None]:
from PIL import Image as pil_image
from tqdm import tqdm

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.models as models

import timm
from sklearn.metrics.pairwise import cosine_similarity

# Global

In [None]:
# Seed passed to seed_everything() and used as random_state for sampling.
SEED = 42
# Side length (pixels) that every image is resized to before inference.
IMG_SIZE = 256
# Number of distinct hotel ids returned per query image.
N_MATCHES = 5

PROJECT_FOLDER = "../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/"
TRAIN_DATA_FOLDER = "../input/hotelid-2022-train-images-256x256/images/"
TEST_DATA_FOLDER = PROJECT_FOLDER + "test_images/"
# Debugging alternative: run inference on a single training folder instead.
#TEST_DATA_FOLDER = PROJECT_FOLDER + "train_images/100055/"

In [None]:
# Sanity check: show what the competition input folder contains.
print(os.listdir(PROJECT_FOLDER))

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    sets ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs; disable it so the
    # deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False

# Dataset and transformations

In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2 

# Inference-time pipeline: no augmentation, only conversion steps.
base_transform = A.Compose([
    A.ToFloat(),  # presumably rescales uint8 pixels to floats in [0, 1] — confirm against albumentations docs
    APT.transforms.ToTensorV2(),  # presumably converts the HWC array to a CHW torch tensor — confirm
])

In [None]:
def pad_image(img):
    """Pad the shorter spatial side of ``img`` with black borders.

    The first axis of ``img`` is treated as rows and the second as columns.
    When there are more rows than columns the border is added on the left and
    right; otherwise it is added on the top and bottom. The border width is
    the truncated half of the size difference, so an odd difference leaves
    the result one pixel short of square.

    Args:
        img: Array with exactly three axes (rows, columns, channels).

    Returns:
        The padded image as returned by ``cv2.copyMakeBorder``.
    """
    n_rows, n_cols, _ = np.shape(img)

    if n_rows > n_cols:
        border = int((n_rows - n_cols) / 2)
        top = bottom = 0
        left = right = border
    else:
        border = int((n_cols - n_rows) / 2)
        top = bottom = border
        left = right = 0

    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)


def open_and_preprocess_image(image_path):
    """Load an image from disk and bring it to the model input size.

    The image is read with OpenCV, converted from BGR to RGB, padded with
    ``pad_image`` and resized to ``IMG_SIZE`` x ``IMG_SIZE``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The resized RGB image array.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = pad_image(img)
    return cv2.resize(img, (IMG_SIZE, IMG_SIZE))

In [None]:
class HotelImageDataset:
    """Map-style dataset yielding preprocessed images without targets.

    ``data`` is indexed with ``.iloc`` and each record must expose an
    ``"image_id"`` entry, which is appended to ``data_folder`` to form the
    image path.
    """

    def __init__(self, data, transform=None, data_folder="train_images/"):
        self.data, self.data_folder, self.transform = data, data_folder, transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ...}`` for the record at position ``idx``."""
        image_id = self.data.iloc[idx]["image_id"]
        loaded = open_and_preprocess_image(self.data_folder + image_id)
        image = np.array(loaded).astype(np.uint8)

        # The transform is called albumentations-style: keyword in, dict out.
        if self.transform:
            image = self.transform(image=image)["image"]

        return {"image": image}

# Model

In [None]:
class EmbeddingModel(nn.Module):
    """timm backbone followed by an embedding layer and a classifier layer.

    ``forward`` returns only the embedding; ``embed_and_classify`` returns
    the embedding together with the class logits computed from it.
    """

    def __init__(self, n_classes=100, embedding_size=64, backbone_name="efficientnet_b1"):
        super().__init__()

        self.backbone = timm.create_model(backbone_name, num_classes=n_classes, pretrained=False)

        # For inception_v3 the embedding layer is sized from the head's output
        # width, for every other backbone from its input width.
        # NOTE(review): presumably because assigning `backbone.classifier`
        # below does not replace inception_v3's head in timm — confirm.
        head = self.backbone.get_classifier()
        in_features = head.out_features if backbone_name == "inception_v3" else head.in_features

        self.backbone.classifier = nn.Identity()
        self.embedding = nn.Linear(in_features, embedding_size)
        self.classifier = nn.Linear(embedding_size, n_classes)

    def embed_and_classify(self, x):
        """Return ``(embedding, logits)`` for the input batch ``x``."""
        embedding = self.forward(x)
        logits = self.classifier(embedding)
        return embedding, logits

    def forward(self, x):
        """Return the embedding of the input batch ``x``."""
        features = self.backbone(x)
        # Flatten everything except the batch axis before the linear layer.
        flat = features.view(features.size(0), -1)
        return self.embedding(flat)

# Model helper functions

In [None]:
def get_distances(test_embeds, base_embeds, model_array, image_index):
    """Sum the per-model cosine similarities of one query image.

    Args:
        test_embeds: Per-model sequences of query embeddings, indexed as
            ``test_embeds[model][image]``.
        base_embeds: Per-model tables whose ``"embeddings"`` column holds the
            reference embeddings.
        model_array: Models of the ensemble; only its length is used, to
            enumerate the model positions.
        image_index: Position of the query image within ``test_embeds``.

    Returns:
        The element-wise sum over models of the cosine similarity between the
        query and every reference embedding (higher means more similar), or
        ``None`` when ``model_array`` is empty.
    """
    total = None
    for model_idx, _ in enumerate(model_array):
        query = test_embeds[model_idx][image_index]
        reference = list(base_embeds[model_idx]["embeddings"].values)

        # cosine_similarity expects 2-D input; take the single query row back out.
        similarities = cosine_similarity([query], reference)[0]

        total = similarities if total is None else total + similarities

    return total

def generate_embeddings(args, loader, model, bar_desc="Generating embeds"):
    """Run ``model`` over ``loader`` and collect its outputs.

    Args:
        args: Object providing the target ``device``.
        loader: Iterable of batches, each a mapping with an ``'image'`` tensor.
        model: Module called on each image batch; switched to eval mode.
        bar_desc: Description shown on the progress bar.

    Returns:
        A flat list with one NumPy array per input sample.
    """
    collected = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc=bar_desc):
            batch_images = batch['image'].to(args.device)
            batch_embeds = model(batch_images).detach().cpu().numpy()
            # extend() iterates the first axis, adding one row per sample.
            collected.extend(batch_embeds)

    return collected

In [None]:
def find_matches(distances, base_targets, k=N_MATCHES):
    """Return the ``k`` best-matching distinct hotel ids for one query.

    Args:
        distances: Similarity score of the query to every reference
            embedding; higher means more similar (rows are sorted in
            descending order).
        base_targets: Hotel id of every reference embedding, aligned with
            ``distances``.
        k: Maximum number of distinct hotel ids to return.

    Returns:
        Array of at most ``k`` distinct hotel ids, best match first.
    """
    distance_df = pd.DataFrame(index=np.arange(len(base_targets)), data={"hotel_id": base_targets})
    distance_df["distance"] = distances

    # Highest score first; ties are broken by hotel_id (also descending).
    distance_df = distance_df.sort_values(by=["distance", "hotel_id"], ascending=False).reset_index(drop=True)

    # unique() keeps first-occurrence order, so the ranking is preserved.
    # Honour the caller-supplied k rather than the module-level constant.
    return distance_df["hotel_id"].unique()[:k]

def predict(args, base_embeddings_list, test_loader, model_list, bar_desc="Generating embeds"):
    """Predict hotel ids for every image served by ``test_loader``.

    Args:
        args: Object providing the target ``device``.
        base_embeddings_list: Per-model reference tables with ``"embeddings"``
            and ``"hotel_id"`` columns; the hotel ids of the first table are
            used as targets.
        test_loader: Loader yielding the query image batches.
        model_list: Models of the ensemble, aligned with
            ``base_embeddings_list``.
        bar_desc: Currently unused; kept for interface compatibility.

    Returns:
        A list with one array of predicted hotel ids per query image.
    """
    # For every model, generate embeddings for every image.
    test_embeds = [
        generate_embeddings(args, test_loader, model, "Generate test embeddings")
        for model in model_list
    ]

    base_targets = base_embeddings_list[0]["hotel_id"].values
    n_images = len(test_embeds[0]) if test_embeds else 0

    preds = []
    # For every image, combine the similarities of all models and rank hotels.
    for image_index in range(n_images):
        # This call must stay active: without it `distances` is undefined
        # here (or silently picks up a stale notebook global).
        distances = get_distances(test_embeds, base_embeddings_list, model_list, image_index)
        preds.append(find_matches(distances, base_targets))

    return preds

# Prepare data

In [None]:
# One row per file in the test folder, sorted by file name; hotel_id starts
# empty and is filled in by the submission cell.
test_df = pd.DataFrame(data={"image_id": os.listdir(TEST_DATA_FOLDER), "hotel_id": ""}).sort_values(by="image_id")

# Prepare model

In [None]:
def get_model(backbone_name, checkpoint_path, args):
    """Build an ``EmbeddingModel`` and load its weights from a checkpoint.

    Args:
        backbone_name: Backbone name passed to ``EmbeddingModel``.
        checkpoint_path: Path of a checkpoint whose ``"model"`` entry holds
            the state dict.
        args: Object providing ``n_classes``, ``embedding_size`` and
            ``device``.

    Returns:
        The model with loaded weights, moved to ``args.device``.
    """
    model = EmbeddingModel(args.n_classes, args.embedding_size, backbone_name)

    # Remap tensors to the active device so a checkpoint saved on a GPU can
    # also be loaded on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location=args.device)
    model.load_state_dict(checkpoint["model"])
    model = model.to(args.device)

    return model

In [None]:
# Inference settings, used as a plain namespace (attributes are read directly
# from the class). `n_classes` is not set here; it is assigned further down
# once the base embeddings are loaded.
class args:
    batch_size = 64
    num_workers = 2
    embedding_size = 128
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    
seed_everything(seed=SEED)

# shuffle=False keeps loader order equal to the row order of test_df.
test_dataset = HotelImageDataset(test_df, base_transform, data_folder=TEST_DATA_FOLDER)
test_loader  = DataLoader(test_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

In [None]:
# Reference embedding tables, one per model, in the same order as model_list
# below (efficientnet_b2, densenet169, inception_v3).
# NOTE(review): pd.read_pickle executes pickle deserialization; only load
# files from a trusted source.
base_embeddings_list = [
    pd.read_pickle('../input/hotelidstarter/efficientnet_b2-256x256-embeddings.pkl'),
    pd.read_pickle('../input/hotelidstarter/densenet169-256x256-embeddings.pkl'),
    pd.read_pickle('../input/hotelidstarter/inception_v3-256x256_image-embeddings.pkl')
    ]
# display(base_embeddings_df.head())

In [None]:
# The classifier size is taken from the number of distinct hotels in the
# first base embedding table.
args.n_classes = base_embeddings_list[0]["hotel_id"].nunique()

# Load the three ensemble members; the order must match base_embeddings_list
# because both lists are indexed by the same model position.
model1 = get_model("efficientnet_b2",
                  "../input/hotelidstarter/efficientnet_b2-256x256-checkpoint-epoch_10-acc_0.2509.pt", 
                  args)
model2 = get_model("densenet169",
                  "../input/hotelidstarter/densenet169-256x256-checkpoint-epoch_12-acc_0.2171.pt", 
                  args)
model3 = get_model("inception_v3",
                  "../input/hotelidstarter/inception_v3-256x256-checkpoint-epoch_13-acc_0.1955.pt", 
                  args)
model_list = [model1, model2, model3]

In [None]:
# Preview the first rows of the first model's base embedding table.
base_embeddings_list[0].head()

# Train Ensemble

In [None]:
class HotelTrainDataset:
    """Map-style dataset yielding images together with their encoded target.

    ``data`` is indexed with ``.iloc``; each record must expose ``"image_id"``
    (appended to ``data_path`` to form the file path) and ``'hotel_id_code'``.
    Images are opened with PIL as they are, without padding or resizing.
    """

    def __init__(self, data, transform=None, data_path="train_images/"):
        self.data, self.data_path, self.transform = data, data_path, transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "target": ...}`` for position ``idx``."""
        row = self.data.iloc[idx]
        # Files are looked up directly under data_path, not in per-hotel
        # sub-folders.
        image_path = self.data_path + row["image_id"]
        image = np.array(pil_image.open(image_path)).astype(np.uint8)

        if self.transform:
            image = self.transform(image=image)["image"]

        return {"image": image, "target": row['hotel_id_code']}

class MetaModel(nn.Module):
    """Single linear layer mapping ``n_models`` inputs to ``n_classes`` outputs."""

    def __init__(self, n_models=3, n_classes=100):
        super().__init__()
        self.linear = nn.Linear(n_models, n_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
def train_epoch(args, model, embeddings, criterion, optimizer, scheduler, epoch, num_batch_steps=1):
    """Train ``model`` for one epoch with gradient accumulation.

    Args:
        args: Object providing ``device`` and ``epochs``.
        model: Module to train.
        embeddings: Iterable of batches, each a mapping with ``'image'`` and
            ``'target'`` tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped every ``num_batch_steps`` batches.
        scheduler: Optional scheduler stepped together with the optimizer.
        epoch: Current epoch number (used for the progress description).
        num_batch_steps: Number of batches whose gradients are accumulated
            before one optimizer step.

    Returns:
        Tuple ``(mean_loss, score)``: the mean accumulated loss per optimizer
        step (NaN if no step was taken) and the running accuracy over the
        batches recorded at optimizer steps (0.0 if no step was taken).
    """
    losses = []
    targets_all = []
    outputs_all = []
    # Defined up front so the return value exists even if no step is taken.
    score = 0.0

    model.train()
    loss_sum = 0.0
    # Progress bar wrapper; the loop body updates its description.
    t = tqdm(embeddings, desc=f"Training epoch {epoch}/{args.epochs}")
    for i, sample in enumerate(t):
        images = sample['image'].to(args.device)
        targets = sample['target'].to(args.device)

        outputs = model(images)

        # Scale so the accumulated gradient matches one averaged batch.
        loss = criterion(outputs, targets) / num_batch_steps
        loss.backward()
        # Accumulate a plain float so the autograd graph is not kept alive.
        loss_sum += loss.item()

        if (i + 1) % num_batch_steps == 0:
            losses.append(loss_sum)

            optimizer.step()
            optimizer.zero_grad()

            if scheduler:
                scheduler.step()

            # Only the batch that triggers the optimizer step is recorded
            # for the accuracy estimate.
            targets_all.extend(targets.cpu().numpy())
            outputs_all.extend(torch.sigmoid(outputs).detach().cpu().numpy())

            score = np.mean(np.asarray(targets_all) == np.argmax(outputs_all, axis=1))
            desc = f"Training epoch {epoch}/{args.epochs} - loss:{loss_sum:0.4f}, accuracy: {score:0.4f}"
            t.set_description(desc)

            loss_sum = 0.0

    mean_loss = np.mean(losses) if losses else float("nan")
    return mean_loss, score
    
def train_meta_learner(args, data_df):
    """Train a ``MetaModel`` on the module-level ``valid_loader``.

    Args:
        args: Object providing ``n_classes``, ``device``, ``lr`` and
            ``epochs``; ``num_batch_steps`` is optional and defaults to 1.
        data_df: Currently unused; kept for interface compatibility.

    Returns:
        The trained ``MetaModel``.
    """
    model = MetaModel(3, args.n_classes)
    model = model.to(args.device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

    # Not every args namespace defines num_batch_steps (meta_args does not).
    num_batch_steps = getattr(args, "num_batch_steps", 1)
    # The scheduler is stepped once per optimizer step, i.e. once every
    # num_batch_steps batches, so size the schedule accordingly.
    steps_per_epoch = max(1, len(valid_loader) // num_batch_steps)

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
                    optimizer,
                    max_lr=args.lr,
                    epochs=args.epochs,
                    steps_per_epoch=steps_per_epoch,
                    div_factor=10,
                    final_div_factor=1,
                    pct_start=0.1,
                    anneal_strategy="cos",
                )

    start_epoch = 1

    for epoch in range(start_epoch, args.epochs + 1):
        train_epoch(args, model, valid_loader, criterion, optimizer, scheduler, epoch, num_batch_steps)

    return model

In [None]:
CSV_FILE = "../input/hotelid-2022-train-images-256x256/train.csv"

data_df = pd.read_csv(CSV_FILE)
# encode hotel ids
data_df["hotel_id_code"] = data_df["hotel_id"].astype('category').cat.codes.values.astype(np.int64)

# NOTE(review): this redefines the `args` namespace declared earlier for
# inference; every later use of `args` (including the submission cell) sees
# these values instead.
class args:
    epochs = 20
    lr = 1e-3
    batch_size = 32
    num_batch_steps = 2
    num_workers = 2
    val_samples = 1
    embedding_size = 128
    backbone_name = "densenet169"
    n_classes = data_df["hotel_id_code"].nunique()
    device = ('cuda' if torch.cuda.is_available() else 'cpu')

# Settings for the meta learner. Unlike `args`, this namespace does not
# define num_batch_steps.
class meta_args:
    epochs = 5
    lr = 1e-3
    n_classes = data_df["hotel_id_code"].nunique()
    device = ('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): duplicates args.val_samples; the filter below uses this
# variable while the sampling call uses args.val_samples.
val_samples = 1

hotel_image_count = data_df.groupby("hotel_id")["image_id"].count()
# hotels that have more images than samples for validation
valid_hotels = hotel_image_count[hotel_image_count > val_samples]
# data that can be split into train and val set
valid_data = data_df[data_df["hotel_id"].isin(valid_hotels.index)]
# Hotels with too few images were filtered out above, so they contribute no
# validation rows; sample val_samples images from each remaining hotel.
valid_df = valid_data.groupby("hotel_id").sample(args.val_samples, random_state=SEED)

# shuffle=False keeps loader order equal to the row order of valid_df.
valid_dataset = HotelTrainDataset(valid_df, base_transform, data_path=TRAIN_DATA_FOLDER)
valid_loader  = DataLoader(valid_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

In [None]:
def get_val_distance(val_embeds, base_embeds):
    """Cosine similarity of the first query embedding to all base embeddings.

    Args:
        val_embeds: 2-D collection of query embeddings; only the similarities
            of the first row are returned.
        base_embeds: Table whose ``"embeddings"`` column holds the reference
            embeddings.

    Returns:
        1-D array with one similarity value per reference embedding.
    """
    reference = list(base_embeds["embeddings"].values)
    similarity_matrix = cosine_similarity(val_embeds, reference)
    return similarity_matrix[0]

def find_val_matches(distances, base_targets, k=N_MATCHES):
    """Return the ``k`` highest distinct similarity values for one query.

    Args:
        distances: Similarity score of the query to every reference
            embedding; higher means more similar (rows are sorted in
            descending order).
        base_targets: Hotel id of every reference embedding, aligned with
            ``distances``; used only as the tie-breaking sort key.
        k: Maximum number of distinct values to return.

    Returns:
        Array of at most ``k`` distinct similarity values, highest first.
    """
    distance_df = pd.DataFrame(index=np.arange(len(base_targets)), data={"hotel_id": base_targets})
    distance_df["distance"] = distances

    # Highest score first; ties are broken by hotel_id (also descending).
    distance_df = distance_df.sort_values(by=["distance", "hotel_id"], ascending=False).reset_index(drop=True)

    # NOTE(review): this returns the "distance" column, not hotel ids as in
    # find_matches — confirm this is what the meta learner needs.
    # Honour the caller-supplied k rather than the module-level constant.
    return distance_df["distance"].unique()[:k]

In [None]:
# Embed every validation image once per ensemble member; the result is
# indexed as val_embeds[model][image].
val_embeds = []
for model in model_list:
    model_embeds = generate_embeddings(args, valid_loader, model, "Generate val embeddings")
    val_embeds.append(model_embeds)

In [None]:
# print(len(base_embeddings_list))
# print(len(base_embeddings_list[0]))
# print(valid_df)

# Print the length of one stored embedding of the first model as a sanity check.
image_embeddings = base_embeddings_list[0]["embeddings"].values
print(len(image_embeddings[0]))

In [None]:
image_distances = []
# For every image, calculate the distances for every model-embedding pair
for image_index in range(len(val_embeds[0])):
    distances = [
        get_val_distance([val_embeds[model_index][image_index]], base_embeddings_list[model_index])
        for model_index in range(len(model_list))
    ]
    image_distances.append(distances)

# image_distances is a nested Python list (images x models x base embeddings)
# and has no `.shape`; derive the shape from lengths instead of copying
# everything into one large array.
n_images = len(image_distances)
n_models = len(image_distances[0]) if n_images else 0
n_base = len(image_distances[0][0]) if n_models else 0
print(f"image distances shape: {(n_images, n_models, n_base)}")

# We expect image_distances shape (3700,3,40000) (3700,3) [40000]
# Output of trainer should be (40000)

# Combine the 3 distances -> NN
#train_meta_learner(meta_args, data_df)

# 
# preds_for_image = find_val_matches(distances, base_embeddings_list[0]["hotel_id"].values)
# preds.extend([preds_for_image])

# Submission

In [None]:
%%time

preds = predict(args, base_embeddings_list, test_loader, model_list)
# Join each image's predicted hotel ids into one space-separated string.
# str() is applied per id because formatting the whole list relies on element
# reprs, which break for NumPy 2 scalars ("np.int64(5)") and for string ids
# (quoted).
test_df["hotel_id"] = [" ".join(str(hotel_id) for hotel_id in image_preds) for image_preds in preds]

test_df.to_csv("submission.csv", index=False)
test_df.head()