# Implementation of [HUSE: Hierarchical Universal Semantic Embeddings](https://arxiv.org/pdf/1911.05978.pdf) in PyTorch
<hr></hr>
<img src="https://raw.githubusercontent.com/programmer290399/resources/master/HUSE.png">
<hr></hr>



## <ins>Introduction</ins>:
* We, human beings use multiple senses to understand and make sense of the world around us. We use sight, hearing, smell, taste and touch to interact and understand things.

* Taking input data in different forms (i.e. from multiple modalities) helps us understand and act better, because data from different modalities share latent correlations.

* To help computers understand we can use a similar approach called multi-modal learning.


* The paper "HUSE: Hierarchical Universal Semantic Embeddings" proposes a multi-modal deep-learning method which enables multimodal embeddings to share a common latent space.

* HUSE projects images and text into a shared latent space by using a shared classification layer for image and text modalities.

* HUSE incorporates semantic information by making the distance between any two universal embeddings to be similar to that of the distance between their class label embeddings in the semantic embedding space.

* Where this method stands out is that it not only allows the embeddings corresponding to a semantic class to lie closer to each other than the embeddings corresponding to two different classes, but also makes the learned universal embedding space semantically meaningful by clustering related classes closer than unrelated ones.

* A very good example of the above property can be found in Section 1 Para 3 of the paper.




In [0]:
! unzip /content/drive/My\ Drive/GreenDeck\ ML\ Assignment/images.zip
! pip install num2words nltk pytorch-pretrained-bert wget
! python -c "import nltk ; nltk.download('all')"
! pip install -U sentence-transformers

## Imports 
<hr></hr>
Here we import all the required packages

In [0]:
import wget
import torch
import pickle
import requests
import numpy as np
import progressbar
import pandas as pd
from torch import nn
from PIL import Image
from string import punctuation
from collections import Counter
from torch.optim import RMSprop
from num2words import num2words
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from torchvision import models, transforms
from itertools import product, combinations
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

## Config Class 
<hr></hr>
This class is used to set all the hyper parameters and other variable values to run the notebook.




In [0]:
class config():
  """
  Container for every hyperparameter, model handle and path used by the
  notebook.

  All values are set as instance attributes in ``__init__``. Note that
  constructing an instance calls ``models.vgg19(pretrained=True)`` and
  ``from_pretrained('bert-base-uncased')``, i.e. it requests pretrained
  checkpoints (presumably downloading them on first use -- TODO confirm).
  """

  def __init__(self):

    # For ImageEmbeddingGen:
    # =====================
    # Backbone used to generate the image embeddings fed to the image tower.
    # A list of available models can be found at:
    # https://pytorch.org/docs/stable/torchvision/models.html
    # NOTE : Please update the output dimensions if you change the model.
    self.ImageEmbdModel = models.vgg19(pretrained=True)
    self.ImageEmbd_out = (1,1000)


    # For TextEmbeddingGen:
    # ====================
    # Tokenizer and model used to generate the text embeddings fed to the
    # text tower. A list of available models can be found at:
    # https://pypi.org/project/pytorch-pretrained-bert/#doc
    # NOTE : Please update the output dimensions if you change the model.
    self.Tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
    self.TxtEmbdModel = BertModel.from_pretrained('bert-base-uncased')
    self.TxtEmbd_out = (1,768)


    # For TextTower:
    # =============
    # Layer sizes/dropout follow section 4.1.3 of the paper (per the original
    # author). The magic number 11127 is the dimension of the tf-idf vector
    # that is concatenated with the BERT output; it must be changed if the
    # tf-idf vocabulary changes.
    self.TxtTower_input = self.TxtEmbd_out[1] + 11127
    self.TxtTower_hidden = 512
    self.TxtTower_dropout = 0.15
    self.TxtTower_output = 512


    # For ImageTower:
    # ==============
    # Layer sizes/dropout follow section 4.1.3 of the paper (per the original
    # author). The input width is the backbone's output width.
    self.ImgTower_input = self.ImageEmbd_out[1]
    self.ImgTower_hidden = 512
    self.ImgTower_dropout = 0.15
    self.ImgTower_output = 512


    # Data Paths:
    # ===========
    # Paths of the training-data csv and of the image folder.
    self.csv_path = '/content/drive/My Drive/GreenDeck ML Assignment/training_data.csv'
    self.images_folder = '/content/content/netaporter_gb_images/'
    self.total_classes = 271 # Number of classes in the provided GreenDeck dataset.


    # For HUSEClassifier:
    # ===================
    # Per the original author these follow the last paragraph of section 3.2.
    # The classifier input is the sum of both tower output widths, i.e. it is
    # sized for the concatenation of the image and text universal embeddings.
    self.Huse_input = self.ImgTower_output + self.TxtTower_output
    self.Huse_output = self.total_classes


    # Hyperparameters:
    # ================
    # Set as described in section 4.1.3 (per the original author).
    self.lr = 1.6192e-05
    self.num_epoch = 15
    self.batch_size = 32
    self.momentum = 0.9
    self.α = 1/3  # Classification loss coeff. (all three weights kept equal for now)
    self.β = 1/3  # Semantic similarity loss coeff.
    self.γ = 1/3  # Cross modal loss coeff.
    self.ζ = 0.9  # Relaxation coeff. (margin used by the semantic loss)

    # Training Device:
    # ================
    # First CUDA device when available, otherwise the CPU.
    self.device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Download Semantic graph
    # =======================
    # Set to True to download and load a precomputed semantic graph from the
    # author's github repo; set to False to compute it locally instead.
    self.download_semantic_graph = True
# Create the single config object that is passed to all other classes.
# NOTE: this rebinds the name `config` from the class to its instance, so the
# class itself can no longer be referenced by name after this line.
config = config()

A few utility functions are defined below.

In [0]:
def save_obj(obj, name):
  """
  Serialise a python object to disk with pickle so it can be reloaded later.

  Parameters
  ----------
  obj : Any picklable python object, Required
    For example a list, a dict, etc.

  name : str, Required
    Target file name without extension; ".pkl" is appended to it.
  """

  pickle_path = name + '.pkl'
  with open(pickle_path, 'wb+') as handle:
    # Always use the newest protocol supported by the running interpreter.
    pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name):
  """
  Restore a python object previously written to a ".pkl" file.

  Parameters
  ----------
  name : str, Required
    File name of the pickle WITHOUT the trailing ".pkl".

  Returns
  -------
  Python object
    Whatever object was stored in the file, returned unchanged.
  """

  pickle_path = name + '.pkl'
  with open(pickle_path, 'rb') as handle:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    restored = pickle.Unpickler(handle).load()
  return restored

def get_semantic_graph(classes_list, Download=config.download_semantic_graph):
  """
  Return the semantic graph and its class-combination index map, either by
  loading precomputed files or by computing them with ``SemanticGraph``.

  Parameters
  ----------
  classes_list : iterable(list/numpy array), Required
    This should contain all the N classes. Only used when ``Download`` is
    falsy.
  Download : bool, optional
    If true, the precomputed semantic graph and semantic graph map are
    fetched from the author's github repo (only when they are not already
    present in the working directory) and loaded from disk. The default is
    read from the module-level ``config`` when the function is defined.

  Returns
  -------
  semantic_graph : torch.tensor[N,N]
    Adjacency matrix; the element on row i and column j holds the value
    stored for the i th and j th classes (``SemanticGraph`` fills it with the
    cosine similarity of the class-name embeddings).
  classes_combs_map : dict
    Maps "classA|classB" keys to the (row, column) indices of the
    corresponding entry in ``semantic_graph``.
  """

  if not Download:
    semantic_graph, class_comb_map = SemanticGraph(classes_list)
    return semantic_graph, class_comb_map

  import os

  # (local file name, url) of the two precomputed artifacts.
  artifacts = [
      ('Semantic_graph.pth',
       'https://github.com/programmer290399/resources/raw/master/Semantic_graph.pth'),
      ('semantic_graph_map_new.pkl',
       'https://github.com/programmer290399/resources/raw/master/semantic_graph_map_new.pkl'),
  ]

  for filename, url in artifacts:
    # Skip files that are already on disk: re-downloading does not replace
    # them (the wget package saves duplicates under a new " (1)"-style name),
    # so a repeated download only wastes time and disk space.
    if not os.path.exists(filename):
      wget.download(url)

  # SECURITY NOTE: both loaders below unpickle data fetched from the network,
  # which can execute arbitrary code. Only use this path with a trusted source.
  semantic_graph = torch.load('Semantic_graph.pth',
                              map_location=torch.device(config.device))
  class_comb_map = load_obj('semantic_graph_map_new')

  return semantic_graph, class_comb_map


## PART1: CREATING TEXT AND IMAGE EMBEDDINGS INPUTS:
<hr></hr>



### <ins>Image Embeddings Input Model</ins>:

<img src="https://github.com/programmer290399/resources/raw/master/image_tensor2Univ_embdpng.png"/>
<p>Fig 1: Process of creating universal embeddings from Image tensor</p>
<hr>

* To generate universal embedding for image HUSE uses a Backbone Image Network to convert Image Tensors to an intermediate embedding 

* This intermediate embedding is nothing but the output of the backbone network which is fed to Image Tower which generates the Universal Embeddings

* The class ImageEmbeddingGen (given below) accomplishes the work of the backbone image network here as depicted by second block in the image above(Fig 1).

* The model I've used here gives output embeddings of shape 1X1000 for an input image tensor


In [0]:
class ImageEmbeddingGen():
  """
  Thin wrapper around a frozen backbone network that turns image tensors
  into the embeddings consumed by the ImageTower.

  ...

  Attributes
  ----------
  model : torch.nn.Module
    The backbone taken from the config; frozen, in eval mode and moved to
    the configured device.

  Methods
  -------
  get_ImgEmbd(image)
    Runs the backbone on the passed image tensor and returns its output.
  """

  def __init__(self,config):
    """
    Parameters
    ----------
    config : config class object
      Supplies the backbone (``ImageEmbdModel``) and the target ``device``.
    """

    backbone = config.ImageEmbdModel

    # The backbone is used purely as a feature extractor: no gradients,
    # inference behaviour for dropout/batch-norm layers.
    for weight in backbone.parameters():
      weight.requires_grad_(False)
    backbone.eval()
    backbone.to(config.device)

    self.model = backbone

  def get_ImgEmbd(self,image):
    """
    Parameters
    ----------
    image : torch.tensor, Required
      The normalised image tensor to feed to the backbone.

    Returns
    -------
    torch.tensor
      The raw output of the backbone for ``image`` ([1,1000] for the
      backbone configured by default, per the config class).
    """

    return self.model(image)

### <ins>Text Embeddings Input Model</ins>:

<img src="https://github.com/programmer290399/resources/raw/master/text2Univ_embdpng.png"/>
<p>Fig 2: Process of creating universal embeddings from Text</p>
<hr>

* To generate universal embedding for text HUSE uses a Backbone text Network to convert Text to an intermediate embedding. 

* This intermediate embedding is nothing but the output of the backbone network which is fed to Text Tower which generates the Universal Embeddings.

* The class TextEmbeddingGen (given below) accomplishes the work of the backbone text network here as depicted by second block in the image above.(Fig 2)

* We have used BERT as the backbone text network and have mean pooled the output of its last 4 layers.

* The input actually fed to the Text Tower also contains the tf-idf vector of the text which is concatenated with the output of the backbone text network and then fed to it.

* This network outputs embedding of shape 1X768 for a text input.


In [0]:
class TextEmbeddingGen():
  """
  A class to get Text embeddings from the desired model
  for feeding to the TextTower

  ...

  Attributes
  ----------
  tokenizer : obj
    The bert tokenizer which tokenizes input text for further processing
  model : torch.nn.Module
    The bert model for generating text embeddings (frozen, in eval mode)
  device : torch.device
    The device the model lives on; input tensors are created on it

  Methods
  -------
  get_TxtEmbd_bert(text)
    Returns the text embedding generated by mean pooling the output of the
    last 4 layers of the bert model
  """

  def __init__(self,config):
    """
    Parameters
    ----------
    config : config class object
      Supplies the tokenizer, the model and the device to run on
    """

    self.tokenizer = config.Tokenizer_bert
    self.model = config.TxtEmbdModel
    # Remember the device of THIS instance so that embedding generation does
    # not depend on a module-level config object.
    self.device = config.device
    # Freezing model parameters and putting it in eval mode
    for param in self.model.parameters():
      param.requires_grad_(False)
    self.model.eval()
    self.model.to(self.device)

  def get_TxtEmbd_bert(self,text):
    """
    Parameters
    ----------
    text : str, Required
      The input text for which we need to generate embeddings

    Returns
    -------
    mean_pooled_last_4_tokens : torch.tensor[1,hidden]
      The output of the last 4 layers, averaged over the layers and then
      over the tokens (hidden is 768 for 'bert-base-uncased').
    """

    # Tokenizing the input text and generating token and segment IDs
    tokenized_text = self.tokenizer.tokenize("[CLS] " + text + " [SEP]")
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    # Batch of size one, created directly on the model's device
    tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)
    segments_tensors = torch.tensor([segments_ids]).to(self.device)

    # Getting the output from the model
    with torch.no_grad():
      encoded_layers, _ = self.model(tokens_tensor, segments_tensors)

    # assumes every encoded layer is (1, tokens, hidden) -- TODO confirm
    # against the pytorch-pretrained-bert version in use.
    last_4_layer_out = torch.stack(encoded_layers[-4:])

    # Average over the 4 layers, drop the batch dimension explicitly (a bare
    # squeeze() would also drop any other size-1 dimension), then average
    # over the tokens.
    meanpooled_last_4 = torch.mean(last_4_layer_out, dim=0).squeeze(0)
    mean_pooled_last_4_tokens = torch.mean(meanpooled_last_4, dim=0)

    # reshape(1,-1) instead of a hard-coded 768 so other model widths work.
    return mean_pooled_last_4_tokens.reshape(1, -1)


## PART2: MODEL IMPLEMENTATION FOR CREATING FINAL EMBEDDINGS:
<hr></hr>



### <ins>Text Tower Model</ins>:

* This model takes text embedding generated by the backbone text network with the tf-idf vector of the same text to generate universal embeddings for it.

* This is depicted by third block in Fig 2.

* This class, TextTower generates universal embeddings of shape 1X512, though this can be changed in the config class definition.(as quoted in section 3.2 para 2 of the paper)(given below)

  >"the backbone text network
followed by text tower corresponds to an text projection
function $\phi_T(\cdot) \in \mathbb{R}^D$ that projects text to D-dimensional
universal embedding space."


* It is structured exactly as described in section 4.1.3 of the paper.

In [0]:
class TextTower(nn.Module):
  """
  Two-layer fully connected network that maps a text embedding to a fixed
  D dimensional, L2-normalised universal embedding.

  ...

  Attributes
  ----------
  input_size : int
    Width of the incoming text embedding
  hidden_size : int
    Width of the hidden layer
  output_size : int
    Width D of the universal embedding
  dropout : torch.nn.Dropout
    Dropout module applied after the first layer
  fc_inp : torch.nn.Linear
    Input layer (input_size -> hidden_size)
  fc_out : torch.nn.Linear
    Output layer (hidden_size -> output_size)

  Methods
  -------
  forward(input_embd)
    Invoked when the module is called on a text embedding; returns the
    universal embedding for it.
  """

  def __init__(self, config):
    """
    Parameters
    ----------
    config : config class object
      Provides TxtTower_input, TxtTower_hidden, TxtTower_output and
      TxtTower_dropout
    """

    super(TextTower, self).__init__()

    in_dim, mid_dim, out_dim = (config.TxtTower_input,
                                config.TxtTower_hidden,
                                config.TxtTower_output)
    self.input_size = in_dim
    self.hidden_size = mid_dim
    self.output_size = out_dim

    self.dropout = nn.Dropout(p=config.TxtTower_dropout)
    self.fc_inp = nn.Linear(in_dim, mid_dim)
    self.fc_out = nn.Linear(mid_dim, out_dim)

  def forward(self, input_embd):
    """
    Parameters
    ----------
    input_embd : torch.tensor[1, input_size], Required
      The concatenation of the bert embedding and the tf-idf vector

    Returns
    -------
    output : torch.tensor[1,output_size]
      The universal embedding of the text.
    """

    # hidden layer: linear -> ReLU -> dropout
    hidden = self.fc_inp(input_embd)
    hidden = F.relu(hidden)
    hidden = self.dropout(hidden)

    # output layer: linear -> ReLU -> L2 normalisation
    projected = F.relu(self.fc_out(hidden))
    return F.normalize(projected)


### <ins>Image Tower Model</ins>:

* This model takes image embedding generated by the backbone image network to generate universal embeddings for it.

* This is depicted by third block in Fig 1.

* This class, ImageTower generates universal embeddings of shape 1X512, though this can be changed in the config class definition.(as quoted in section 3.2 para 1 of the paper)(given below) 

  >"HUSE consists of an image tower that returns universal
  embeddings corresponding to an image."

  >"the backbone image net-
work followed by the image tower corresponds to the image
projection function $\phi_I(\cdot) \in \mathbb{R}^D$ that projects an image to
D-dimensional universal embedding space."

* It is structured exactly as described in section 4.1.3 of the paper.

In [0]:
class ImageTower(nn.Module):
  """
  Five-layer fully connected network that maps an image embedding to a
  fixed D dimensional, L2-normalised universal embedding.

  ...

  Attributes
  ----------
  input_size : int
    Width of the incoming image embedding
  hidden_size : int
    Width of every hidden layer
  output_size : int
    Width D of the universal embedding
  dropout : torch.nn.Dropout
    Dropout module applied after each non-output layer
  fc_inp : torch.nn.Linear
    Input layer (input_size -> hidden_size)
  fc_hidden_1 : torch.nn.Linear
    First hidden layer (hidden_size -> hidden_size)
  fc_hidden_2 : torch.nn.Linear
    Second hidden layer (hidden_size -> hidden_size)
  fc_hidden_3 : torch.nn.Linear
    Third hidden layer (hidden_size -> hidden_size)
  fc_out : torch.nn.Linear
    Output layer (hidden_size -> output_size)

  Methods
  -------
  forward(input_embd)
    Invoked when the module is called on an image embedding; returns the
    universal embedding for it.
  """

  def __init__(self,config):
    """
    Parameters
    ----------
    config : config class object
      Provides ImgTower_input, ImgTower_hidden, ImgTower_output and
      ImgTower_dropout
    """

    super(ImageTower, self).__init__()

    in_dim, mid_dim, out_dim = (config.ImgTower_input,
                                config.ImgTower_hidden,
                                config.ImgTower_output)
    self.input_size = in_dim
    self.hidden_size = mid_dim
    self.output_size = out_dim

    self.dropout = nn.Dropout(p=config.ImgTower_dropout)
    self.fc_inp = nn.Linear(in_dim, mid_dim)
    self.fc_hidden_1 = nn.Linear(mid_dim, mid_dim)
    self.fc_hidden_2 = nn.Linear(mid_dim, mid_dim)
    self.fc_hidden_3 = nn.Linear(mid_dim, mid_dim)
    self.fc_out = nn.Linear(mid_dim, out_dim)

  def forward(self, input_embd):
    """
    Parameters
    ----------
    input_embd : torch.tensor[1, input_size], Required
      The image embedding for which we need the universal embedding.

    Returns
    -------
    output : torch.tensor[1,output_size]
      The universal embedding of the image.
    """

    # Every non-output layer shares the same recipe: linear -> ReLU -> dropout
    hidden = input_embd
    for layer in (self.fc_inp, self.fc_hidden_1,
                  self.fc_hidden_2, self.fc_hidden_3):
      hidden = self.dropout(F.relu(layer(hidden)))

    # output layer: linear -> ReLU -> L2 normalisation
    projected = F.relu(self.fc_out(hidden))
    return F.normalize(projected)

### <ins>Shared Classification Layer</ins>:
<img src="https://github.com/programmer290399/resources/raw/master/classifier_layer.png"/>
Fig 3: Process of classification from universal embeddings of text and image
<hr></hr>

* This layer takes in the universal embeddings and classifies them into a target class 
* This is made as per the specifications in section 3.2 para 4.
* The process is clearly depicted in the image above fig 3.
>NOTE : It was a bit unclear to me how the embeddings should be fed to the classification layer. There are two possible ways: one is the approach I have implemented and shown in the image above; the other is to feed them one by one, take the average of the logits, and then classify.


In [0]:
class HUSEClassifier(nn.Module):
  """
  Single linear classification layer shared by the universal embeddings.

  ...

  Attributes
  ----------
  input_size : int
    Width of the incoming embedding
  output_size : int
    Number of values produced per input (one per class)
  shared_layer : torch.nn.Linear
    The fully connected layer performing the classification

  Methods
  -------
  forward(input_embd)
    Invoked when the module is called on a universal embedding; returns the
    output of the shared layer for it.
  """

  def __init__(self, config):
    """
    Parameters
    ----------
    config : config class object
      Provides Huse_input and Huse_output
    """

    super(HUSEClassifier, self).__init__()

    in_dim, out_dim = config.Huse_input, config.Huse_output
    self.input_size = in_dim
    self.output_size = out_dim
    self.shared_layer = nn.Linear(in_dim, out_dim)

  def forward(self,input_embd):
    """
    Parameters
    ----------
    input_embd : torch.tensor[1, input_size], Required
      The universal embedding to classify.

    Returns
    -------
    output : torch.tensor[1,output_size]
      The raw output of the shared layer (no softmax is applied here).
    """

    return self.shared_layer(input_embd)


## PART3: INCORPORATING THREE  LOSSES INTO THE ARCHITECTURE:
<hr></hr>





### <ins>Class Level Similarity and Classification Loss</ins>:

* This is perhaps the simplest and quite common loss type, here I've used the softmax cross entropy loss as described in 3.4.1 of the paper.
* This loss is implemented using nn.CrossEntropyLoss from pytorch
* This loss tries to cluster the embeddings corresponding to a class together.

### <ins>Semantic Similarity, Semantic Graph, and Semantic Loss</ins>:

#### <ins>Semantic Graph</ins>:
<img src="https://github.com/programmer290399/resources/raw/master/semantic_graph_illustration.jpeg" width="500" height="500"/>
<p>Fig 4: Semantic Graph Visualization</p>
<hr></hr>

* The semantic graph is basically an adjacency matrix, which is visualized in Fig 4 above.
* Say we have five classes, Class A, Class B, Class C, Class D and Class E.
  So in the semantic graph these classes form the vertices of the graph and each vertex is connected to all other vertices. 

* The edges connecting any two classes are weighted with the distance(cosine distance in our case) between the embeddings of the class names.

* Say Class A has an index i in the class list and Class B has index j. Then the semantic graph's value on i,j would be the distance between the embeddings of the class names.

* More formally as stated in the paper section 3.3:
>"we define the semantic graph as $G = (V, E)$, where $V = \{v_1, v_2, \ldots, v_K\}$ represents the set of $K$
classes and $E$ represent the edges between any two classes.
Let $\psi(\cdot)$ represent the function that extracts embeddings of a
class name."

* The element A[i][j] is described as by equation (4) in section 3.3 of the paper.


<img src="https://github.com/programmer290399/resources/raw/master/semantic_compute_illustration.jpeg" width="600" height="400"/>
<p>Fig 5: Semantic Similarity Computation</p>
<hr></hr>

* The illustration above in Fig 5 shows how we compute semantic similarity between our classes.
* We split the class name appropriately into sub-tokens
* Then we generate their respective embeddings using bert 
* Then we take the mean of their embeddings
* Finally we compute the cosine similarity between the two and
* Assign it to the designated position in our semantic graph


#### <ins>Semantic Loss</ins>:
<img src="https://github.com/programmer290399/resources/raw/master/semantic_loss.jpeg" width="800" height="700"/>
<p>Fig 6: Semantic Loss Computation</p>
<hr></hr>

* Fig 6 above shows the process of semantic loss computation, which is stated in the paper as shown below:
><img src="https://github.com/programmer290399/resources/raw/master/semantic_loss_equations.png"/>

* So we start with the universal embeddings we computed using text and image towers with their respective class labels.
* The next step is to pick unique pairs of embeddings and their respective classes.
* Then there are two branches. In the first, the pairs of class labels are sent to the semantic graph map (which is nothing but the dictionary that stores the index values where the already computed class embedding similarity can be found); from it we get the indexes and use them to fetch the similarity value (the cosine similarity between the class embeddings, referred to in the equation as Aij) from the semantic graph.
* The other branch simultaneously computes the cosine similarity between the embedding pairs we formed earlier.
* Then two branches merge and check whether both the embedding pair similarity value and the one fetched from semantic graph are lesser than the margin 
* According to which we decide the value of sigma (which is shown in the equation above by σ) 
* Then we compute the pair wise difference and square it as per the loss equation and multiply it with σ
* Finally we divide it by N^2 to complete the computation. 
* This loss tries to enforce semantic graph regularization , which would try that embeddings corresponding to two semantically similar classes are closer than the embeddings corresponding to two semantically different classes  



### <ins>Cross Modal Gap and Cross Modal Loss</ins>:
<img src="https://github.com/programmer290399/resources/raw/master/cross_modal_loss.jpeg" width="500" height="500"/>
<p>Fig 7: Cross Modal Computation</p>
<hr></hr>

* Cross modal loss computation is pretty simple we just pass the universal embedding of text and image of same instance to cosine similarity function and take the avg as defined by the eqn (12) in the paper.
* This is done because, after all, both embeddings represent the same thing and thus they must be as close as possible.


In [0]:
def SemanticGraph(classes_list,config=config):
  """
  A function to create the semantic graph from the list of all unique
  classes, as described in section 3.3 of the paper.
  NOTE: every class name is encoded exactly once; running on a GPU is still
  recommended because the sentence encoder is a BERT model.

  Parameters
  ----------
  classes_list : iterable(list/numpy array), Required
    This should contain all the N classes. Each class name is a string whose
    hierarchy levels are separated by '<'.
  config : config class object, Optional
    This is used to fetch the device

  Returns
  -------
  semantic_graph : torch.tensor[N,N]
    Symmetric matrix where the element on row i and column j is the cosine
    similarity between the embeddings of the i th and j th class names; the
    diagonal is 1.
  classes_combs_map : dict
    Maps "classA|classB" (one orientation per unordered pair, plus
    "classA|classA") to the (row, column) indices of the corresponding entry
    in the semantic_graph
  """

  # Split every class name into its hierarchy levels and append the class
  # index so it travels with the tokens through `combinations`.
  classes_list_split = [class_name.split('<')+[idx] for idx,class_name in enumerate(classes_list)]

  def reconstruct_keys(class_prod):
    # Rebuild "classA|classB" from a pair of [tokens..., idx] entries.
    return "|".join("<".join(entry[:-1]) for entry in class_prod)

  # Number of classes and of unordered class pairs, C(N,2). Derived from the
  # input so the progress bar is correct for any number of classes.
  num_classes = len(classes_list_split)
  num_pairs = num_classes * (num_classes - 1) // 2

  # Loading bert model for embedding generation and moving it to the device
  # For more info see : https://github.com/UKPLab/sentence-transformers
  model = SentenceTransformer('bert-base-nli-mean-tokens')
  model = model.to(config.device)

  # Encode every class once up front instead of once per pair it occurs in.
  # NOTE(review): encode() receives the list of hierarchy levels and the
  # result is flattened (not averaged) below, so class names with a different
  # number of '<'-separated levels would presumably yield vectors of
  # different length and break cosine_similarity -- TODO confirm intent.
  class_embds = [
      torch.tensor(model.encode(entry[:-1])).to(config.device).reshape(1,-1)
      for entry in classes_list_split
  ]

  # Initializing empty semantic graph and class_combination_map
  semantic_graph = torch.zeros(num_classes,num_classes).to(config.device)
  classes_combs_map = dict()

  # max(..., 1) keeps the bar well-defined when there are fewer than 2 classes
  with progressbar.ProgressBar(max_value=max(num_pairs, 1)) as bar:

    # Iterate over the unique (i < j) combinations only; (j,i) is mirrored.
    for done,class_prod in enumerate(combinations(classes_list_split,r=2)):

      # Getting the class indices for mapping
      i,j = class_prod[0][-1],class_prod[1][-1]

      # Cosine similarity between the two precomputed class embeddings
      cos_sim = F.cosine_similarity(class_embds[i],class_embds[j])

      semantic_graph[i][j] = semantic_graph[j][i] = cos_sim
      classes_combs_map[reconstruct_keys(class_prod)] = (i,j)

      bar.update(done)

  # Similarity of a class with itself is always 1, so it is filled in
  # directly for both the graph and the map.
  semantic_graph.fill_diagonal_(1.0)
  for i,class_name in enumerate(classes_list):
    classes_combs_map[class_name +'|'+ class_name] = (i,i)

  return semantic_graph, classes_combs_map
  
class Loss():
  """
  A class for calculation of various losses as mentioned in the paper

  ...

  Attributes
  ----------
  alpha : float
    The coeff. for balancing classification loss
  beta : float
    The coeff. for balancing semantic loss
  gamma : float
    The coeff. for balancing cross modal loss
  margin : float
    Relaxation margin of the semantic loss: only pairs whose embedding
    similarity AND class similarity are both below the margin contribute.

  Methods
  -------
  ClassificationLoss(Huse_out, labels)
    Returns classification loss
  SemanticLoss(semantic_graph,classes_combs_map,Univ_embd_batch,Batch_labels,margin)
    Returns semantic loss as described in section 3.4.2 of the paper
  CrossModalLoss(Univ_txtEmbd, Univ_imgEmbd)
    Returns cross modal loss as described in section 3.4.3 of the paper
  TotalLoss(ClassificationLoss,SemanticLoss,CrossModalLoss)
    Returns the final loss as given by equation (5) in the paper
  """

  def __init__(self,config):
    """
    Parameters
    ----------
    config : config class object
      Supplies the loss weights (α, β, γ) and the relaxation margin (ζ)
    """

    self.alpha = config.α
    self.beta = config.β
    self.gamma = config.γ
    self.margin = config.ζ

  def ClassificationLoss(self,Huse_out, labels):
    """
    Parameters
    ----------
      Huse_out : torch.tensor[BS,N], Required
        output of the classification layer where BS is batch size and N is
        total number of classes
      labels : torch.tensor[BS], Required
        the corresponding labels from the dataset

    Returns
    -------
      torch.tensor
        The softmax cross entropy loss (mean over the batch)
    """

    criterion = nn.CrossEntropyLoss()

    return criterion(Huse_out, labels)

  def SemanticLoss(self,semantic_graph,classes_combs_map,Univ_embd_batch,
                   Batch_labels,margin=None):
    """
    Parameters
    ----------
      semantic_graph : torch.tensor, Required
        The semantic graph as generated by the SemanticGraph function
      classes_combs_map : dict, Required
        The class combination to index mapping for semantic graph as returned
        by SemanticGraph function
      Univ_embd_batch : torch.tensor, Required
        The batch of universal embeddings created by text or image tower
      Batch_labels : sequence, Required
        The class labels of the batch; the class name of sample m is read as
        ``Batch_labels[m][0]``.
      margin : float, Optional
        Relaxation margin; defaults to the margin taken from the config

    Returns
    -------
      loss : torch.tensor
        Scalar semantic loss: the sum over all unique sample pairs of
        sigma * (embedding similarity - class similarity)^2, divided by BS^2.
        A zero tensor is returned when the batch holds fewer than two samples.

    Raises
    ------
      KeyError
        If a class pair is found in ``classes_combs_map`` in neither order.
    """

    batch_size = Univ_embd_batch.shape[0]
    margin = self.margin if margin is None else margin

    # Unique (m < n) pairs of sample indices
    idx_combinations = list(combinations(range(batch_size),r=2))
    if not idx_combinations:
      return Univ_embd_batch.new_zeros(())

    # Work on whatever device the embeddings live on, rather than relying on
    # a module-level config object.
    device = Univ_embd_batch.device

    # Look up where each pair's class similarity is stored in the graph. The
    # map holds only one orientation per class pair, hence the fallback.
    graph_rows, graph_cols = [], []
    for m,n in idx_combinations:
      label_m, label_n = Batch_labels[m][0], Batch_labels[n][0]
      try:
        j,k = classes_combs_map['|'.join([label_m,label_n])]
      except KeyError:
        j,k = classes_combs_map['|'.join([label_n,label_m])]
      graph_rows.append(j)
      graph_cols.append(k)

    # Cosine similarity of every embedding pair, computed in one call
    left_idx = torch.tensor([m for m,_ in idx_combinations], device=device)
    right_idx = torch.tensor([n for _,n in idx_combinations], device=device)
    cosine_similarity_vals = F.cosine_similarity(Univ_embd_batch[left_idx],
                                                 Univ_embd_batch[right_idx],
                                                 dim=1)

    # Matching class similarities fetched from the semantic graph
    row_idx = torch.tensor(graph_rows, device=semantic_graph.device)
    col_idx = torch.tensor(graph_cols, device=semantic_graph.device)
    semantic_graph_vals = semantic_graph[row_idx, col_idx].to(
        device=device, dtype=cosine_similarity_vals.dtype)

    # sigma = 1 only where both similarities are below the margin.
    # NOTE(review): the paper states this condition on cosine *distances*
    # while similarities are compared here -- TODO confirm this is intended.
    sigma = (cosine_similarity_vals < margin) & (semantic_graph_vals < margin)

    squared_diff = (cosine_similarity_vals - semantic_graph_vals) ** 2
    loss = (squared_diff * sigma.to(squared_diff.dtype)).sum()

    return loss / batch_size**2

  def CrossModalLoss(self, Univ_txtEmbd, Univ_imgEmbd):
    """
    Parameters
    ----------
    Univ_txtEmbd : torch.tensor, Required
      The universal text embedding generated by text tower
    Univ_imgEmbd : torch.tensor, Required
      The universal image embedding generated by image tower for the same
      instances, in the same order

    Returns
    -------
    loss : torch.tensor
      Mean cosine distance (1 - cosine similarity) between the paired
      embeddings. A distance is used so that MINIMISING the loss pulls the
      two embeddings of an instance together; the mean similarity itself
      would push them apart when minimised.
    """

    return torch.mean(1.0 - F.cosine_similarity(Univ_imgEmbd, Univ_txtEmbd))

  def TotalLoss(self,ClassificationLoss,SemanticLoss,CrossModalLoss):
    """
    Parameters
    ----------
    ClassificationLoss : torch.tensor, Required
      output of ClassificationLoss function
    SemanticLoss : torch.tensor, Required
      output of SemanticLoss function
    CrossModalLoss : torch.tensor, Required
      output of CrossModalLoss function

    Returns
    -------
    The weighted sum alpha*ClassificationLoss + beta*SemanticLoss +
    gamma*CrossModalLoss
    """

    return (self.alpha*ClassificationLoss) + (self.beta * SemanticLoss) + (self.gamma * CrossModalLoss)

## PART4: TRAINING 
<hr></hr>
Below you can find a custom dataset class to load the provided dataset and a function to train the networks.

In [0]:
class GreenDeckDataset(Dataset):
  """
  A custom dataset class to load, clean and preprocess the provided dataset.

  Every item is the tuple
  (image, text_cleaned, class_name, tfidf_vector, class_idx) built from one
  row of the CSV found at config.csv_path. The CSV columns read by this class
  are 'image', 'text' and 'classes'.
  """

  def __init__(self,config):
    """
    Parameters
    ----------
    config : config class obj, Required
      Only config.csv_path and config.images_folder are read here
    """
    self.data = pd.read_csv(config.csv_path)
    self.img_root = config.images_folder
    self.class_list = self.data.classes.unique()

    # The TF-IDF model is fitted once, on the raw (uncleaned) text column
    self.tfidf_vectorizer=TfidfVectorizer(use_idf=True)
    self.docs = list(self.data.text)
    self.tfidf_vectors = self.tfidf_vectorizer.fit_transform(self.docs)

  def __getitem__(self, index):
    """
    Returns
    -------
    image : torch.tensor
      Output of _load_image (carries a leading dimension of size 1)
    text_cleaned : str
      The row's text after _preprocess_text
    class_name : 
      Value of the 'classes' column for this row
    tfidf_vector : torch.tensor
      float32 TF-IDF row of this sample (a single row, so presumably of shape
      (1, vocabulary size))
    class_idx : tuple
      Raw result of np.where, i.e. a tuple holding one index array that
      locates class_name inside self.class_list
    """
    # img_root is concatenated as-is, so it has to end with a path separator
    image_path = self.img_root + self.data.loc[index, 'image']
    image = self._load_image(image_path)

    text_raw = self.data.loc[index,'text']
    text_cleaned  = self._preprocess_text(text_raw)
    class_name = self.data.loc[index,'classes']
    class_idx = np.where(self.class_list == class_name)
    tfidf_vector = torch.tensor(self.tfidf_vectors[index].toarray(),
                                dtype=torch.float32)
    
    return image,text_cleaned,class_name,tfidf_vector,class_idx

  def __len__(self):
    # Number of rows in the CSV
    return self.data.shape[0]

  def _load_image(self,img_path, max_size=400, shape=None):
    # Load in and transform an image. The resize target is the longest image
    # side capped at max_size, unless an explicit shape is given.
    
    image = Image.open(img_path).convert('RGB')
    
    # large images will slow down processing
    size = min(max(image.size), max_size)
    
    if shape is not None:
        size = shape
        
    in_transform = transforms.Compose([
                        transforms.Resize(size),
                        transforms.RandomRotation(degrees=15),
                        transforms.RandomHorizontalFlip(),
                        transforms.ToTensor(),
                        transforms.Normalize((0.485, 0.456, 0.406), 
                                             (0.229, 0.224, 0.225))])

    # keep at most the first 3 channels (the image was already converted to
    # RGB above) and add the batch dimension
    image = in_transform(image)[:3,:,:].unsqueeze(0)
    
    return image
  
  def _convert_lower_case(self,data):
    # Convert text to all lower case 

    return np.char.lower(data)

  def _remove_stop_words(self,data):
    # Remove stop words and single character tokens from text. Every kept
    # token is prefixed with one space, so the result starts with a space.

    # A set makes the per-token membership test O(1)
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data))
            if w not in stop_words and len(w) > 1]
    return "".join(" " + w for w in kept)
  
  def _remove_punctuation(self,data):
    # Remove punctuations from text

    for symbol in punctuation:
        data = np.char.replace(data, symbol, ' ')
        # collapse the double space the replacement may have produced
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

  def _remove_apostrophe(self,data):
    # Remove apostrophe from text 

    return np.char.replace(data, "'", "")

  def _stemming(self,data):
    # stemming the given text; every stem is prefixed with one space

    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    return "".join(" " + stemmer.stem(w) for w in tokens)
  
  def _convert_numbers(self,data):
    # convert numbers to words eg. 101 --> one hundred and one

    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: the number is outside what num2words supports.
            # In both cases the token is kept unchanged.
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    # num2words joins some words with hyphens (e.g. forty-one)
    return np.char.replace(new_text, "-", " ")
    
  def _preprocess_text(self,data):
    # Use all other text cleaning methods to clean the text.
    # Several steps are run more than once on purpose because a later step
    # can re-introduce what an earlier one removed.

    data = self._convert_lower_case(data)
    data = self._remove_punctuation(data) #remove comma seperately
    data = self._remove_apostrophe(data)
    data = self._remove_stop_words(data)
    data = self._convert_numbers(data)
    data = self._stemming(data)
    data = self._remove_punctuation(data)
    data = self._convert_numbers(data)
    data = self._stemming(data) #needed again as we need to stem the words
    data = self._remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = self._remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    
    return data  

In [0]:
# Build one dataset instance up front so that its array of unique class names
# can be handed to get_semantic_graph (defined elsewhere in this notebook).
testds = GreenDeckDataset(config)
classes_list = testds.class_list
# The two results are used below as the default values of train()'s
# semantic_graph and classes_combs_map parameters.
semantic_graph, class_comb_map = get_semantic_graph(classes_list) 

In [0]:
def train(config         = config,
          dataset        = GreenDeckDataset(config),
          ImgEmbdGen     = ImageEmbeddingGen(config),
          TxtEmbdGen     = TextEmbeddingGen(config),
          TxtTower       = TextTower(config),
          ImgTower       = ImageTower(config),
          HUSEClassifier = HUSEClassifier(config), 
          Loss           = Loss(config),
          semantic_graph = semantic_graph,
          classes_combs_map = class_comb_map):
  """
  A function to train the HUSE networks parts viz. HUSEClassifier, TextTower &
  ImageTower

  NOTE: the default values above are evaluated once, when this function is
  defined. Calling train() repeatedly without arguments therefore keeps
  training the same model objects; pass fresh instances to start over.

  Parameters
  ----------

  config : config class obj, Optional
    All the hyperparameters and other required info is fetched from this obj 
    (batch_size, device, lr, momentum and num_epoch are read here)
  dataset : torch.utils.data.Dataset, Optional
    The complete function fetches data using this dataset obj. Items must be
    (image, text, class_name, tfidf_vector, class_idx) tuples as produced by
    GreenDeckDataset
  ImgEmbdGen : ImageEmbeddingGen class obj, Optional
    This is used to convert images to input embeddings for image tower
  TxtEmbdGen : TextEmbeddingGen class obj, Optional
    This is used to convert text to input embeddings for text tower
  TxtTower : torch.nn.Module, Optional
    This converts text embeddings to universal embeddings
  ImgTower : torch.nn.Module, Optional
    This converts image embeddings to universal embeddings
  HUSEClassifier : torch.nn.Module, Optional 
    This is the shared classification layer of the model
  Loss : Loss class obj, Optional
    This class incorporates all the 3 losses in it 
  semantic_graph : torch.tensor, Optional 
    The semantic graph as generated by the get_semantic_graph function
  classes_combs_map : dict, Optional 
    The class combination to index mapping for semantic graph as returned by
    get_semantic_graph function

  Returns
  -------
  TxtTower : torch.nn.Module
    The trained text tower 
  ImgTower : torch.nn.Module 
    The trained image tower 
  HUSEClassifier : torch.nn.Module
    The trained shared classification model 
  """
  
  # We pass the dataset to dataloader to break it and load data in batches
  data_loader = DataLoader(dataset,batch_size=config.batch_size,shuffle=True)
  
  # We collect all training parameters from all the three models 
  all_parameters = list(TxtTower.parameters()) + list(ImgTower.parameters()) + list(HUSEClassifier.parameters())
  
  # We load all the models to the device we're using 
  TxtTower.to(config.device)
  ImgTower.to(config.device)
  HUSEClassifier.to(config.device)

  # We pass all the parameters to update and all the other hyperparameters 
  # to RMSprop optimizer as mentioned in section 4.1.3 of the paper
  optimizer  = RMSprop(all_parameters, lr=config.lr, momentum=config.momentum)
  
  for epoch_num in range(config.num_epoch):
    
    print("--" * 5)
    print("Epoch_num:",epoch_num+1)
    
    # Plain Python float: accumulating the loss tensor itself would keep the
    # autograd graph of every batch alive until the end of the epoch
    epoch_loss = 0.0
    
    with progressbar.ProgressBar(max_value=len(data_loader)) as bar:

      for batch_num,batch in enumerate(data_loader):
        
        # We fetch the batch data and move it to the device in use 
        images_batch = batch[0].to(config.device)
        text_batch   = batch[1]
        class_name_labels    = batch[2]
        # Only the per-sample singleton axis (dim 1) is dropped; an
        # unqualified squeeze would also drop the batch axis of a batch that
        # holds a single sample
        tf_idf_vectors_batch = torch.squeeze(batch[3], dim=1).to(config.device)
        # One class index per sample, flattened to a 1-D tensor
        class_idx_batch = batch[4][0].reshape(-1).to(config.device)

        # The last batch of an epoch can be smaller than config.batch_size
        current_batch_size = len(class_name_labels)

        # The input embedding generators are not trained, hence no gradients
        with torch.no_grad():

          image_embeddings_batch = ImgEmbdGen.get_ImgEmbd(torch.squeeze(images_batch, dim=1)).to(config.device)
          text_embeddings_batch  = torch.cat(list(map(TxtEmbdGen.get_TxtEmbd_bert,text_batch)),dim=0).to(config.device)
          final_text_embeddings_batch = torch.cat([text_embeddings_batch,tf_idf_vectors_batch],dim = 1).to(config.device)
        
        # The optimizer owns the parameters of all three trainable modules, so
        # this single call clears every gradient
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):

          universal_embedding_image = ImgTower(image_embeddings_batch)
          universal_embedding_text = TxtTower(final_text_embeddings_batch)
          HUSEClassifier_input = torch.cat([universal_embedding_image,universal_embedding_text],dim=1)
          HUSEClassifier_output = HUSEClassifier(HUSEClassifier_input)
        
        # Labels are duplicated so that they line up with the text embeddings
        # followed by the image embeddings stacked along dim 0 below
        batch_labels_ndarr = np.array(class_name_labels).reshape(-1,1)
        batch_labels_ndarr = np.concatenate((batch_labels_ndarr, batch_labels_ndarr),axis=0)
        all_univeral_embeddings = torch.cat([universal_embedding_text,universal_embedding_image],dim = 0)
        semantic_loss = Loss.SemanticLoss(semantic_graph,classes_combs_map,all_univeral_embeddings,batch_labels_ndarr)
        cross_modal_loss = Loss.CrossModalLoss(universal_embedding_text,universal_embedding_image)
        classification_loss = Loss.ClassificationLoss(HUSEClassifier_output, class_idx_batch)
        total_loss = Loss.TotalLoss(classification_loss, semantic_loss, cross_modal_loss)
        total_loss.backward()
        optimizer.step()

        # Weight by the real number of samples so that the division by the
        # dataset size below yields a per-sample mean
        epoch_loss += total_loss.item() * current_batch_size
        bar.update(batch_num)

    print(f"Training loss : {epoch_loss/len(data_loader.dataset)}")
    print("--" * 5)

  return TxtTower, ImgTower, HUSEClassifier 

In [0]:
# Run training with all default arguments; the returned
# (TxtTower, ImgTower, HUSEClassifier) tuple is not bound to a name here.
train()

# Submitted By : Saahil Ali 
## Email       : programmer290399@gmail.com
## LinkedIn    : https://www.linkedin.com/in/saahil-ali-290399/
## Contact No. : +91-9981789723