Based on the blog/notebook [The Annotated Transformer](http://nlp.seas.harvard.edu/annotated-transformer/#background)

In [None]:
import os
import math
import copy
import time
import warnings
from os.path import exists

import pandas as pd

# import spacy


import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
# import GPUtil
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets


# import altair as alt

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

import platform
print(platform.platform())

print(torch.__version__)

# Use MPS (Metal Performance Shaders, Apple's GPU backend) when PyTorch can
# reach it, otherwise fall back to the CPU.  `getattr` guards torch builds
# that ship without `torch.backends.mps`, where a direct attribute access
# would raise AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)
device = "mps" if _mps_backend is not None and _mps_backend.is_available() else "cpu"
print(f"Using device: {device}")


## Self attention

### Single head attention
Maps a sequence of tokens $(x_1,x_2, \ldots, x_T)$, $x_i \in \mathbb{R}^{d_e}$, to a sequence $(z_1,z_2, \ldots, z_T)$, $z_i \in \mathbb{R}^{d_v}$. The mapping is parametrized by three linear operators (matrices):
- `key`: $W_k \in \mathbb{R}^{d_k \times d_e}$ transforms a token $x_i$ into a **key** view $k_i = W_k x_i \in \mathbb{R}^{d_k}$
- `query`: $W_q \in \mathbb{R}^{d_k \times d_e}$ transforms a token $x_i$ into a **query** view $q_i = W_q x_i \in \mathbb{R}^{d_k}$
- `value`: $W_v \in \mathbb{R}^{d_v \times d_e}$ transforms a token $x_i$ into a **value** view $v_i = W_v x_i \in \mathbb{R}^{d_v}$

Each output token is an attention-weighted average of the values:
$$
 z_i = \sum_{j=1}^{T} p_i(j)\, v_j \;,\quad p_i(j) = \frac{\exp(\langle q_i, k_j \rangle)}{Z_i} \;,\quad Z_i = \sum_{l=1}^{T} \exp(\langle q_i, k_l \rangle)
$$
which can be represented in matrix form (the rows of $Q$, $K$, $V$ are the $q_i$, $k_i$, $v_i$, and the softmax is taken row-wise) as
$$
Z = \mathrm{softmax}(Q K^T)\, V
$$
In practice, we use a `scaled` version (see [the blog](http://nlp.seas.harvard.edu/annotated-transformer/#background) for explanation)
$$
Z = \mathrm{softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V
$$

In [None]:
# Worked example: single-head scaled dot-product attention on random tokens.
# H   = 1    number of heads (assigned for reference only; not used below)
# T   = 3    number of time steps (tokens); used in the printed shape labels
# d_e = 4    dimension of the input embedding vectors
# d_k = 2    dimension of the key and query vectors
# d_v = 5    dimension of the value vectors (set independently of d_e / H here)

H = 1
T = 3
d_e = 4
d_k = 2
d_v = 5 

# Three random tokens of dimension d_e, stacked row-wise into X.
x_1 = torch.randn([d_e])
x_2 = torch.randn([d_e])
x_3 = torch.randn([d_e])
print(x_1.shape, x_2.shape, x_3.shape)
X = torch.stack([x_1,x_2,x_3])
print(f'X shape: {X.shape} [{T} X {d_e}]')

# Key projection: W_k has shape (d_k, d_e), so W_k @ x_i has d_k entries.
W_k = torch.randn((d_k, d_e))

# Keys computed one token at a time ...
k_1 = torch.matmul(W_k, x_1)
k_2 = torch.matmul(W_k, x_2)
k_3 = torch.matmul(W_k, x_3)
K_manual = torch.stack([k_1,k_2,k_3])
# ... and for all tokens at once; the printed error norm compares the two.
K = torch.matmul(X, W_k.T)              # same as K = torch.matmul(W_k, X.T).T

print(f'K_manual shape: {K_manual.shape} [{T} X {d_k}]')
print(f'K shape: {K.shape}   [err={torch.linalg.norm(K-K_manual)}]')

# Query projection, same shape as the key projection.
W_q = torch.randn((d_k, d_e))
Q = torch.matmul(X, W_q.T)            
print(f'Q shape: {Q.shape}  [{T} X {d_k}]')

# Pairwise query/key dot products, scaled by sqrt(d_k); entry (i, j) scores
# query i against key j.
scaled_dot_prod_attention = torch.matmul(Q, K.T) / math.sqrt(d_k)
print(f'dot-product attention shape: {scaled_dot_prod_attention.shape}  [{T} X {T}]')

# Softmax over the last axis turns each row into a distribution (rows sum to 1).
sfmx = scaled_dot_prod_attention.softmax(dim=-1)
print(f'Number of rows: {sfmx.shape[0]}, sums each rows={torch.sum(sfmx, dim=-1)}')

# Value projection: W_v has shape (d_v, d_e).
W_v = torch.randn((d_v, d_e))
V = torch.matmul(X, W_v.T)            
print(f'V shape: {V.shape}  [{T} X {d_v}]')

# Multiply the attention weights with the values to get the output tokens.
Z = sfmx @  V
print(f'Shape of output tokens: {Z.shape} [{T} X {d_v}]')


#  # vectorized representation
# V = X @ W_v
# print(f'key reresentation for x  :\n {K}')       # vectorized representation

# print(f'qry reresentation for x  :\n {Q}')      
# print(f'val reresentation for x  :\n {V}')       # vectorized representation

# # compute (dot-product) attention scores
# dot_prod_attention = (Q @ K.T) / torch.sqrt(d)
# sfmx = dot_prod_attention.softmax(dim=-1)
# print(f'all attention scores:\n  {dot_prod_attention}')
# print(f'Attention scores for x_1: {dot_prod_attention[0,:]}')

# print(f'V = {V}')
# print(f'softmaxing: {sfmx}')

# # munliply scores with values to get output tokens
# Z = sfmx @  V
# print(Z)


In [None]:
# Toy 3 x 4 matrix whose rows are filled with 1, 2 and 3 respectively; it makes
# the effect of the `dim` argument of a sum easy to read off the printed output.
xx_1, xx_2, xx_3 = (torch.full((4,), fill) for fill in (1.0, 2.0, 3.0))
XX = torch.stack((xx_1, xx_2, xx_3))
print(XX.shape)
# dim=-1 collapses the last axis (one total per row) ...
print(f'sums each rows={XX.sum(dim=-1)}')
# ... while dim=0 collapses the first axis (one total per column).
print(f'sums each colimn={XX.sum(dim=0)}')



In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed once and
    registered as a buffer.  Even embedding columns hold
    ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: embedding dimension (even or odd).
        dropout: probability of the dropout applied to the encoded output.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd-indexed column than even-indexed
        # ones, so trim div_term to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # [max_len, d_model] -> [max_len, 1, d_model]: the singleton axis
        # broadcasts over the batch dimension in forward().
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` with positional encodings added, followed by dropout.

        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor with the same shape as ``x``.
        """
        seq_len = x.size(0)
        print(f'Input tensor: sequence length={x.shape[0]}, batch_size={x.shape[1]}, embedding dimension={x.shape[2]}')
        print(f'Shape of pe: {self.pe.shape}')
        x_add = self.pe[:seq_len, :]
        print(f'add to input: {x_add.shape}')
        # Show the encoding of at most the first three positions; zip stops
        # early for shorter sequences instead of indexing out of range.
        for idx, ordinal in zip(range(seq_len), ('first', 'second', 'third')):
            print(f'add to {ordinal} token: {x_add[idx, 0, :]}')
        # Apply the dropout configured in __init__ (identity in eval mode).
        return self.dropout(x + x_add)

# Demo: run the positional encoding on an all-ones input of shape
# [T, 32, d_e] (T and d_e are set in the single-head attention example cell)
# and compare the input and output shapes.
p_enc = PositionalEncoding(d_e)

x_in = torch.ones((T, 32, d_e))
x_w_p_enc = p_enc(x_in)
print(x_in.shape, x_w_p_enc.shape)
# print(x_in)
# print(x_w_p_enc)

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    The scores are ``query @ key^T`` (over the last two axes) divided by
    ``sqrt(d_k)``, where ``d_k`` is the size of the last axis of ``query``.
    Positions where ``mask == 0`` are filled with -1e9 before the softmax, and
    ``dropout`` (a callable), when given, is applied to the attention weights.

    Returns:
        A tuple ``(weights @ value, weights)``.
    """
    d_k = query.size(-1)
    print(query.size(), d_k)

    logits = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # A large negative score becomes (numerically) zero weight after softmax.
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = torch.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    output = weights @ value
    return output, weights

# Sanity check: attention() applied to the Q, K, V of the single-head example
# cell should reproduce that cell's output Z, so the printed norm of the
# difference is expected to be (numerically) zero.
print(Q.shape)
Z1, patt = attention(Q, K, V)
print(torch.norm(Z1-Z))

### Multi head attention

In [None]:
def load_model(model_dir):
  """Load a saved model and return its 'serving_default' signature.

  Args:
    model_dir: location of the saved model; converted with `str()` before it
      is handed to `tf.saved_model.load`.

  Returns:
    The entry stored under 'serving_default' in the loaded object's
    `signatures` mapping.
  """
  loaded = tf.saved_model.load(str(model_dir))
  return loaded.signatures['serving_default']

def run_inference_for_single_image(model, image):
    """Run `model` on one image and return its outputs as numpy arrays.

    Args:
        model: callable that takes a batched input tensor and returns a dict
            with at least 'num_detections' and 'detection_classes' entries.
        image: a single image, converted with `np.asarray`.

    Returns:
        Dict with the batch dimension removed and every array truncated to
        the first `num_detections` entries; 'num_detections' is an int and
        'detection_classes' is cast to int64.
    """
    # The model expects a batch of images: convert to a tensor and prepend a
    # batch axis of size one.
    batched_input = tf.convert_to_tensor(np.asarray(image))[tf.newaxis, ...]

    raw_outputs = model(batched_input)

    # 'num_detections' is removed first so that the slicing below is applied
    # only to the remaining (per-detection) outputs.
    num_detections = int(raw_outputs.pop('num_detections'))

    output_dict = {}
    for name, batched in raw_outputs.items():
        # Index 0 drops the batch dimension; keep only the valid detections.
        output_dict[name] = batched[0, :num_detections].numpy()
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    class_ids = output_dict['detection_classes']
    output_dict['detection_classes'] = class_ids.astype(np.int64)

    return output_dict

In [None]:
saved_model_dir = '/opt/models/nexserve/constructions_v4_rfcn_high_res_b/1'
# load_model performs the same two steps that used to be spelled out here:
# load the saved model, then select its 'serving_default' signature.
saved_model = load_model(saved_model_dir)


In [None]:
# Run the detector on one image file and show the detections.
input_image = '/opt/datasets/cones.jpg'
image_np = np.array(Image.open(input_image))
output_dict = run_inference_for_single_image(saved_model, image_np)

# Boxes are drawn into image_np in place (the return value is not needed);
# only detections scoring above 0.3 are drawn.
visualize_boxes_and_labels_on_image_array(
    image_np,
    output_dict['detection_boxes'],
    output_dict['detection_classes'],
    output_dict['detection_scores'],
    use_normalized_coordinates=True,
    min_score_thresh=0.3,
    line_thickness=8)



# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter display helper -- confirm.
display(Image.fromarray(image_np))

In [None]:
 # Inspect which outputs the detector returned and the detected class ids.
 print(output_dict.keys())
 print(output_dict['detection_classes'])

In [None]:
    # NOTE(review): this indented fragment repeats the first steps of
    # run_inference_for_single_image; `image` and `model` are not assigned at
    # cell level anywhere in this file -- looks like a leftover scratch cell,
    # confirm before running.
    image = np.asarray(image)
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis,...]

    # Run inference
    output_dict = model(input_tensor)

In [None]:
# Show the 'serving_default' signature of the saved model.  The `!` line is
# a notebook shell escape, not Python: it runs the `saved_model_cli` command
# with $saved_model_dir substituted from the Python variable.
print(saved_model_dir)
!saved_model_cli show --dir $saved_model_dir --tag_set serve --signature_def serving_default


In [None]:
import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont

# Palette of color names for box drawing.
# visualize_boxes_and_labels_on_image_array picks the entry at index
# `class_id % len(STANDARD_COLORS)`, so the order of this list determines
# which color each class gets.
STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]

def _text_size(font, text):
  """Return (width, height) of `text` rendered with `font`."""
  if hasattr(font, 'getbbox'):
    # Pillow 10 removed `getsize`.  `getbbox` returns
    # (left, top, right, bottom) measured from the drawing origin; using
    # right/bottom makes the label background also cover the offset above
    # the glyphs.
    _, _, right, lower = font.getbbox(text)
    return right, lower
  # Older Pillow releases whose font objects lack `getbbox`.
  return font.getsize(text)


def draw_bounding_box_on_image(image,
                               ymin,
                               xmin,
                               ymax,
                               xmax,
                               color='red',
                               thickness=4,
                               display_str_list=(),
                               use_normalized_coordinates=True):
  """Draw one bounding box, with optional text labels, onto a PIL image.

  The image is modified in place.

  Args:
    image: a PIL image.
    ymin: top edge of the box.
    xmin: left edge of the box.
    ymax: bottom edge of the box.
    xmax: right edge of the box.
    color: outline color, also used as the label background.
    thickness: outline width.
    display_str_list: strings to show with the box, one label each.
    use_normalized_coordinates: if True the coordinates are fractions of the
      image width/height, otherwise they are used as given.
  """
  draw = ImageDraw.Draw(image)
  im_width, im_height = image.size
  if use_normalized_coordinates:
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)
  else:
    (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
  draw.line([(left, top), (left, bottom), (right, bottom),
             (right, top), (left, top)], width=thickness, fill=color)

  if not display_str_list:
    # Nothing to label: skip font loading altogether.
    return

  try:
    font = ImageFont.truetype('arial.ttf', 24)
  except OSError:
    # The TrueType font is not available; fall back to the built-in font.
    font = ImageFont.load_default()

  # If the total height of the display strings added to the top of the bounding
  # box exceeds the top of the image, stack the strings below the bounding box
  # instead of above.
  text_sizes = [_text_size(font, ds) for ds in display_str_list]
  # Each display_str has a top and bottom margin of 0.05x.
  total_display_str_height = (1 + 2 * 0.05) * sum(
      height for _, height in text_sizes)

  if top > total_display_str_height:
    text_bottom = top
  else:
    text_bottom = bottom + total_display_str_height
  # Reverse list and print from bottom to top.
  for display_str, (text_width, text_height) in zip(display_str_list[::-1],
                                                    text_sizes[::-1]):
    margin = math.ceil(0.05 * text_height)
    draw.rectangle(
        [(left, text_bottom - text_height - 2 * margin), (left + text_width,
                                                          text_bottom)],
        fill=color)
    draw.text(
        (left + margin, text_bottom - text_height - margin),
        display_str,
        fill='black',
        font=font)
    # Each label occupies text_height plus a margin above and below, so step
    # up by that full amount; subtracting the margins made labels overlap.
    text_bottom -= text_height + 2 * margin

def draw_bounding_box_on_image_array(
  image,
  ymin,
  xmin,
  ymax,
  xmax,
  color='red',
  thickness=4,
  display_str_list=(),
  use_normalized_coordinates=True):
  """Draw one bounding box onto an image held in a numpy array.

  The array is converted to an RGB PIL image, the box is drawn with
  `draw_bounding_box_on_image`, and the result is copied back into `image`,
  so the caller's array is modified in place.  The parameters other than
  `image` are forwarded unchanged.
  """
  pil_copy = Image.fromarray(np.uint8(image)).convert('RGB')
  draw_bounding_box_on_image(
      pil_copy,
      ymin,
      xmin,
      ymax,
      xmax,
      color=color,
      thickness=thickness,
      display_str_list=display_str_list,
      use_normalized_coordinates=use_normalized_coordinates)
  # Write the annotated pixels back into the caller's array.
  annotated = np.array(pil_copy)
  np.copyto(image, annotated)

def visualize_boxes_and_labels_on_image_array(
    image,
    boxes,
    classes,
    scores,
    use_normalized_coordinates=False,
    max_boxes_to_draw=20,
    min_score_thresh=.5,
    agnostic_mode=False,
    line_thickness=4,
    groundtruth_box_visualization_color='black',
    skip_scores=False,
    skip_labels=True):
  """Draw detection boxes (with score captions) onto an image array.

  Boxes at the same location are grouped: their captions are collected in one
  list and the color of the last such box is used.

  Args:
    image: image array; boxes are drawn into it in place.
    boxes: array with one row per box, each row being
      (ymin, xmin, ymax, xmax).
    classes: integer class ids, indexable like `boxes`; used to pick a color
      from STANDARD_COLORS.
    scores: per-box scores, or None.  With None every box is drawn in
      `groundtruth_box_visualization_color` and without a caption.
    use_normalized_coordinates: forwarded to the box-drawing helper.
    max_boxes_to_draw: only the first this-many boxes are considered; a falsy
      value (None or 0) means all boxes.
    min_score_thresh: boxes are drawn only if their score is greater than
      this value.
    agnostic_mode: if True every box is drawn in 'DarkOrange' instead of a
      per-class color.
    line_thickness: outline width of the boxes.
    groundtruth_box_visualization_color: color used when `scores` is None.
    skip_scores: if True the caption is left empty instead of showing the
      score as a percentage.
    skip_labels: accepted for interface compatibility; class-name captions
      are not produced by this function, so the value has no effect.

  Returns:
    The same `image` object that was passed in.
  """
  # Plain dicts are enough here (and avoid depending on `collections`, which
  # this file never imports); insertion order is kept for the drawing pass.
  box_to_display_str_map = {}
  box_to_color_map = {}
  if not max_boxes_to_draw:
    max_boxes_to_draw = boxes.shape[0]
  for i in range(min(max_boxes_to_draw, boxes.shape[0])):
    if scores is not None and not scores[i] > min_score_thresh:
      continue
    box = tuple(boxes[i].tolist())
    if scores is None:
      box_to_color_map[box] = groundtruth_box_visualization_color
      continue
    # Class-name labels are not available here, so the caption is either the
    # score as a percentage or empty.
    display_str = '' if skip_scores else '{}%'.format(int(100*scores[i]))
    box_to_display_str_map.setdefault(box, []).append(display_str)
    if agnostic_mode:
      box_to_color_map[box] = 'DarkOrange'
    else:
      box_to_color_map[box] = STANDARD_COLORS[
          classes[i] % len(STANDARD_COLORS)]

  # Draw all boxes onto image.
  for box, color in box_to_color_map.items():
    ymin, xmin, ymax, xmax = box
    draw_bounding_box_on_image_array(
        image,
        ymin,
        xmin,
        ymax,
        xmax,
        color=color,
        thickness=line_thickness,
        display_str_list=box_to_display_str_map.get(box, []),
        use_normalized_coordinates=use_normalized_coordinates)

  return image



