import torch
import torchvision
import torch_tensorrt
import shutil

torch.set_printoptions(linewidth=shutil.get_terminal_size().columns - 1, edgeitems=10)  # Set nicer formatting.
torch_tensorrt.logging.set_reportable_log_level(torch_tensorrt.logging.Level.Graph)
torch.manual_seed(0)
DEVICE = torch.device("cuda:0")


class Deformable_Convolution(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1, offset_groups=1):
        super().__init__()
        # Deformable convolution
        offset_channels = 2 * kernel_size * kernel_size
        self.conv2d_offset = torch.nn.Conv2d(
            in_channels,
            offset_channels * offset_groups,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation,
            dilation=dilation,
        )
        self.conv2d = torchvision.ops.DeformConv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            groups=groups,
            bias=False,
        )

    def forward(self, x):
        offset = self.conv2d_offset(x)
        x = self.conv2d(x, offset)
        return x


class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.input_size = [3, 4, 4]
        self.rpn_num_features = 5
        self.backbone = torch.nn.Sequential(
            Deformable_Convolution(3, 3),  # Keep a deformable convolution from my original repro script to show that it isn't an issue now.
        )
        self.make_scores = torch.nn.Conv2d(in_channels=3, out_channels=2 * self.rpn_num_features, kernel_size=1, stride=1, padding=0)
        self.make_boxes = torch.nn.Conv2d(in_channels=3, out_channels=4 * self.rpn_num_features, kernel_size=1, stride=1, padding=0)

        # The following forces an RPN anchor to always be true to avoid a convolution on an empty tensor at a later stage in the original model.
        self.register_buffer('false_anchor_score', torch.tensor((-10, 10), dtype=torch.float).unsqueeze(dim=0))
        self.anchor_always_index = torch.rand(self.rpn_num_features).max(dim=0).indices
        # self.anchor_always_index = self.anchor_always_index.item()
        # Using this instead avoids the original error
        #   RuntimeError: [Error thrown at core/partitioning/shape_analysis.cpp:68] Expected ivalues_maps.count(input) to be true but got false
        #   Could not find torch::jit::Value* 61 produced from %x.1 : Tensor, %61 : Long(requires_grad=0, device=cpu) = prim::Param() in lowering graph for mini graph input.
        # but gives
        #   RuntimeError: [Error thrown at ./core/conversion/var/Var_inl.h:37] Expected isIValue() to be true but got false
        #   Requested unwrapping of arg assuming it was an IValue, however arg type is nvinfer1::ITensor

    def fwd_rpn(self, x):
        scores = self.make_scores(x).reshape(x.shape[0], 2, -1).transpose(-2, -1)
        scores[:, self.anchor_always_index, :] = self.false_anchor_score
        # scores[:, self.anchor_always_index, :] = torch.tensor((-10, 10), dtype=torch.float, device=scores.device).unsqueeze(dim=0)  # This avoids the "arg type is nvinfer1::ITensor" error.
        boxes = self.make_boxes(x).reshape(x.shape[0], 4, -1).transpose(-2, -1)
        return scores, boxes

    def forward(self, x):
        x = self.backbone(x)
        scores, boxes = self.fwd_rpn(x)
        return x, scores, boxes


if __name__ == "__main__":
    model = MyModel().eval().to(DEVICE)
    SHAPE2 = (1, *model.input_size)
    tensor = torch.randn(SHAPE2, dtype=torch.float32, device=DEVICE)

    # Reference run with the eager PyTorch model.
    with torch.inference_mode():
        out, scores, boxes = model(tensor)

    model_trt = torch_tensorrt.compile(
        model,
        inputs=[torch_tensorrt.Input(shape=SHAPE2)],
        enabled_precisions={torch.float},
        truncate_long_and_double=True,
    )

    # Run the TensorRT-compiled module and compare against the eager output.
    with torch.inference_mode():
        out_trt, scores_trt, boxes_trt = model_trt(tensor)

    assert torch.max(torch.abs(out - out_trt)) < 1e-6
    print("success")