MUI Decoder: Support uniform sampling in decoder itself (so decoding only what is needed)

Summary: TSIA

Differential Revision: D56020516

fbshipit-source-id: 6d5537886d648dfbd94724c86b1fb3235b370cbd
pytorch · Apr 11, 2024 · ec4e27f · ec4e27f
1 parent 4eb67d1
commit ec4e27f
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 0 deletions.
diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
@@ -548,6 +548,7 @@ int Decoder::getFrame(size_t workingTimeInMs) {
       continue;
     }
 
+
     size_t numConsecutiveNoBytes = 0;
     // it can be only partial decoding of the package bytes
     do {
@@ -590,6 +591,17 @@ int Decoder::getFrame(size_t workingTimeInMs) {
     result = 0;
 
     av_packet_unref(avPacket);
+
+    if (++kFramesDecoded_ == params_.uniformSampling) {
+      result = ENODATA;
+      flushStreams();
+      break;
+    }
+
+    int64_t stepTs = static_cast<int64_t>((params_.expectedDuration * AV_TIME_BASE) / (params_.uniformSampling - 1));
+    while (kFramesDecoded_ < params_.uniformSampling && avformat_seek_file(inputCtx_, -1, stepTs * (kFramesDecoded_ - 1) + 1, stepTs * kFramesDecoded_, stepTs * kFramesDecoded_, 0) < 0) {
+      ++kFramesDecoded_;
+    }
   }
 
   av_packet_free(&avPacket);

diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
@@ -89,5 +89,6 @@ class Decoder : public MediaDecoder {
   AVIOContext* avioCtx_{nullptr};
   std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
   std::bitset<64> inRange_;
+  int kFramesDecoded_{0};
 };
 } // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
@@ -219,6 +219,12 @@ struct DecoderParameters {
   // it is dispersed into the stream, but will increase latency. Must be an
   // integer not lesser than 32. It is 5000000 by default.
   int64_t probeSize{5000000};
+
+  // Expected duration of the video to be decoded, in seconds (it is scaled by
+  // AV_TIME_BASE to derive the seek step). Mainly used with uniform sampling.
+  float expectedDuration{0.0f};
+
+  // Sample N key-frames from the video, spaced roughly uniformly across the
+  // timeline. The default of 0 leaves uniform sampling disabled.
+  int uniformSampling{0};
 };
 
 struct DecoderHeader {