feat: This patch adds support for DTS:X Profile 2 audio in MP4 files. (#1303)

feat: Added audio-specific configuration udts box to AudioSampleEntry for MP4 input/output. DASH tags for DTS audio as specified in ETSI TS 103 491 and ETSI TS 102 114. Closes #1301 --------- Co-authored-by: Cosmin Stejerean <cstejerean@meta.com>
shaka-project · Feb 15, 2024 · 07f780d · 07f780d
1 parent f7b3986
commit 07f780d
Show file tree

Hide file tree

Showing 23 changed files with 373 additions and 3 deletions.
diff --git a/packager/app/test/packager_test.py b/packager/app/test/packager_test.py
@@ -927,6 +927,13 @@ def testAacHe(self):
         self._GetFlags(output_dash=True))
     self._CheckTestResults('acc-he')
 
+  def testDtsx(self):
+    # Package only the audio stream of bear-dtsx.mp4 with DASH output, then
+    # check the results for the 'dtsx-dash' test case.
+    streams = self._GetStreams(['audio'], test_files=['bear-dtsx.mp4'])
+    flags = self._GetFlags(output_dash=True)
+    self.assertPackageSuccess(streams, flags)
+    self._CheckTestResults('dtsx-dash')
+
   def testVideoAudioWebVTT(self):
     audio_video_streams = self._GetStreams(['audio', 'video'])
     text_stream = self._GetStreams(['text'], test_files=['bear-english.vtt'])

diff --git a/packager/app/test/testdata/dtsx-dash/bear-dtsx-audio.mp4 b/packager/app/test/testdata/dtsx-dash/bear-dtsx-audio.mp4
diff --git a/packager/app/test/testdata/dtsx-dash/output.mpd b/packager/app/test/testdata/dtsx-dash/output.mpd
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--Generated with https://github.com/shaka-project/shaka-packager version <tag>-<hash>-<test>-->
+<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 DASH-MPD.xsd" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" minBufferTime="PT2S" type="static" mediaPresentationDuration="PT3.114667S">
+  <Period id="0">
+    <AdaptationSet id="0" contentType="audio" subsegmentAlignment="true">
+      <Representation id="0" bandwidth="227665" codecs="dtsx" mimeType="audio/mp4" audioSamplingRate="48000">
+        <AudioChannelConfiguration schemeIdUri="tag:dts.com,2018:uhd:audio_channel_configuration" value="0000003F"/>
+        <BaseURL>bear-dtsx-audio.mp4</BaseURL>
+        <SegmentBase indexRange="742-821" timescale="48000">
+          <Initialization range="0-741"/>
+        </SegmentBase>
+      </Representation>
+    </AdaptationSet>
+  </Period>
+</MPD>
diff --git a/packager/media/base/audio_stream_info.cc b/packager/media/base/audio_stream_info.cc
@@ -150,6 +150,8 @@ std::string AudioStreamInfo::GetCodecString(Codec codec,
       return "dts-";
     case kCodecDTSP:
       return "dts+";
+    case kCodecDTSX:
+      return "dtsx";
     case kCodecEAC3:
       return "ec-3";
     case kCodecAC4:

diff --git a/packager/media/base/fourccs.h b/packager/media/base/fourccs.h
@@ -58,6 +58,7 @@ enum FourCC : uint32_t {
   FOURCC_dtsl = 0x6474736c,
   FOURCC_dtsm = 0x6474732d,  // "dts-"
   FOURCC_dtsp = 0x6474732b,  // "dts+"
+  FOURCC_dtsx = 0x64747378,  // "dtsx"
   FOURCC_dvcC = 0x64766343,
   FOURCC_dvh1 = 0x64766831,
   FOURCC_dvhe = 0x64766865,
@@ -151,8 +152,9 @@ enum FourCC : uint32_t {
   FOURCC_trex = 0x74726578,
   FOURCC_trun = 0x7472756e,
   FOURCC_udta = 0x75647461,
-  FOURCC_url = 0x75726c20,  // "url "
-  FOURCC_urn = 0x75726e20,  // "urn "
+  FOURCC_udts = 0x75647473,  // "udts"
+  FOURCC_url = 0x75726c20,   // "url "
+  FOURCC_urn = 0x75726e20,   // "urn "
   FOURCC_uuid = 0x75756964,
   FOURCC_vide = 0x76696465,
   FOURCC_vlab = 0x766c6162,

diff --git a/packager/media/base/stream_info.h b/packager/media/base/stream_info.h
@@ -49,6 +49,7 @@ enum Codec {
   kCodecDTSL,
   kCodecDTSM,
   kCodecDTSP,
+  kCodecDTSX,
   kCodecEAC3,
   kCodecFlac,
   kCodecOpus,

diff --git a/packager/media/codecs/CMakeLists.txt b/packager/media/codecs/CMakeLists.txt
@@ -12,6 +12,7 @@ add_library(media_codecs STATIC
     avc_decoder_configuration_record.cc
     decoder_configuration_record.cc
     dovi_decoder_configuration_record.cc
+    dts_audio_specific_config.cc
     ec3_audio_util.cc
     ac4_audio_util.cc
     es_descriptor.cc

diff --git a/packager/media/codecs/dts_audio_specific_config.cc b/packager/media/codecs/dts_audio_specific_config.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2023 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <packager/media/codecs/dts_audio_specific_config.h>
+
+#include <packager/media/base/bit_reader.h>
+#include <packager/media/base/rcheck.h>
+
+namespace shaka {
+namespace media {
+
+// Extracts the 32-bit ChannelMask field from the contents of a DTS-UHD
+// Specific Box ("udts").
+//
+// |udts| holds the box contents starting at DecoderProfileCode; |mask|
+// receives the channel mask. Returns true on success and false when either
+// bit-reader operation fails, e.g. when |udts| is too short to hold the 16
+// leading bits plus the 32-bit mask.
+bool GetDTSXChannelMask(const std::vector<uint8_t>& udts, uint32_t& mask) {
+  // udts is the DTS-UHD Specific Box: ETSI TS 103 491 V1.2.1 Table B-2
+  // DecoderProfileCode(6 bits)
+  // FrameDurationCode(2 bits)
+  // MaxPayloadCode(3 bits)
+  // NumPresentationsCode(5 bits)
+  // ChannelMask (32 bits)
+  BitReader bit_reader(udts.data(), udts.size());
+  // Skip the four fields that precede ChannelMask (6 + 2 + 3 + 5 = 16 bits).
+  RCHECK(bit_reader.SkipBits(16));
+  RCHECK(bit_reader.ReadBits(32, &mask));
+  return true;
+}
+
+}  // namespace media
+}  // namespace shaka
diff --git a/packager/media/codecs/dts_audio_specific_config.h b/packager/media/codecs/dts_audio_specific_config.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2023 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_
+#define PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+namespace shaka {
+namespace media {
+
+class BitReader;
+
+// Parses the ChannelMask field out of |udts|, the contents of a DTS-UHD
+// Specific Box ("udts"), and stores it in |mask|. Returns true on success and
+// false if the field could not be read.
+bool GetDTSXChannelMask(const std::vector<uint8_t>& udts, uint32_t& mask);
+
+}  // namespace media
+}  // namespace shaka
+
+#endif  // PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_
diff --git a/packager/media/codecs/dts_audio_specific_config_unittest.cc b/packager/media/codecs/dts_audio_specific_config_unittest.cc
@@ -0,0 +1,37 @@
+// Copyright 2023 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <gtest/gtest.h>
+
+#include "packager/media/codecs/dts_audio_specific_config.h"
+
+namespace shaka {
+namespace media {
+
+// The 16 leading bits are followed by a ChannelMask of 0x0000003F.
+TEST(DTSAudioSpecificConfigTest, BasicProfileTest) {
+  const std::vector<uint8_t> data = {0x01, 0x20, 0x00, 0x00,
+                                     0x00, 0x3F, 0x80, 0x00};
+  uint32_t mask = 0;
+  // Abort on a failed parse so |mask| is only compared after it was written.
+  ASSERT_TRUE(GetDTSXChannelMask(data, mask));
+  // Unsigned literal keeps the comparison with uint32_t sign-consistent.
+  EXPECT_EQ(0x3Fu, mask);
+}
+
+// Four distinct mask bytes verify that ChannelMask is assembled from bytes
+// 2..5 in big-endian order.
+TEST(DTSAudioSpecificConfigTest, ChannelMaskBytes) {
+  const std::vector<uint8_t> data = {0x01, 0x20, 0x12, 0x34,
+                                     0x56, 0x78, 0x80, 0x00};
+  uint32_t mask = 0;
+  // Abort on a failed parse so |mask| is only compared after it was written.
+  ASSERT_TRUE(GetDTSXChannelMask(data, mask));
+  // Unsigned literal keeps the comparison with uint32_t sign-consistent.
+  EXPECT_EQ(0x12345678u, mask);
+}
+
+// Five bytes end before the 32-bit ChannelMask is complete, so parsing must
+// report failure.
+TEST(DTSAudioSpecificConfigTest, Truncated) {
+  const std::vector<uint8_t> data = {0x01, 0x20, 0x00, 0x00, 0x00};
+  uint32_t mask;
+  EXPECT_FALSE(GetDTSXChannelMask(data, mask));
+}
+
+}  // namespace media
+}  // namespace shaka
diff --git a/packager/media/event/muxer_listener_internal.cc b/packager/media/event/muxer_listener_internal.cc
@@ -20,6 +20,7 @@
 #include <packager/media/base/text_stream_info.h>
 #include <packager/media/base/video_stream_info.h>
 #include <packager/media/codecs/ac4_audio_util.h>
+#include <packager/media/codecs/dts_audio_specific_config.h>
 #include <packager/media/codecs/ec3_audio_util.h>
 #include <packager/mpd/base/media_info.pb.h>
 #include <packager/utils/bytes_to_string_view.h>
@@ -165,6 +166,16 @@ void AddAudioInfo(const AudioStreamInfo* audio_stream_info,
     codec_data->set_ac4_ims_flag(ac4_ims_flag);
     codec_data->set_ac4_cbi_flag(ac4_cbi_flag);
   }
+
+  if (audio_stream_info->codec() == kCodecDTSX) {
+    auto* codec_data = audio_info->mutable_codec_specific_data();
+    uint32_t channel_mask;
+    if (!GetDTSXChannelMask(codec_config, channel_mask)) {
+      LOG(ERROR) << "Failed to parse DTSX channel mask.";
+      return;
+    }
+    codec_data->set_channel_mask(channel_mask);
+  }
 }
 
 void AddTextInfo(const TextStreamInfo& text_stream_info,

diff --git a/packager/media/event/muxer_listener_internal_unittest.cc b/packager/media/event/muxer_listener_internal_unittest.cc
@@ -72,6 +72,24 @@ TEST_F(MuxerListenerInternalVideoStreamTest, TransferCharacteristics) {
   EXPECT_EQ(18u, media_info.video_info().transfer_characteristics());
 }
 
+// Fixture for the audio-stream tests; adds nothing to
+// MuxerListenerInternalTest and exists to group them under their own name.
+class MuxerListenerInternalAudioStreamTest : public MuxerListenerInternalTest {
+};
+
+// AddAudioInfo function should parse the channel mask out of the DTS:X codec
+// config and store it in the audio codec-specific data of the MediaInfo.
+TEST_F(MuxerListenerInternalAudioStreamTest, DTSX) {
+  MediaInfo media_info;
+  std::shared_ptr<AudioStreamInfo> audio_info = CreateAudioStreamInfo(
+      GetAudioStreamInfoParams(kCodecDTSX, "dtsx",
+                               {0x01, 0x20, 0x00, 0x00, 0x00, 0x3F, 0x80,
+                                0x00}));  // Channel mask = 3F
+  ASSERT_TRUE(GenerateMediaInfo(MuxerOptions(), *audio_info,
+                                kReferenceTimeScale,
+                                MuxerListener::kContainerMp4, &media_info));
+  // Read through the const accessors: the mutable_*() variants would create
+  // the very sub-messages this assertion is meant to check.
+  const auto& codec_data = media_info.audio_info().codec_specific_data();
+  // Unsigned literal keeps the comparison sign-consistent, as in the video
+  // tests in this file.
+  EXPECT_EQ(0x3Fu, codec_data.channel_mask());
+}
+
 }  // namespace internal
 }  // namespace media
 }  // namespace shaka
diff --git a/packager/media/event/muxer_listener_test_helper.cc b/packager/media/event/muxer_listener_test_helper.cc
@@ -104,5 +104,53 @@ std::vector<ProtectionSystemSpecificInfo> GetDefaultKeySystemInfo() {
             std::end(kExpectedDefaultPsshBox) - 1}}};
 }
 
+// Defined out of line; neither the constructor nor the destructor does any
+// work beyond what the compiler generates for the members.
+AudioStreamInfoParameters::AudioStreamInfoParameters() = default;
+AudioStreamInfoParameters::~AudioStreamInfoParameters() = default;
+
+// Builds an AudioStreamInfo by forwarding every field of |param| to the
+// AudioStreamInfo constructor; the codec config is passed as pointer + size.
+std::shared_ptr<AudioStreamInfo> CreateAudioStreamInfo(
+    const AudioStreamInfoParameters& param) {
+  const std::vector<uint8_t>& config = param.codec_config;
+  auto stream_info = std::make_shared<AudioStreamInfo>(
+      param.track_id, param.time_scale, param.duration, param.codec,
+      param.codec_string, config.data(), config.size(), param.sample_bits,
+      param.num_channels, param.sampling_frequency, param.seek_preroll_ns,
+      param.codec_delay_ns, param.max_bitrate, param.avg_bitrate,
+      param.language, param.is_encrypted);
+  return stream_info;
+}
+
+// Returns test parameters for an audio stream: the codec, codec string and
+// codec config come from the caller, every other field is a fixed value.
+AudioStreamInfoParameters GetAudioStreamInfoParams(
+    Codec codec,
+    const char* codec_string,
+    const std::vector<uint8_t>& codec_config) {
+  AudioStreamInfoParameters params;
+
+  // Caller-selected codec description.
+  params.codec = codec;
+  params.codec_string = codec_string;
+  params.codec_config = codec_config;
+
+  // Fixed track identity and timing.
+  params.track_id = 0;
+  params.time_scale = 10;
+  params.duration = 200;
+  params.language = "und";
+  params.is_encrypted = false;
+
+  // Fixed audio format: 16-bit samples, 6 channels, 48000 Hz.
+  params.sample_bits = 16;
+  params.num_channels = 6;
+  params.sampling_frequency = 48000;
+
+  // No preroll, codec delay or bitrate information.
+  params.seek_preroll_ns = 0;
+  params.codec_delay_ns = 0;
+  params.max_bitrate = 0;
+  params.avg_bitrate = 0;
+
+  return params;
+}
+
 }  // namespace media
 }  // namespace shaka
diff --git a/packager/media/event/muxer_listener_test_helper.h b/packager/media/event/muxer_listener_test_helper.h
@@ -10,6 +10,7 @@
 #include <cstdint>
 #include <vector>
 
+#include <packager/media/base/audio_stream_info.h>
 #include <packager/media/base/key_source.h>
 #include <packager/media/base/muxer_options.h>
 #include <packager/media/base/stream_info.h>
@@ -95,6 +96,29 @@ struct VideoStreamInfoParameters {
   bool is_encrypted;
 };
 
+// Struct that gets passed to CreateAudioStreamInfo() to create an
+// AudioStreamInfo instance. Useful for generating multiple AudioStreamInfo
+// with slightly different parameters.
+//
+// Scalar members carry default initializers so that a default-constructed
+// instance never holds indeterminate values.
+struct AudioStreamInfoParameters {
+  AudioStreamInfoParameters();
+  ~AudioStreamInfoParameters();
+  int track_id = 0;
+  int32_t time_scale = 0;
+  int64_t duration = 0;
+  Codec codec{};  // Value-initialized, i.e. the zero-valued enumerator.
+  std::string codec_string;
+  std::vector<uint8_t> codec_config;
+  uint8_t sample_bits = 0;
+  uint8_t num_channels = 0;
+  uint32_t sampling_frequency = 0;
+  uint64_t seek_preroll_ns = 0;
+  uint64_t codec_delay_ns = 0;
+  uint32_t max_bitrate = 0;
+  uint32_t avg_bitrate = 0;
+  std::string language;
+  bool is_encrypted = false;
+};
+
 struct OnNewSegmentParameters {
   std::string file_name;
   int64_t start_time;
@@ -115,6 +139,16 @@ std::shared_ptr<VideoStreamInfo> CreateVideoStreamInfo(
 // Returns the "default" VideoStreamInfoParameters for testing.
 VideoStreamInfoParameters GetDefaultVideoStreamInfoParams();
 
+// Creates an AudioStreamInfo instance from AudioStreamInfoParameters.
+std::shared_ptr<AudioStreamInfo> CreateAudioStreamInfo(
+    const AudioStreamInfoParameters& param);
+
+// Returns the "default" AudioStreamInfoParameters for testing, with the codec,
+// codec string and codec config taken from the arguments.
+AudioStreamInfoParameters GetAudioStreamInfoParams(
+    Codec codec,
+    const char* codec_string,
+    const std::vector<uint8_t>& codec_config);
+
 // Returns the "default" values for OnMediaEnd().
 OnMediaEndParameters GetDefaultOnMediaEndParams();
 

diff --git a/packager/media/formats/mp4/box_definitions.cc b/packager/media/formats/mp4/box_definitions.cc
@@ -1811,6 +1811,27 @@ size_t DTSSpecific::ComputeSizeInternal() {
          sizeof(kDdtsExtraData);
 }
 
+// UDTSSpecific holds the contents of the "udts" box as raw bytes.
+UDTSSpecific::UDTSSpecific() = default;
+UDTSSpecific::~UDTSSpecific() = default;
+
+// Identifies this box by its four-character code, "udts".
+FourCC UDTSSpecific::BoxType() const {
+  return FOURCC_udts;
+}
+
+// Reads or writes the box header followed by the contents kept in |data|.
+// When reading, the number of bytes taken is buffer->BytesLeft(); when
+// writing, it is the current size of |data|. Returns false if either step
+// fails.
+bool UDTSSpecific::ReadWriteInternal(BoxBuffer* buffer) {
+  RCHECK(ReadWriteHeaderInternal(buffer) &&
+         buffer->ReadWriteVector(
+             &data, buffer->Reading() ? buffer->BytesLeft() : data.size()));
+  return true;
+}
+
+// Size of the box: header plus contents, or 0 when there are no contents.
+size_t UDTSSpecific::ComputeSizeInternal() {
+  // This box is optional. Skip it if not initialized.
+  return data.empty() ? 0 : HeaderSize() + data.size();
+}
+
 AC3Specific::AC3Specific() = default;
 AC3Specific::~AC3Specific() = default;
 
@@ -1983,6 +2004,7 @@ bool AudioSampleEntry::ReadWriteInternal(BoxBuffer* buffer) {
 
   RCHECK(buffer->TryReadWriteChild(&esds));
   RCHECK(buffer->TryReadWriteChild(&ddts));
+  RCHECK(buffer->TryReadWriteChild(&udts));
   RCHECK(buffer->TryReadWriteChild(&dac3));
   RCHECK(buffer->TryReadWriteChild(&dec3));
   RCHECK(buffer->TryReadWriteChild(&dac4));
@@ -2014,7 +2036,7 @@ size_t AudioSampleEntry::ComputeSizeInternal() {
          sizeof(samplesize) + sizeof(samplerate) + sinf.ComputeSize() +
          esds.ComputeSize() + ddts.ComputeSize() + dac3.ComputeSize() +
          dec3.ComputeSize() + dops.ComputeSize() + dfla.ComputeSize() +
-         dac4.ComputeSize() + mhac.ComputeSize() +
+         dac4.ComputeSize() + mhac.ComputeSize() + udts.ComputeSize() +
          // Reserved and predefined bytes.
          6 + 8 +  // 6 + 8 bytes reserved.
          4;       // 4 bytes predefined.

diff --git a/packager/media/formats/mp4/box_definitions.h b/packager/media/formats/mp4/box_definitions.h
@@ -334,6 +334,12 @@ struct DTSSpecific : Box {
   std::vector<uint8_t> extra_data;
 };
 
+// DTS-UHD Specific Box ("udts"). The contents are kept as opaque bytes.
+struct UDTSSpecific : Box {
+  DECLARE_BOX_METHODS(UDTSSpecific);
+
+  // Raw box contents following the box header; empty means the box is absent.
+  std::vector<uint8_t> data;
+};
+
 struct AC3Specific : Box {
   DECLARE_BOX_METHODS(AC3Specific);
 
@@ -396,6 +402,7 @@ struct AudioSampleEntry : Box {
 
   ElementaryStreamDescriptor esds;
   DTSSpecific ddts;
+  UDTSSpecific udts;
   AC3Specific dac3;
   EC3Specific dec3;
   AC4Specific dac4;