src/h264.rs

// This file is part of Moonfire NVR, a security camera network video recorder.
// Copyright (C) 2016-2020 The Moonfire NVR Authors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// In addition, as a special exception, the copyright holders give
// permission to link the code of portions of this program with the
// OpenSSL library under certain conditions as described in each
// individual source file, and distribute linked combinations including
// the two.
//
// You must obey the GNU General Public License in all respects for all
// of the code used other than OpenSSL. If you modify file(s) with this
// exception, you may extend this exception to your version of the
// file(s), but you are not obligated to do so. If you do not wish to do
// so, delete this exception statement from your version. If you delete
// this exception statement from all source files in the program, then
// also delete it here.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! H.264 decoding
//!
//! For the most part, Moonfire NVR does not try to understand the video codec. However, H.264 has
//! two byte stream encodings: ISO/IEC 14496-10 Annex B, and ISO/IEC 14496-15 AVC access units.
//! When streaming from RTSP, ffmpeg supplies the former. We need the latter to stick into `.mp4`
//! files. This file manages the conversion, both for the ffmpeg "extra data" (which should become
//! the ISO/IEC 14496-15 section 5.2.4.1 `AVCDecoderConfigurationRecord`) and the actual samples.
//!
//! ffmpeg of course has logic to do the same thing, but unfortunately it is not exposed except
//! through ffmpeg's own generated `.mp4` file. Extracting just this part of their `.mp4` files
//! would be more trouble than it's worth.

use byteorder::{BigEndian, ByteOrder, WriteBytesExt};
use failure::{Error, bail, format_err};
use std::convert::TryFrom;

// See ISO/IEC 14496-10 table 7-1 - NAL unit type codes, syntax element categories, and NAL unit
// type classes.
const NAL_UNIT_SEQ_PARAMETER_SET: u8 = 7;
const NAL_UNIT_PIC_PARAMETER_SET: u8 = 8;

const NAL_UNIT_TYPE_MASK: u8 = 0x1F;  // bottom 5 bits of first byte of unit.

// For certain common sub stream anamorphic resolutions, add a pixel aspect ratio box.
const PIXEL_ASPECT_RATIOS: [((u16, u16), (u16, u16)); 4] = [
    ((320, 240), ( 4,  3)),
    ((352, 240), (40, 33)),
    ((640, 480), ( 4,  3)),
    ((704, 480), (40, 33)),
];

/// Get the pixel aspect ratio to use if none is specified.
///
/// The Dahua IPC-HDW5231R-Z sets the aspect ratio in the H.264 SPS (correctly) for both square and
/// non-square pixels. The Hikvision DS-2CD2032-I doesn't set it, even though the sub stream's
/// pixels aren't square. So define a default based on the pixel dimensions to use if the camera
/// doesn't tell us what to do.
///
/// Note that at least in the case of .mp4 muxing, we don't need to fix up the underlying SPS.
/// SPS; PixelAspectRatioBox's definition says that it overrides the H.264-level declaration.
fn default_pixel_aspect_ratio(width: u16, height: u16) -> (u16, u16) {
    let dims = (width, height);
    for r in &PIXEL_ASPECT_RATIOS {
        if r.0 == dims {
            return r.1;
        }
    }
    (1, 1)
}

/// Decodes a H.264 Annex B byte stream into NAL units. Calls `f` for each NAL unit in the byte
/// stream. Aborts if `f` returns error.
///
/// Note `f` is called with the encoded NAL form, not the RBSP. The NAL header byte and any
/// emulation prevention bytes will be present.
///
/// See ISO/IEC 14496-10 section B.2: Byte stream NAL unit decoding process.
/// This is a relatively simple, unoptimized implementation.
///
/// TODO: detect invalid byte streams. For example, several 0x00s not followed by a 0x01, a stream
/// stream not starting with 0x00 0x00 0x00 0x01, or an empty NAL unit.
fn decode_h264_annex_b<'a, F>(mut data: &'a [u8], mut f: F) -> Result<(), Error>
where F: FnMut(&'a [u8]) -> Result<(), Error> {
    let start_code = &b"\x00\x00\x01"[..];
    use nom::FindSubstring;
    'outer: while let Some(pos) = data.find_substring(start_code) {
        let mut unit = &data[0..pos];
        data = &data[pos + start_code.len() ..];
        // Have zero or more bytes that end in a start code. Strip out any trailing 0x00s and
        // process the unit if there's anything left.
        loop {
            match unit.last() {
                None => continue 'outer,
                Some(b) if *b == 0 => { unit = &unit[..unit.len()-1]; },
                Some(_) => break,
            }
        }
        f(unit)?;
    }

    // No remaining start codes; likely a unit left.
    if !data.is_empty() {
        f(data)?;
    }
    Ok(())
}

/// Parses Annex B extra data, returning a tuple holding the `sps` and `pps` substrings.
fn parse_annex_b_extra_data(data: &[u8]) -> Result<(&[u8], &[u8]), Error> {
    let mut sps = None;
    let mut pps = None;
    decode_h264_annex_b(data, |unit| {
        let nal_type = (unit[0] as u8) & NAL_UNIT_TYPE_MASK;
        match nal_type {
            NAL_UNIT_SEQ_PARAMETER_SET => sps = Some(unit),
            NAL_UNIT_PIC_PARAMETER_SET => pps = Some(unit),
            _ => bail!("Expected SPS and PPS; got type {}", nal_type),
        };
        Ok(())
    })?;
    match (sps, pps) {
        (Some(s), Some(p)) => Ok((s, p)),
        _ => bail!("SPS and PPS must be specified"),
    }
}

/// Decodes a NAL unit (minus header byte) into its RBSP.
/// Stolen from h264-reader's src/avcc.rs. This shouldn't last long, see:
/// <https://github.com/dholroyd/h264-reader/issues/4>.
fn decode(encoded: &[u8]) -> Vec<u8> {
    struct NalRead(Vec<u8>);
    use h264_reader::Context;
    use h264_reader::nal::NalHandler;
    impl NalHandler for NalRead {
        type Ctx = ();
        fn start(&mut self, _ctx: &mut Context<Self::Ctx>, _header: h264_reader::nal::NalHeader) {}

        fn push(&mut self, _ctx: &mut Context<Self::Ctx>, buf: &[u8]) {
            self.0.extend_from_slice(buf)
        }

        fn end(&mut self, _ctx: &mut Context<Self::Ctx>) {}
    }
    let mut decode = h264_reader::rbsp::RbspDecoder::new(NalRead(vec![]));
    let mut ctx = Context::new(());
    decode.push(&mut ctx, encoded);
    let read = decode.into_handler();
    read.0
}

/// Parsed representation of ffmpeg's "extradata".
#[derive(Debug, PartialEq, Eq)]
pub struct ExtraData {
    pub entry: db::VideoSampleEntryToInsert,

    /// True iff sample data should be transformed from Annex B format to AVC format via a call to
    /// `transform_sample_data`. (The assumption is that if the extra data was in Annex B format,
    /// the sample data is also.)
    pub need_transform: bool,
}

impl ExtraData {
    /// Parses "extradata" from ffmpeg. This data may be in either Annex B format or AVC format.
    pub fn parse(extradata: &[u8], width: u16, height: u16) -> Result<ExtraData, Error> {
        let raw_sps_and_pps;
        let need_transform;
        let ctx;
        let sps_owner;
        let sps; // reference to either within ctx or to sps_owner.
        if extradata.starts_with(b"\x00\x00\x00\x01") ||
                          extradata.starts_with(b"\x00\x00\x01") {
            // ffmpeg supplied "extradata" in Annex B format.
            let (s, p) = parse_annex_b_extra_data(extradata)?;
            let rbsp = decode(&s[1..]);
            sps_owner = h264_reader::nal::sps::SeqParameterSet::from_bytes(&rbsp)
                .map_err(|e| format_err!("Bad SPS: {:?}", e))?;
            sps = &sps_owner;
            raw_sps_and_pps = Some((s, p));
            need_transform = true;
        } else {
            // Assume "extradata" holds an AVCDecoderConfiguration.
            need_transform = false;
            raw_sps_and_pps = None;
            let avcc = h264_reader::avcc::AvcDecoderConfigurationRecord::try_from(extradata)
                .map_err(|e| format_err!("Bad AvcDecoderConfigurationRecord: {:?}", e))?;
            if avcc.num_of_sequence_parameter_sets() != 1 {
                bail!("Multiple SPSs!");
            }
            ctx = avcc.create_context(())
                .map_err(|e| format_err!("Can't load SPS+PPS: {:?}", e))?;
            sps = ctx.sps_by_id(h264_reader::nal::pps::ParamSetId::from_u32(0).unwrap())
                .ok_or_else(|| format_err!("No SPS 0"))?;
        };

        let mut sample_entry = Vec::with_capacity(256);

        // This is a concatenation of the following boxes/classes.

        // SampleEntry, ISO/IEC 14496-12 section 8.5.2.
        let avc1_len_pos = sample_entry.len();
        // length placeholder + type + reserved + data_reference_index = 1
        sample_entry.extend_from_slice(b"\x00\x00\x00\x00avc1\x00\x00\x00\x00\x00\x00\x00\x01");

        // VisualSampleEntry, ISO/IEC 14496-12 section 12.1.3.
        sample_entry.extend_from_slice(&[0; 16]);  // pre-defined + reserved
        sample_entry.write_u16::<BigEndian>(width)?;
        sample_entry.write_u16::<BigEndian>(height)?;
        sample_entry.extend_from_slice(&[
                0x00, 0x48, 0x00, 0x00,  // horizresolution
                0x00, 0x48, 0x00, 0x00,  // vertresolution
                0x00, 0x00, 0x00, 0x00,  // reserved
                0x00, 0x01,              // frame count
                0x00, 0x00, 0x00, 0x00,  // compressorname
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x18, 0xff, 0xff,  // depth + pre_defined
        ]);

        // AVCSampleEntry, ISO/IEC 14496-15 section 5.3.4.1.
        // AVCConfigurationBox, ISO/IEC 14496-15 section 5.3.4.1.
        let avcc_len_pos = sample_entry.len();
        sample_entry.extend_from_slice(b"\x00\x00\x00\x00avcC");

        if let Some((sps, pps)) = raw_sps_and_pps {
            // Create the AVCDecoderConfiguration, ISO/IEC 14496-15 section 5.2.4.1.
            // The beginning of the AVCDecoderConfiguration takes a few values from
            // the SPS (ISO/IEC 14496-10 section 7.3.2.1.1). One caveat: that section
            // defines the syntax in terms of RBSP, not NAL. The difference is the
            // escaping of 00 00 01 and 00 00 02; see notes about
            // "emulation_prevention_three_byte" in ISO/IEC 14496-10 section 7.4.
            // It looks like 00 is not a valid value of profile_idc, so this distinction
            // shouldn't be relevant here. And ffmpeg seems to ignore it.
            sample_entry.push(1);       // configurationVersion
            sample_entry.push(sps[1]);  // profile_idc . AVCProfileIndication
            sample_entry.push(sps[2]);  // ...misc bits... . profile_compatibility
            sample_entry.push(sps[3]);  // level_idc . AVCLevelIndication

            // Hardcode lengthSizeMinusOne to 3, matching TransformSampleData's 4-byte
            // lengths.
            sample_entry.push(0xff);

            // Only support one SPS and PPS.
            // ffmpeg's ff_isom_write_avcc has the same limitation, so it's probably
            // fine. This next byte is a reserved 0b111 + a 5-bit # of SPSs (1).
            sample_entry.push(0xe1);
            sample_entry.write_u16::<BigEndian>(u16::try_from(sps.len())?)?;
            sample_entry.extend_from_slice(sps);
            sample_entry.push(1);  // # of PPSs.
            sample_entry.write_u16::<BigEndian>(u16::try_from(pps.len())?)?;
            sample_entry.extend_from_slice(pps);

        } else {
            sample_entry.extend_from_slice(extradata);
        };

        // Fix up avc1 and avcC box lengths.
        let cur_pos = sample_entry.len();
        BigEndian::write_u32(&mut sample_entry[avcc_len_pos .. avcc_len_pos + 4],
                             u32::try_from(cur_pos - avcc_len_pos)?);

        // PixelAspectRatioBox, ISO/IEC 14496-12 section 12.1.4.2.
        // Write a PixelAspectRatioBox if necessary, as the sub streams can be be anamorphic.
        let pasp = sps.vui_parameters.as_ref()
                                     .and_then(|v| v.aspect_ratio_info.as_ref())
                                     .and_then(|a| a.clone().get())
                                     .unwrap_or_else(|| default_pixel_aspect_ratio(width, height));
        if pasp != (1, 1) {
            sample_entry.extend_from_slice(b"\x00\x00\x00\x10pasp");  // length + box name
            sample_entry.write_u32::<BigEndian>(pasp.0.into())?;
            sample_entry.write_u32::<BigEndian>(pasp.1.into())?;
        }

        let cur_pos = sample_entry.len();
        BigEndian::write_u32(&mut sample_entry[avc1_len_pos .. avc1_len_pos + 4],
                             u32::try_from(cur_pos - avc1_len_pos)?);

        let profile_idc = sample_entry[103];
        let constraint_flags = sample_entry[104];
        let level_idc = sample_entry[105];

        let rfc6381_codec =
            format!("avc1.{:02x}{:02x}{:02x}", profile_idc, constraint_flags, level_idc);
        Ok(ExtraData {
            entry: db::VideoSampleEntryToInsert {
                data: sample_entry,
                rfc6381_codec,
                width,
                height,
                pasp_h_spacing: pasp.0,
                pasp_v_spacing: pasp.1,
            },
            need_transform,
        })
    }
}

/// Transforms sample data from Annex B format to AVC format. Should be called on samples iff
/// `ExtraData::need_transform` is true. Uses an out parameter `avc_sample` rather than a return
/// so that memory allocations can be reused from sample to sample.
pub fn transform_sample_data(annexb_sample: &[u8], avc_sample: &mut Vec<u8>) -> Result<(), Error> {
    // See AVCParameterSamples, ISO/IEC 14496-15 section 5.3.2.
    avc_sample.clear();

    // The output will be about as long as the input. Annex B stop codes require at least three
    // bytes; many seem to be four. The output lengths are exactly four.
    avc_sample.reserve(annexb_sample.len() + 4);
    decode_h264_annex_b(annexb_sample, |unit| {
        // 4-byte length; this must match ParseExtraData's lengthSizeMinusOne == 3.
        avc_sample.write_u32::<BigEndian>(unit.len() as u32)?;  // length
        avc_sample.extend_from_slice(unit);
        Ok(())
    })?;
    Ok(())
}

#[cfg(test)]
mod tests {
    use db::testutil;

    const ANNEX_B_TEST_INPUT: [u8; 35] = [
        0x00, 0x00, 0x00, 0x01, 0x67, 0x4d, 0x00, 0x1f,
        0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01,
        0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00,
        0x1d, 0x4c, 0x01, 0x00, 0x00, 0x00, 0x01, 0x68,
        0xee, 0x3c, 0x80,
    ];

    const AVC_DECODER_CONFIG_TEST_INPUT: [u8; 38] = [
        0x01, 0x4d, 0x00, 0x1f, 0xff, 0xe1, 0x00, 0x17,
        0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80,
        0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00,
        0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x01,
        0x00, 0x04, 0x68, 0xee, 0x3c, 0x80,
    ];

    const TEST_OUTPUT: [u8; 132] = [
        0x00, 0x00, 0x00, 0x84, 0x61, 0x76, 0x63, 0x31,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x05, 0x00, 0x02, 0xd0, 0x00, 0x48, 0x00, 0x00,
        0x00, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x18, 0xff, 0xff, 0x00, 0x00,
        0x00, 0x2e, 0x61, 0x76, 0x63, 0x43, 0x01, 0x4d,
        0x00, 0x1f, 0xff, 0xe1, 0x00, 0x17, 0x67, 0x4d,
        0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff,
        0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa,
        0x00, 0x00, 0x1d, 0x4c, 0x01, 0x01, 0x00, 0x04,
        0x68, 0xee, 0x3c, 0x80,
    ];

    #[test]
    fn test_decode() {
        testutil::init();
        let data = &ANNEX_B_TEST_INPUT;
        let mut pieces = Vec::new();
        super::decode_h264_annex_b(data, |p| {
            pieces.push(p);
            Ok(())
        }).unwrap();
        assert_eq!(&pieces, &[&data[4 .. 27], &data[31 ..]]);
    }

    #[test]
    fn test_sample_entry_from_avc_decoder_config() {
        testutil::init();
        let e = super::ExtraData::parse(&AVC_DECODER_CONFIG_TEST_INPUT, 1280, 720).unwrap();
        assert_eq!(&e.entry.data[..], &TEST_OUTPUT[..]);
        assert_eq!(e.entry.width, 1280);
        assert_eq!(e.entry.height, 720);
        assert_eq!(e.entry.rfc6381_codec, "avc1.4d001f");
        assert_eq!(e.need_transform, false);
    }

    #[test]
    fn test_sample_entry_from_annex_b() {
        testutil::init();
        let e = super::ExtraData::parse(&ANNEX_B_TEST_INPUT, 1280, 720).unwrap();
        assert_eq!(e.entry.width, 1280);
        assert_eq!(e.entry.height, 720);
        assert_eq!(e.entry.rfc6381_codec, "avc1.4d001f");
        assert_eq!(e.need_transform, true);
    }

    #[test]
    fn test_transform_sample_data() {
        testutil::init();
        const INPUT: [u8; 64] = [
            0x00, 0x00, 0x00, 0x01, 0x67, 0x4d, 0x00, 0x1f,
            0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01,
            0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00,
            0x1d, 0x4c, 0x01,

            0x00, 0x00, 0x00, 0x01, 0x68, 0xee, 0x3c, 0x80,

            0x00, 0x00, 0x00, 0x01, 0x06, 0x06, 0x01, 0xc4,
            0x80,

            0x00, 0x00, 0x00, 0x01, 0x65, 0x88, 0x80, 0x10,
            0x00, 0x08, 0x7f, 0x00, 0x5d, 0x27, 0xb5, 0xc1,
            0xff, 0x8c, 0xd6, 0x35,
            // (truncated)
        ];
        const EXPECTED_OUTPUT: [u8; 64] = [
            0x00, 0x00, 0x00, 0x17, 0x67, 0x4d, 0x00, 0x1f,
            0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01,
            0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00,
            0x1d, 0x4c, 0x01,

            0x00, 0x00, 0x00, 0x04, 0x68, 0xee, 0x3c, 0x80,

            0x00, 0x00, 0x00, 0x05, 0x06, 0x06, 0x01, 0xc4,
            0x80,

            0x00, 0x00, 0x00, 0x10, 0x65, 0x88, 0x80, 0x10,
            0x00, 0x08, 0x7f, 0x00, 0x5d, 0x27, 0xb5, 0xc1,
            0xff, 0x8c, 0xd6, 0x35,
        ];
        let mut out = Vec::new();
        super::transform_sample_data(&INPUT, &mut out).unwrap();
        assert_eq!(&out[..], &EXPECTED_OUTPUT[..]);
    }
}