From e04394f833d3382ee03c13e1c94af49f1d11ce7f Mon Sep 17 00:00:00 2001 From: Prabhat Roy Date: Mon, 28 Feb 2022 20:24:25 +0000 Subject: [PATCH 1/2] Improve test_video_reader --- test/test_video_reader.py | 1302 ++++++++++++++++++------------------- 1 file changed, 649 insertions(+), 653 deletions(-) diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 73c4d8a1b85..6d556b9802e 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -1,5 +1,4 @@ import collections -import itertools import math import os from fractions import Fraction @@ -112,7 +111,7 @@ # av_seek_frame is imprecise so seek to a timestamp earlier by a margin # The unit of margin is second -seek_frame_margin = 0.25 +SEEK_FRAME_MARGIN = 0.25 def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4): @@ -369,7 +368,8 @@ def compare_decoding_result(self, tv_result, ref_result, config=all_check_config assert_equal(atimebase, ref_result.atimebase) - def test_stress_test_read_video_from_file(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_stress_test_read_video_from_file(self, test_video): pytest.skip( "This stress test will iteratively decode the same set of videos." "It helps to detect memory leak but it takes lots of time to run." @@ -386,52 +386,12 @@ def test_stress_test_read_video_from_file(self): audio_timebase_num, audio_timebase_den = 0, 1 for _i in range(num_iter): - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - def test_read_video_from_file(self): - """ - Test the case when decoder starts with a video file to decode frames. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - for test_video, config in test_videos.items(): full_path = os.path.join(VIDEO_DIR, test_video) # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( + torch.ops.video_reader.read_video_from_file( full_path, - seek_frame_margin, + SEEK_FRAME_MARGIN, 0, # getPtsOnly 1, # readVideoStream width, @@ -450,14 +410,55 @@ def test_read_video_from_file(self): audio_timebase_num, audio_timebase_den, ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - # check results from TorchVision decoder - self.check_separate_decoding_result(tv_result, config) - # compare decoding results - self.compare_decoding_result(tv_result, pyav_result, config) - def test_read_video_from_file_read_single_stream_only(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_read_video_from_file(self, test_video, config): + """ + Test the case when decoder starts with a video file to decode frames. + """ + # video related + width, height, min_dimension, max_dimension = 0, 0, 0, 0 + video_start_pts, video_end_pts = 0, -1 + video_timebase_num, video_timebase_den = 0, 1 + # audio related + samples, channels = 0, 0 + audio_start_pts, audio_end_pts = 0, -1 + audio_timebase_num, audio_timebase_den = 0, 1 + + full_path = os.path.join(VIDEO_DIR, test_video) + + # pass 1: decode all frames using new decoder + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + # pass 2: decode all frames using av + pyav_result = _decode_frames_by_av_module(full_path) + # check results from TorchVision decoder + self.check_separate_decoding_result(tv_result, config) + # compare decoding results + self.compare_decoding_result(tv_result, pyav_result, config) + + @pytest.mark.parametrize("test_video,config", test_videos.items()) + @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)]) + def test_read_video_from_file_read_single_stream_only(self, test_video, config, read_video_stream, read_audio_stream): """ Test the case when decoder starts with a video file to decode frames, and only reads video stream and ignores audio stream @@ -471,57 +472,56 @@ def test_read_video_from_file_read_single_stream_only(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - for readVideoStream, readAudioStream in [(1, 0), (0, 1)]: - # decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - readVideoStream, - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - readAudioStream, - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - assert (vframes.numel() > 0) is bool(readVideoStream) - assert (vframe_pts.numel() > 0) is bool(readVideoStream) - assert (vtimebase.numel() > 0) is bool(readVideoStream) - assert (vfps.numel() > 0) is bool(readVideoStream) - - expect_audio_data = readAudioStream == 1 and config.audio_sample_rate is not None - assert (aframes.numel() > 0) is bool(expect_audio_data) - assert (aframe_pts.numel() > 0) is bool(expect_audio_data) - assert (atimebase.numel() > 0) is bool(expect_audio_data) - assert (asample_rate.numel() > 0) is bool(expect_audio_data) - - def test_read_video_from_file_rescale_min_dimension(self): + full_path = os.path.join(VIDEO_DIR, test_video) + # decode all frames using new decoder + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + read_video_stream, + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + read_audio_stream, + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + + ( + vframes, + vframe_pts, + vtimebase, + vfps, + vduration, + aframes, + aframe_pts, + atimebase, + asample_rate, + aduration, + ) = tv_result + + assert (vframes.numel() > 0) is bool(read_video_stream) + assert (vframe_pts.numel() > 0) is bool(read_video_stream) + assert (vtimebase.numel() > 0) is bool(read_video_stream) + assert (vfps.numel() > 0) is bool(read_video_stream) + + expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None + assert (aframes.numel() > 0) is bool(expect_audio_data) + assert (aframe_pts.numel() > 0) is bool(expect_audio_data) + assert (atimebase.numel() > 0) is bool(expect_audio_data) + assert (asample_rate.numel() > 0) is bool(expect_audio_data) + + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_min_dimension(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and video min dimension between height and width is set. @@ -535,33 +535,33 @@ def test_read_video_from_file_rescale_min_dimension(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - def test_read_video_from_file_rescale_max_dimension(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_max_dimension(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and video min dimension between height and width is set. @@ -575,33 +575,33 @@ def test_read_video_from_file_rescale_max_dimension(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - def test_read_video_from_file_rescale_both_min_max_dimension(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and video min dimension between height and width is set. @@ -615,34 +615,34 @@ def test_read_video_from_file_rescale_both_min_max_dimension(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) + assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - def test_read_video_from_file_rescale_width(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_width(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and video width is set. @@ -656,33 +656,33 @@ def test_read_video_from_file_rescale_width(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(2) == width + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert tv_result[0].size(2) == width - def test_read_video_from_file_rescale_height(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_height(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and video height is set. @@ -696,33 +696,33 @@ def test_read_video_from_file_rescale_height(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert tv_result[0].size(1) == height - def test_read_video_from_file_rescale_width_and_height(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_file_rescale_width_and_height(self, test_video): """ Test the case when decoder starts with a video file to decode frames, and both video height and width are set. @@ -736,93 +736,92 @@ def test_read_video_from_file_rescale_width_and_height(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - assert tv_result[0].size(2) == width + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert tv_result[0].size(1) == height + assert tv_result[0].size(2) == width - def test_read_video_from_file_audio_resampling(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("samples", [9600, 96000]) + def test_read_video_from_file_audio_resampling(self, test_video, samples): """ Test the case when decoder starts with a video file to decode frames, and audio waveform are resampled """ + # video related + width, height, min_dimension, max_dimension = 0, 0, 0, 0 + video_start_pts, video_end_pts = 0, -1 + video_timebase_num, video_timebase_den = 0, 1 + # audio related + channels = 0 + audio_start_pts, audio_end_pts = 0, -1 + audio_timebase_num, audio_timebase_den = 0, 1 - for samples in [9600, 96000]: # downsampling # upsampling - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - channels = 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - for test_video, _config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - if aframes.numel() > 0: - assert samples == asample_rate.item() - assert 1 == aframes.size(1) - # when audio stream is found - duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1]) - assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item()) - - def test_compare_read_video_from_memory_and_file(self): + full_path = os.path.join(VIDEO_DIR, test_video) + + tv_result = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + ( + vframes, + vframe_pts, + vtimebase, + vfps, + vduration, + aframes, + aframe_pts, + atimebase, + asample_rate, + aduration, + ) = tv_result + if aframes.numel() > 0: + assert samples == asample_rate.item() + assert 1 == aframes.size(1) + # when audio stream is found + duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1]) + assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item()) + + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_compare_read_video_from_memory_and_file(self, test_video, config): """ Test the case when video is already in memory, and decoder reads data in memory """ @@ -835,60 +834,60 @@ def test_compare_read_video_from_memory_and_file(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result_memory = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - self.check_separate_decoding_result(tv_result_memory, config) - # pass 2: decode all frames from file - tv_result_file = torch.ops.video_reader.read_video_from_file( - full_path, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) + full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + + # pass 1: decode all frames using cpp decoder + tv_result_memory = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + self.check_separate_decoding_result(tv_result_memory, config) + # pass 2: decode all frames from file + tv_result_file = torch.ops.video_reader.read_video_from_file( + full_path, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) - self.check_separate_decoding_result(tv_result_file, config) - # finally, compare results decoded from memory and file - self.compare_decoding_result(tv_result_memory, tv_result_file) + self.check_separate_decoding_result(tv_result_file, config) + # finally, compare results decoded from memory and file + self.compare_decoding_result(tv_result_memory, tv_result_file) - def test_read_video_from_memory(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_read_video_from_memory(self, test_video, config): """ Test the case when video is already in memory, and decoder reads data in memory """ @@ -901,38 +900,38 @@ def test_read_video_from_memory(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) + full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + + # pass 1: decode all frames using cpp decoder + tv_result = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + # pass 2: decode all frames using av + pyav_result = _decode_frames_by_av_module(full_path) - self.check_separate_decoding_result(tv_result, config) - self.compare_decoding_result(tv_result, pyav_result, config) + self.check_separate_decoding_result(tv_result, config) + self.compare_decoding_result(tv_result, pyav_result, config) - def test_read_video_from_memory_get_pts_only(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_read_video_from_memory_get_pts_only(self, test_video, config): """ Test the case when video is already in memory, and decoder reads data in memory. Compare frame pts between decoding for pts only and full decoding @@ -947,234 +946,234 @@ def test_read_video_from_memory_get_pts_only(self): audio_start_pts, audio_end_pts = 0, -1 audio_timebase_num, audio_timebase_den = 0, 1 - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert abs(config.video_fps - tv_result[3].item()) < 0.01 - - # pass 2: decode all frames to get PTS only using cpp decoder - tv_result_pts_only = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 1, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) + _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + + # pass 1: decode all frames using cpp decoder + tv_result = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + assert abs(config.video_fps - tv_result[3].item()) < 0.01 + + # pass 2: decode all frames to get PTS only using cpp decoder + tv_result_pts_only = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 1, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) - assert not tv_result_pts_only[0].numel() - assert not tv_result_pts_only[5].numel() - self.compare_decoding_result(tv_result, tv_result_pts_only) + assert not tv_result_pts_only[0].numel() + assert not tv_result_pts_only[5].numel() + self.compare_decoding_result(tv_result, tv_result_pts_only) - def test_read_video_in_range_from_memory(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128]) + def test_read_video_in_range_from_memory(self, test_video, config, num_frames): """ Test the case when video is already in memory, and decoder reads data in memory. In addition, decoder takes meaningful start- and end PTS as input, and decode frames within that interval """ - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, + full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + # video related + width, height, min_dimension, max_dimension = 0, 0, 0, 0 + video_start_pts, video_end_pts = 0, -1 + video_timebase_num, video_timebase_den = 0, 1 + # audio related + samples, channels = 0, 0 + audio_start_pts, audio_end_pts = 0, -1 + audio_timebase_num, audio_timebase_den = 0, 1 + # pass 1: decode all frames using new decoder + tv_result = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + ( + vframes, + vframe_pts, + vtimebase, + vfps, + vduration, + aframes, + aframe_pts, + atimebase, + asample_rate, + aduration, + ) = tv_result + assert abs(config.video_fps - vfps.item()) < 0.01 + + start_pts_ind_max = vframe_pts.size(0) - num_frames + if start_pts_ind_max <= 0: + return + # randomly pick start pts + start_pts_ind = randint(0, start_pts_ind_max) + end_pts_ind = start_pts_ind + num_frames - 1 + video_start_pts = vframe_pts[start_pts_ind] + video_end_pts = vframe_pts[end_pts_ind] + + video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1] + if len(atimebase) > 0: + # when audio stream is available + audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1] + audio_start_pts = _pts_convert( + video_start_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(audio_timebase_num.item(), audio_timebase_den.item()), + math.floor, + ) + audio_end_pts = _pts_convert( + video_end_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(audio_timebase_num.item(), audio_timebase_den.item()), + math.ceil, + ) + + # pass 2: decode frames in the randomly generated range + tv_result = torch.ops.video_reader.read_video_from_memory( + video_tensor, + SEEK_FRAME_MARGIN, + 0, # getPtsOnly + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + video_start_pts, + video_end_pts, + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + audio_start_pts, + audio_end_pts, + audio_timebase_num, + audio_timebase_den, + ) + + # pass 3: decode frames in range using PyAv + video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path) + + video_start_pts_av = _pts_convert( + video_start_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(video_timebase_av.numerator, video_timebase_av.denominator), + math.floor, + ) + video_end_pts_av = _pts_convert( + video_end_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(video_timebase_av.numerator, video_timebase_av.denominator), + math.ceil, + ) + if audio_timebase_av: + audio_start_pts = _pts_convert( + video_start_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), + math.floor, + ) + audio_end_pts = _pts_convert( + video_end_pts.item(), + Fraction(video_timebase_num.item(), video_timebase_den.item()), + Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), + math.ceil, ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - assert abs(config.video_fps - vfps.item()) < 0.01 - - for num_frames in [4, 8, 16, 32, 64, 128]: - start_pts_ind_max = vframe_pts.size(0) - num_frames - if start_pts_ind_max <= 0: - continue - # randomly pick start pts - start_pts_ind = randint(0, start_pts_ind_max) - end_pts_ind = start_pts_ind + num_frames - 1 - video_start_pts = vframe_pts[start_pts_ind] - video_end_pts = vframe_pts[end_pts_ind] - - video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1] - if len(atimebase) > 0: - # when audio stream is available - audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1] - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.ceil, - ) - - # pass 2: decode frames in the randomly generated range - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - seek_frame_margin, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - # pass 3: decode frames in range using PyAv - video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path) - - video_start_pts_av = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.floor, - ) - video_end_pts_av = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.ceil, - ) - if audio_timebase_av: - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.ceil, - ) - - pyav_result = _decode_frames_by_av_module( - full_path, - video_start_pts_av, - video_end_pts_av, - audio_start_pts, - audio_end_pts, - ) - - assert tv_result[0].size(0) == num_frames - if pyav_result.vframes.size(0) == num_frames: - # if PyAv decodes a different number of video frames, skip - # comparing the decoding results between Torchvision video reader - # and PyAv - self.compare_decoding_result(tv_result, pyav_result, config) - - def test_probe_video_from_file(self): + + pyav_result = _decode_frames_by_av_module( + full_path, + video_start_pts_av, + video_end_pts_av, + audio_start_pts, + audio_end_pts, + ) + + assert tv_result[0].size(0) == num_frames + if pyav_result.vframes.size(0) == num_frames: + # if PyAv decodes a different number of video frames, skip + # comparing the decoding results between Torchvision video reader + # and PyAv + self.compare_decoding_result(tv_result, pyav_result, config) + + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_probe_video_from_file(self, test_video, config): """ Test the case when decoder probes a video file """ - for test_video, config in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_file(full_path) - self.check_probe_result(probe_result, config) + full_path = os.path.join(VIDEO_DIR, test_video) + probe_result = torch.ops.video_reader.probe_video_from_file(full_path) + self.check_probe_result(probe_result, config) - def test_probe_video_from_memory(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_probe_video_from_memory(self, test_video, config): """ Test the case when decoder probes a video in memory """ - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor) - self.check_probe_result(probe_result, config) + _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor) + self.check_probe_result(probe_result, config) - def test_probe_video_from_memory_script(self): + @pytest.mark.parametrize("test_video,config", test_videos.items()) + def test_probe_video_from_memory_script(self, test_video, config): scripted_fun = torch.jit.script(io._probe_video_from_memory) assert scripted_fun is not None - for test_video, config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = scripted_fun(video_tensor) - self.check_meta_result(probe_result, config) + _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + probe_result = scripted_fun(video_tensor) + self.check_meta_result(probe_result, config) - def test_read_video_from_memory_scripted(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_read_video_from_memory_scripted(self, test_video): """ Test the case when video is already in memory, and decoder reads data in memory """ @@ -1190,29 +1189,28 @@ def test_read_video_from_memory_scripted(self): scripted_fun = torch.jit.script(io._read_video_from_memory) assert scripted_fun is not None - for test_video, _config in test_videos.items(): - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # decode all frames using cpp decoder - scripted_fun( - video_tensor, - seek_frame_margin, - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - [video_start_pts, video_end_pts], - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - [audio_start_pts, audio_end_pts], - audio_timebase_num, - audio_timebase_den, - ) - # FUTURE: check value of video / audio frames + _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) + + # decode all frames using cpp decoder + scripted_fun( + video_tensor, + SEEK_FRAME_MARGIN, + 1, # readVideoStream + width, + height, + min_dimension, + max_dimension, + [video_start_pts, video_end_pts], + video_timebase_num, + video_timebase_den, + 1, # readAudioStream + samples, + channels, + [audio_start_pts, audio_end_pts], + audio_timebase_num, + audio_timebase_den, + ) + # FUTURE: check value of video / audio frames def test_invalid_file(self): set_video_backend("video_reader") @@ -1223,33 +1221,31 @@ def test_invalid_file(self): with pytest.raises(RuntimeError): io.read_video("foo.mp4") - def test_audio_present_pts(self): + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) + @pytest.mark.parametrize("start_offset", [0, 1000]) + @pytest.mark.parametrize("end_offset", [3000, None]) + def test_audio_present_pts(self, test_video, backend, start_offset, end_offset): """Test if audio frames are returned with pts unit.""" - backends = ["video_reader", "pyav"] - start_offsets = [0, 1000] - end_offsets = [3000, None] - for test_video, _ in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - for backend, start_offset, end_offset in itertools.product(backends, start_offsets, end_offsets): - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts") - assert all([dimension > 0 for dimension in audio.shape[:2]]) - - def test_audio_present_sec(self): + full_path = os.path.join(VIDEO_DIR, test_video) + container = av.open(full_path) + if container.streams.audio: + set_video_backend(backend) + _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts") + assert all([dimension > 0 for dimension in audio.shape[:2]]) + + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) + @pytest.mark.parametrize("start_offset", [0, 0.1]) + @pytest.mark.parametrize("end_offset", [0.3, None]) + def test_audio_present_sec(self, test_video, backend, start_offset, end_offset): """Test if audio frames are returned with sec unit.""" - backends = ["video_reader", "pyav"] - start_offsets = [0, 0.1] - end_offsets = [0.3, None] - for test_video, _ in test_videos.items(): - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - for backend, start_offset, end_offset in itertools.product(backends, start_offsets, end_offsets): - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec") - assert all([dimension > 0 for dimension in audio.shape[:2]]) + full_path = os.path.join(VIDEO_DIR, test_video) + container = av.open(full_path) + if container.streams.audio: + set_video_backend(backend) + _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec") + assert all([dimension > 0 for dimension in audio.shape[:2]]) if __name__ == "__main__": From 04bee90d7a5547e1fb60c35b99abe4dd3d015bf2 Mon Sep 17 00:00:00 2001 From: Prabhat Roy Date: Mon, 28 Feb 2022 22:12:46 +0000 Subject: [PATCH 2/2] Fix linter error --- test/test_video_reader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 6d556b9802e..c3b0487f153 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -458,7 +458,9 @@ def test_read_video_from_file(self, test_video, config): @pytest.mark.parametrize("test_video,config", test_videos.items()) @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)]) - def test_read_video_from_file_read_single_stream_only(self, test_video, config, read_video_stream, read_audio_stream): + def test_read_video_from_file_read_single_stream_only( + self, test_video, config, read_video_stream, read_audio_stream + ): """ Test the case when decoder starts with a video file to decode frames, and only reads video stream and ignores audio stream