Yt live from start range (#2)

* [utils] Add hackish 'now' support for --download-sections * [utils] Add microseconds to unified_timestamp * [common] Extract start and end keys for Dash fragments * [utils] Allow using local timezone for 'now' timestamps * Use local timezone for download sections * Add fixme in modified parse_chapters function A range like '*(now-1hour)-(now-30minutes)' doesn't work * [youtube] Support --download-sections for YT Livestream from start * Create last_segment_url only if necessary * Improve parse_chapters comments * Fix linter * [extractor/iq] Set more language codes (yt-dlp#6476) Authored by: D0LLYNH0 * [extractor/opencast] Add ltitools to `_VALID_URL` (yt-dlp#6371) Authored by: C0D3D3V * [downloader/curl] Fix progress reporting Bug in 8c53322 Closes yt-dlp#6490 * [extractor/youtube] Bypass throttling for `-f17` and related cleanup Thanks @AudricV for the finding * [extractor/twitch] Fix `is_live` (yt-dlp#6500) Closes yt-dlp#6494 Authored by: elyse0 * [extractor/cbc:gem] Update `_VALID_URL` (yt-dlp#6499) Authored by: makeworld-the-better-one Closes yt-dlp#6395 * Support loading info.json with a list at its root * [extractor/hidive] Fix login Fixes yt-dlp#6493 (comment) * [extractor/opencast] Fix format bug (yt-dlp#6512) Authored by: C0D3D3V * [extractor/rokfin] Re-construct manifest url (yt-dlp#6507) Authored by: vampirefrog * [extractor/youtube] Add client name to `format_note` when `-v` (yt-dlp#6254) Authored by: Lesmiscore, pukkandan * [extractor/youtube] Add extractor-arg `include_duplicate_formats` * [extractor/youtube] Construct fragment list lazily Building fragment list for all formats takes significant time for large videos * Support negative durations * Revert "[utils] Allow using local timezone for 'now' timestamps" This reverts commit 1799a6a.
* Add fragment count * Fix unified_timestamp * Remove tz_aware date code * Add debug for selected section * Add initial documentation * Fix linter * Fix linter * Allow days in parse_duration * Improve option documentation * Add some documentation * Lock less aggressively This gives a speed improvement of about 30% * Fix return values of _extract_sequence_from_mpd * Always compute last_seq * Support for epoch timestamps * Update options docs * Restore README.md I think this is auto-generated by some script * Add warning about --download-sections without --live-from-start * Fix bug after merge * Update yt_dlp/options.py * Cleanup --------- Co-authored-by: Elyse <26639800+elyse0@users.noreply.github.com> Co-authored-by: Sophire <115919609+sophie0x@users.noreply.github.com> Co-authored-by: D0LLYNH0 <67797325+D0LLYNH0@users.noreply.github.com> Co-authored-by: Daniel Vogt <daniel-vogt@mail.de> Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> Co-authored-by: makeworld <25111343+makeworld-the-better-one@users.noreply.github.com> Co-authored-by: Daniel Vogt <c0d3d3v@mag-keinen-spam.de> Co-authored-by: vampirefrog <vampirefrog@users.noreply.github.com> Co-authored-by: Lesmiscore <nao20010128@gmail.com> Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> Co-authored-by: bashonly <bashonly@bashonly.com>
qa4FKm3mUr · Jul 30, 2023 · 610b834 · 610b834
1 parent 6014355
commit 610b834
Show file tree

Hide file tree

Showing 8 changed files with 112 additions and 34 deletions.
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -413,10 +413,15 @@ def test_unified_timestamps(self):
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
         self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667)
 
         self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1)
         self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86)
         self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78)
+        self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646)
+        # ISO8601 spec says that if no timezone is specified, we should use local timezone;
+        # but yt-dlp uses UTC to keep things consistent
+        self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314)
 
     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
@@ -27,7 +27,12 @@
 from .compat import functools, urllib  # isort: split
 from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req
 from .cookies import LenientSimpleCookie, load_cookies
-from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader import (
+    DashSegmentsFD,
+    FFmpegFD,
+    get_suitable_downloader,
+    shorten_protocol_name,
+)
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
 from .extractor.common import UnsupportedURLIE
@@ -3289,7 +3294,7 @@ def existing_video_file(*filepaths):
                 fd, success = None, True
                 if info_dict.get('protocol') or info_dict.get('url'):
                     fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
-                    if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
+                    if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and (
                             info_dict.get('section_start') or info_dict.get('section_end')):
                         msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
                                else 'You have requested downloading the video partially, but ffmpeg is not installed')

diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
@@ -13,6 +13,7 @@
 import os
 import re
 import sys
+import time
 import traceback
 
 from .compat import compat_shlex_quote
@@ -328,12 +329,13 @@ def parse_chapters(name, value, advanced=False):
             (?P<end_sign>-?)(?P<end>[^-]+)
         )?'''
 
+        current_time = time.time()
         chapters, ranges, from_url = [], [], False
         for regex in value or []:
             if advanced and regex == '*from-url':
                 from_url = True
                 continue
-            elif not regex.startswith('*'):
+            elif not regex.startswith('*') and not regex.startswith('#'):
                 try:
                     chapters.append(re.compile(regex))
                 except re.error as err:
@@ -350,11 +352,16 @@ def parse_chapters(name, value, advanced=False):
                     err = 'Must be of the form "*start-end"'
                 elif not advanced and any(signs):
                     err = 'Negative timestamps are not allowed'
-                else:
+                elif regex.startswith('*'):
                     dur[0] *= -1 if signs[0] else 1
                     dur[1] *= -1 if signs[1] else 1
                     if dur[1] == float('-inf'):
                         err = '"-inf" is not a valid end'
+                elif regex.startswith('#'):
+                    dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time
+                    dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time
+                    if dur[1] == float('-inf'):
+                        err = '"-inf" is not a valid end'
                 if err:
                     raise ValueError(f'invalid {name} time range "{regex}". {err}')
                 ranges.append(dur)

diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py
@@ -33,6 +33,8 @@ def real_download(self, filename, info_dict):
                 'filename': fmt.get('filepath') or filename,
                 'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
                 'total_frags': fragment_count,
+                'section_start': info_dict.get('section_start'),
+                'section_end': info_dict.get('section_end'),
             }
 
             if real_downloader:

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
@@ -2592,7 +2592,7 @@ def extract_common(source):
                             r = int(s.get('r', 0))
                             ms_info['total_number'] += 1 + r
                             ms_info['s'].append({
-                                't': int(s.get('t', 0)),
+                                't': int_or_none(s.get('t')),
                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                 'd': int(s.attrib['d']),
                                 'r': r,
@@ -2634,9 +2634,16 @@ def extract_Initialization(source):
             return ms_info
 
         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        availability_start_time = unified_timestamp(
+            mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
         formats, subtitles = [], {}
         stream_numbers = collections.defaultdict(int)
         for period in mpd_doc.findall(_add_ns('Period')):
+            # segmentIngestTime is completely out of spec, but YT Livestream do this
+            segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
+            if segment_ingest_time:
+                availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
+
             period_duration = parse_duration(period.get('duration')) or mpd_duration
             period_ms_info = extract_multisegment_info(period, {
                 'start_number': 1,
@@ -2810,13 +2817,17 @@ def add_segment_url():
                                     'Bandwidth': bandwidth,
                                     'Number': segment_number,
                                 }
+                                duration = float_or_none(segment_d, representation_ms_info['timescale'])
+                                start = float_or_none(segment_time, representation_ms_info['timescale'])
                                 representation_ms_info['fragments'].append({
                                     media_location_key: segment_url,
-                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                    'duration': duration,
+                                    'start': availability_start_time + start,
+                                    'end': availability_start_time + start + duration,
                                 })
 
                             for num, s in enumerate(representation_ms_info['s']):
-                                segment_time = s.get('t') or segment_time
+                                segment_time = s['t'] if s.get('t') is not None else segment_time
                                 segment_d = s['d']
                                 add_segment_url()
                                 segment_number += 1
@@ -2832,15 +2843,19 @@ def add_segment_url():
                         fragments = []
                         segment_index = 0
                         timescale = representation_ms_info['timescale']
+                        start = 0
                         for s in representation_ms_info['s']:
                             duration = float_or_none(s['d'], timescale)
                             for r in range(s.get('r', 0) + 1):
                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
                                 fragments.append({
                                     location_key(segment_uri): segment_uri,
                                     'duration': duration,
+                                    'start': availability_start_time + start,
+                                    'end': availability_start_time + start + duration,
                                 })
                                 segment_index += 1
+                                start += duration
                         representation_ms_info['fragments'] = fragments
                     elif 'segment_urls' in representation_ms_info:
                         # Segment URLs with no SegmentTimeline

diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
@@ -2780,17 +2780,17 @@ def refetch_manifest(format_id, delay):
             microformats = traverse_obj(
                 prs, (..., 'microformat', 'playerMicroformatRenderer'),
                 expected_type=dict)
-            _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
-            is_live = live_status == 'is_live'
-            start_time = time.time()
+            with lock:
+                _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
+                is_live = live_status == 'is_live'
+                start_time = time.time()
 
         def mpd_feed(format_id, delay):
             """
             @returns (manifest_url, manifest_stream_number, is_live) or None
             """
             for retry in self.RetryManager(fatal=False):
-                with lock:
-                    refetch_manifest(format_id, delay)
+                refetch_manifest(format_id, delay)
 
                 f = next((f for f in formats if f['format_id'] == format_id), None)
                 if not f:
@@ -2821,6 +2821,11 @@ def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, m
         begin_index = 0
         download_start_time = ctx.get('start') or time.time()
 
+        section_start = ctx.get('section_start') or 0
+        section_end = ctx.get('section_end') or math.inf
+
+        self.write_debug(f'Selected section: {section_start} -> {section_end}')
+
         lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
         if lack_early_segments:
             self.report_warning(bug_reports_message(
@@ -2841,9 +2846,10 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                                                or (mpd_url, stream_number, False))
             if not refresh_sequence:
                 if expire_fast and not is_live:
-                    return False, last_seq
+                    return False
                 elif old_mpd_url == mpd_url:
-                    return True, last_seq
+                    return True
+
             if manifestless_orig_fmt:
                 fmt_info = manifestless_orig_fmt
             else:
@@ -2854,14 +2860,13 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                     fmts = None
                 if not fmts:
                     no_fragment_score += 2
-                    return False, last_seq
+                    return False
                 fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
             fragments = fmt_info['fragments']
             fragment_base_url = fmt_info['fragment_base_url']
             assert fragment_base_url
 
-            _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
-            return True, _last_seq
+            return True
 
         self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
         while is_live:
@@ -2881,11 +2886,19 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                     last_segment_url = None
                     continue
             else:
-                should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+                should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15)
                 no_fragment_score += 2
                 if not should_continue:
                     continue
 
+            last_fragment = fragments[-1]
+            last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+
+            known_fragment = next(
+                (fragment for fragment in fragments if f'sq/{known_idx}' in fragment['path']), None)
+            if known_fragment and known_fragment['end'] > section_end:
+                break
+
             if known_idx > last_seq:
                 last_segment_url = None
                 continue
@@ -2895,20 +2908,36 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
             if begin_index < 0 and known_idx < 0:
                 # skip from the start when it's negative value
                 known_idx = last_seq + begin_index
+
             if lack_early_segments:
-                known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+                known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration']))
+
+            fragment_count = last_seq - known_idx if section_end == math.inf else int(
+                (section_end - section_start) // last_fragment['duration'])
+
             try:
                 for idx in range(known_idx, last_seq):
                     # do not update sequence here or you'll get skipped some part of it
-                    should_continue, _ = _extract_sequence_from_mpd(False, False)
+                    should_continue = _extract_sequence_from_mpd(False, False)
                     if not should_continue:
                         known_idx = idx - 1
                         raise ExtractorError('breaking out of outer loop')
-                    last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
-                    yield {
-                        'url': last_segment_url,
-                        'fragment_count': last_seq,
-                    }
+
+                    frag_duration = last_fragment['duration']
+                    frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
+                    frag_end = frag_start + frag_duration
+
+                    if frag_start >= section_start and frag_end <= section_end:
+                        last_segment_url = urljoin(fragment_base_url, f'sq/{idx}')
+
+                        yield {
+                            'url': last_segment_url,
+                            'fragment_count': fragment_count,
+                            'duration': frag_duration,
+                            'start': frag_start,
+                            'end': frag_end,
+                        }
+
                 if known_idx == last_seq:
                     no_fragment_score += 5
                 else:
@@ -3894,6 +3923,9 @@ def build_fragments(f):
                 dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
                 yield dct
 
+        if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'):
+            self.report_warning('For YT livestreams, --download-sections is only supported with --live-from-start')
+
         needs_live_processing = self._needs_live_processing(live_status, duration)
         skip_bad_formats = 'incomplete' not in format_types
         if self._configuration_arg('include_incomplete_formats'):

diff --git a/yt_dlp/options.py b/yt_dlp/options.py
@@ -416,7 +416,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
     general.add_option(
         '--live-from-start',
         action='store_true', dest='live_from_start',
-        help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
+        help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). '
+              'Time ranges can be specified using --download-sections to download only a part of the stream. '
+              'Negative values are allowed for specifying a relative previous time, using the # syntax '
+              'e.g. --download-sections "#-24hours - 0" (download last 24 hours), '
+              'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), '
+              'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). '
+              'It is also possible to specify an exact unix timestamp range, using the * syntax, '
+              'e.g. --download-sections "*1672531200 - 1672549200" (download between those two timestamps)'))
     general.add_option(
         '--no-live-from-start',
         action='store_false', dest='live_from_start',

diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
@@ -1192,7 +1192,7 @@ def unified_strdate(date_str, day_first=True):
         return str(upload_date)
 
 
-def unified_timestamp(date_str, day_first=True):
+def unified_timestamp(date_str, day_first=True, with_milliseconds=False):
     if not isinstance(date_str, str):
         return None
 
@@ -1218,7 +1218,7 @@ def unified_timestamp(date_str, day_first=True):
     for expression in date_formats(day_first):
         with contextlib.suppress(ValueError):
             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
-            return calendar.timegm(dt.timetuple())
+            return calendar.timegm(dt.timetuple()) + (dt.microsecond / 1e6 if with_milliseconds else 0)
 
     timetuple = email.utils.parsedate_tz(date_str)
     if timetuple:
@@ -1997,16 +1997,19 @@ def parse_duration(s):
 
     days, hours, mins, secs, ms = [None] * 5
     m = re.match(r'''(?x)
+            (?P<sign>[+-])?
             (?P<before_secs>
                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
             (?P<ms>[.:][0-9]+)?Z?$
         ''', s)
     if m:
-        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
+        sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms')
     else:
         m = re.match(
-            r'''(?ix)(?:P?
+            r'''(?ix)(?:
+                (?P<sign>[+-])?
+                P?
                 (?:
                     [0-9]+\s*y(?:ears?)?,?\s*
                 )?
@@ -2030,17 +2033,19 @@ def parse_duration(s):
                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                 )?Z?$''', s)
         if m:
-            days, hours, mins, secs, ms = m.groups()
+            sign, days, hours, mins, secs, ms = m.groups()
         else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'(?i)(?P<sign>[+-])?(?:(?P<days>[0-9.]+)\s*(?:days?)|(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
             if m:
-                hours, mins = m.groups()
+                sign, days, hours, mins = m.groups()
             else:
                 return None
 
+    sign = -1 if sign == '-' else 1
+
     if ms:
         ms = ms.replace(':', '.')
-    return sum(float(part or 0) * mult for part, mult in (
+    return sign * sum(float(part or 0) * mult for part, mult in (
         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))