Skip to content

Commit

Permalink
Yt live from start range (#2)
Browse files Browse the repository at this point in the history
* [utils] Add hackish 'now' support for --download-sections

* [utils] Add microseconds to unified_timestamp

* [common] Extract start and end keys for Dash fragments

* [utils] Allow using local timezone for 'now' timestamps

* Use local timezone for download sections

* Add fixme in modified parse_chapters function

A range like '*(now-1hour)-(now-30minutes)' doesn't work

* [youtube] Support --download-sections for YT Livestream from start

* Create last_segment_url only if necessary

* Improve parse_chapters comments

* Fix linter

* [extractor/iq] Set more language codes (yt-dlp#6476)

Authored by: D0LLYNH0

* [extractor/opencast] Add ltitools to `_VALID_URL` (yt-dlp#6371)

Authored by: C0D3D3V

* [downloader/curl] Fix progress reporting

Bug in 8c53322
Closes yt-dlp#6490

* [extractor/youtube] Bypass throttling for `-f17`

and related cleanup

Thanks @AudricV for the finding

* [extractor/twitch] Fix `is_live` (yt-dlp#6500)

Closes yt-dlp#6494
Authored by: elyse0

* [extractor/cbc:gem] Update `_VALID_URL` (yt-dlp#6499)

Authored by: makeworld-the-better-one
Closes yt-dlp#6395

* Support loading info.json with a list at its root

* [extractor/hidive] Fix login

Fixes yt-dlp#6493 (comment)

* [extractor/opencast] Fix format bug (yt-dlp#6512)

Authored by: C0D3D3V

* [extractor/rokfin] Re-construct manifest url (yt-dlp#6507)

Authored by: vampirefrog

* [extractor/youtube] Add client name to `format_note` when `-v` (yt-dlp#6254)

Authored by: Lesmiscore, pukkandan

* [extractor/youtube] Add extractor-arg `include_duplicate_formats`

* [extractor/youtube] Construct fragment list lazily

Building the fragment list for all formats takes significant time for large videos

* Support negative durations

* Revert "[utils] Allow using local timezone for 'now' timestamps"

This reverts commit 1799a6a.

* Add fragment count

* Fix unified_timestamp

* Remove tz_aware date code

* Add debug for selected section

* Add initial documentation

* Fix linter

* Fix linter

* Allow days in parse_duration

* Improve option documentation

* Add some documentation

* Lock less aggressively

This gives a speed improvement of about 30%

* Fix return values of _extract_sequence_from_mpd

* Always compute last_seq

* Support for epoch timestamps

* Update options docs

* Restore README.md

I think this is auto-generated by some script

* Add warning about --download-sections without --live-from-start

* Fix bug after merge

* Update yt_dlp/options.py

* Cleanup

---------

Co-authored-by: Elyse <26639800+elyse0@users.noreply.github.com>
Co-authored-by: Sophire <115919609+sophie0x@users.noreply.github.com>
Co-authored-by: D0LLYNH0 <67797325+D0LLYNH0@users.noreply.github.com>
Co-authored-by: Daniel Vogt <daniel-vogt@mail.de>
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
Co-authored-by: makeworld <25111343+makeworld-the-better-one@users.noreply.github.com>
Co-authored-by: Daniel Vogt <c0d3d3v@mag-keinen-spam.de>
Co-authored-by: vampirefrog <vampirefrog@users.noreply.github.com>
Co-authored-by: Lesmiscore <nao20010128@gmail.com>
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: bashonly <bashonly@bashonly.com>
  • Loading branch information
12 people committed Jul 30, 2023
1 parent 6014355 commit 610b834
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 34 deletions.
5 changes: 5 additions & 0 deletions test/test_utils.py
Expand Up @@ -413,10 +413,15 @@ def test_unified_timestamps(self):
self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667)

self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1)
self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86)
self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78)
self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646)
# ISO8601 spec says that if no timezone is specified, we should use local timezone;
# but yt-dlp uses UTC to keep things consistent
self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314)

def test_determine_ext(self):
self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
Expand Down
9 changes: 7 additions & 2 deletions yt_dlp/YoutubeDL.py
Expand Up @@ -27,7 +27,12 @@
from .compat import functools, urllib # isort: split
from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req
from .cookies import LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader import (
DashSegmentsFD,
FFmpegFD,
get_suitable_downloader,
shorten_protocol_name,
)
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
Expand Down Expand Up @@ -3289,7 +3294,7 @@ def existing_video_file(*filepaths):
fd, success = None, True
if info_dict.get('protocol') or info_dict.get('url'):
fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and (
info_dict.get('section_start') or info_dict.get('section_end')):
msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
else 'You have requested downloading the video partially, but ffmpeg is not installed')
Expand Down
11 changes: 9 additions & 2 deletions yt_dlp/__init__.py
Expand Up @@ -13,6 +13,7 @@
import os
import re
import sys
import time
import traceback

from .compat import compat_shlex_quote
Expand Down Expand Up @@ -328,12 +329,13 @@ def parse_chapters(name, value, advanced=False):
(?P<end_sign>-?)(?P<end>[^-]+)
)?'''

current_time = time.time()
chapters, ranges, from_url = [], [], False
for regex in value or []:
if advanced and regex == '*from-url':
from_url = True
continue
elif not regex.startswith('*'):
elif not regex.startswith('*') and not regex.startswith('#'):
try:
chapters.append(re.compile(regex))
except re.error as err:
Expand All @@ -350,11 +352,16 @@ def parse_chapters(name, value, advanced=False):
err = 'Must be of the form "*start-end"'
elif not advanced and any(signs):
err = 'Negative timestamps are not allowed'
else:
elif regex.startswith('*'):
dur[0] *= -1 if signs[0] else 1
dur[1] *= -1 if signs[1] else 1
if dur[1] == float('-inf'):
err = '"-inf" is not a valid end'
elif regex.startswith('#'):
dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time
dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time
if dur[1] == float('-inf'):
err = '"-inf" is not a valid end'
if err:
raise ValueError(f'invalid {name} time range "{regex}". {err}')
ranges.append(dur)
Expand Down
2 changes: 2 additions & 0 deletions yt_dlp/downloader/dash.py
Expand Up @@ -33,6 +33,8 @@ def real_download(self, filename, info_dict):
'filename': fmt.get('filepath') or filename,
'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
'total_frags': fragment_count,
'section_start': info_dict.get('section_start'),
'section_end': info_dict.get('section_end'),
}

if real_downloader:
Expand Down
21 changes: 18 additions & 3 deletions yt_dlp/extractor/common.py
Expand Up @@ -2592,7 +2592,7 @@ def extract_common(source):
r = int(s.get('r', 0))
ms_info['total_number'] += 1 + r
ms_info['s'].append({
't': int(s.get('t', 0)),
't': int_or_none(s.get('t')),
# @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
'd': int(s.attrib['d']),
'r': r,
Expand Down Expand Up @@ -2634,9 +2634,16 @@ def extract_Initialization(source):
return ms_info

mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
availability_start_time = unified_timestamp(
mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
# segmentIngestTime is completely out of spec, but YT Livestream do this
segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
if segment_ingest_time:
availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)

period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
Expand Down Expand Up @@ -2810,13 +2817,17 @@ def add_segment_url():
'Bandwidth': bandwidth,
'Number': segment_number,
}
duration = float_or_none(segment_d, representation_ms_info['timescale'])
start = float_or_none(segment_time, representation_ms_info['timescale'])
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
'duration': duration,
'start': availability_start_time + start,
'end': availability_start_time + start + duration,
})

for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_time = s['t'] if s.get('t') is not None else segment_time
segment_d = s['d']
add_segment_url()
segment_number += 1
Expand All @@ -2832,15 +2843,19 @@ def add_segment_url():
fragments = []
segment_index = 0
timescale = representation_ms_info['timescale']
start = 0
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
location_key(segment_uri): segment_uri,
'duration': duration,
'start': availability_start_time + start,
'end': availability_start_time + start + duration,
})
segment_index += 1
start += duration
representation_ms_info['fragments'] = fragments
elif 'segment_urls' in representation_ms_info:
# Segment URLs with no SegmentTimeline
Expand Down
68 changes: 50 additions & 18 deletions yt_dlp/extractor/youtube.py
Expand Up @@ -2780,17 +2780,17 @@ def refetch_manifest(format_id, delay):
microformats = traverse_obj(
prs, (..., 'microformat', 'playerMicroformatRenderer'),
expected_type=dict)
_, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
is_live = live_status == 'is_live'
start_time = time.time()
with lock:
_, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
is_live = live_status == 'is_live'
start_time = time.time()

def mpd_feed(format_id, delay):
"""
@returns (manifest_url, manifest_stream_number, is_live) or None
"""
for retry in self.RetryManager(fatal=False):
with lock:
refetch_manifest(format_id, delay)
refetch_manifest(format_id, delay)

f = next((f for f in formats if f['format_id'] == format_id), None)
if not f:
Expand Down Expand Up @@ -2821,6 +2821,11 @@ def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, m
begin_index = 0
download_start_time = ctx.get('start') or time.time()

section_start = ctx.get('section_start') or 0
section_end = ctx.get('section_end') or math.inf

self.write_debug(f'Selected section: {section_start} -> {section_end}')

lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
if lack_early_segments:
self.report_warning(bug_reports_message(
Expand All @@ -2841,9 +2846,10 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
or (mpd_url, stream_number, False))
if not refresh_sequence:
if expire_fast and not is_live:
return False, last_seq
return False
elif old_mpd_url == mpd_url:
return True, last_seq
return True

if manifestless_orig_fmt:
fmt_info = manifestless_orig_fmt
else:
Expand All @@ -2854,14 +2860,13 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
fmts = None
if not fmts:
no_fragment_score += 2
return False, last_seq
return False
fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
fragments = fmt_info['fragments']
fragment_base_url = fmt_info['fragment_base_url']
assert fragment_base_url

_last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
return True, _last_seq
return True

self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
while is_live:
Expand All @@ -2881,11 +2886,19 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
last_segment_url = None
continue
else:
should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15)
no_fragment_score += 2
if not should_continue:
continue

last_fragment = fragments[-1]
last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))

known_fragment = next(
(fragment for fragment in fragments if f'sq/{known_idx}' in fragment['path']), None)
if known_fragment and known_fragment['end'] > section_end:
break

if known_idx > last_seq:
last_segment_url = None
continue
Expand All @@ -2895,20 +2908,36 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
if begin_index < 0 and known_idx < 0:
# skip from the start when it's negative value
known_idx = last_seq + begin_index

if lack_early_segments:
known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration']))

fragment_count = last_seq - known_idx if section_end == math.inf else int(
(section_end - section_start) // last_fragment['duration'])

try:
for idx in range(known_idx, last_seq):
# do not update sequence here or you'll get skipped some part of it
should_continue, _ = _extract_sequence_from_mpd(False, False)
should_continue = _extract_sequence_from_mpd(False, False)
if not should_continue:
known_idx = idx - 1
raise ExtractorError('breaking out of outer loop')
last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
yield {
'url': last_segment_url,
'fragment_count': last_seq,
}

frag_duration = last_fragment['duration']
frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
frag_end = frag_start + frag_duration

if frag_start >= section_start and frag_end <= section_end:
last_segment_url = urljoin(fragment_base_url, f'sq/{idx}')

yield {
'url': last_segment_url,
'fragment_count': fragment_count,
'duration': frag_duration,
'start': frag_start,
'end': frag_end,
}

if known_idx == last_seq:
no_fragment_score += 5
else:
Expand Down Expand Up @@ -3894,6 +3923,9 @@ def build_fragments(f):
dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
yield dct

if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'):
self.report_warning('For YT livestreams, --download-sections is only supported with --live-from-start')

needs_live_processing = self._needs_live_processing(live_status, duration)
skip_bad_formats = 'incomplete' not in format_types
if self._configuration_arg('include_incomplete_formats'):
Expand Down
9 changes: 8 additions & 1 deletion yt_dlp/options.py
Expand Up @@ -416,7 +416,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
general.add_option(
'--live-from-start',
action='store_true', dest='live_from_start',
help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). '
'Time ranges can be specified using --download-sections to download only a part of the stream. '
'Negative values are allowed for specifying a relative previous time, using the # syntax '
'e.g. --download-sections "#-24hours - 0" (download last 24 hours), '
'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), '
'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). '
'It is also possible to specify an exact unix timestamp range, using the * syntax, '
'e.g. --download-sections "*1672531200 - 1672549200" (download between those two timestamps)'))
general.add_option(
'--no-live-from-start',
action='store_false', dest='live_from_start',
Expand Down
21 changes: 13 additions & 8 deletions yt_dlp/utils/_utils.py
Expand Up @@ -1192,7 +1192,7 @@ def unified_strdate(date_str, day_first=True):
return str(upload_date)


def unified_timestamp(date_str, day_first=True):
def unified_timestamp(date_str, day_first=True, with_milliseconds=False):
if not isinstance(date_str, str):
return None

Expand All @@ -1218,7 +1218,7 @@ def unified_timestamp(date_str, day_first=True):
for expression in date_formats(day_first):
with contextlib.suppress(ValueError):
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
return calendar.timegm(dt.timetuple())
return calendar.timegm(dt.timetuple()) + (dt.microsecond / 1e6 if with_milliseconds else 0)

timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
Expand Down Expand Up @@ -1997,16 +1997,19 @@ def parse_duration(s):

days, hours, mins, secs, ms = [None] * 5
m = re.match(r'''(?x)
(?P<sign>[+-])?
(?P<before_secs>
(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
(?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
(?P<ms>[.:][0-9]+)?Z?$
''', s)
if m:
days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms')
else:
m = re.match(
r'''(?ix)(?:P?
r'''(?ix)(?:
(?P<sign>[+-])?
P?
(?:
[0-9]+\s*y(?:ears?)?,?\s*
)?
Expand All @@ -2030,17 +2033,19 @@ def parse_duration(s):
(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
)?Z?$''', s)
if m:
days, hours, mins, secs, ms = m.groups()
sign, days, hours, mins, secs, ms = m.groups()
else:
m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
m = re.match(r'(?i)(?P<sign>[+-])?(?:(?P<days>[0-9.]+)\s*(?:days?)|(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
if m:
hours, mins = m.groups()
sign, days, hours, mins = m.groups()
else:
return None

sign = -1 if sign == '-' else 1

if ms:
ms = ms.replace(':', '.')
return sum(float(part or 0) * mult for part, mult in (
return sign * sum(float(part or 0) * mult for part, mult in (
(days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))


Expand Down

0 comments on commit 610b834

Please sign in to comment.