diff --git a/EchoCourse.py b/EchoCourse.py
new file mode 100644
index 0000000..e9ebd12
--- /dev/null
+++ b/EchoCourse.py
@@ -0,0 +1,90 @@
+import json
+import sys
+
+from selenium import webdriver
+from EchoVideos import EchoVideos
+
+
+class EchoCourse(object):
+
+    def __init__(self, uuid, titles):
+        self._course_id = ""
+        self._uuid = uuid
+        self._titles = titles
+        self._videos = None
+        self._driver = None
+
+        self._hostname = "https://view.streaming.sydney.edu.au:8443"
+        self._url = "{}/ess/portal/section/{}".format(self._hostname, self._uuid)
+        self._video_url = "{}/ess/client/api/sections/{}/section-data.json?pageSize=100".format(self._hostname, self._uuid)
+
+    def get_videos(self):
+        if self._driver is None:
+            self._blow_up("webdriver not set yet!!!", "")
+        if not self._videos:
+            try:
+                course_data_json = self._get_course_data()
+                videos_json = course_data_json["section"]["presentations"]["pageContents"]
+                self._videos = EchoVideos(videos_json, self._titles, self._driver)
+            except KeyError as e:
+                self._blow_up("Unable to parse course videos from JSON (course_data)", e)
+
+        return self._videos
+
+    @property
+    def uuid(self):
+        return self._uuid
+
+    @property
+    def hostname(self):
+        return self._hostname
+
+    @property
+    def url(self):
+        return self._url
+
+    @property
+    def video_url(self):
+        return self._video_url
+
+    @property
+    def course_id(self):
+        if self._course_id == "":
+            try:
+                # driver = webdriver.PhantomJS()  # TODO: redo this -- maybe use a singleton factory to provide the lecho360 webdriver
+                self.driver.get(self._url)  # Initialize to establish the 'anon' cookie that Echo360 sends.
+                self.driver.get(self._video_url)
+                course_data_json = self._get_course_data()
+
+                self._course_id = course_data_json["section"]["course"]["identifier"]
+            except KeyError as e:
+                self._blow_up("Unable to parse course id (e.g. CS473) from JSON (course_data)", e)
+
+        return self._course_id
+
+    @property
+    def driver(self):
+        if self._driver is None:
+            self._blow_up("webdriver not set yet!!!", "")
+        return self._driver
+
+    def _get_course_data(self):
+        try:
+            self.driver.get(self.video_url)
+            # self.driver.get_screenshot_as_file('./2.png')
+            # print(dir(self.driver))
+            # print(self.driver.page_source)
+            json_str = self.driver.find_element_by_tag_name("pre").text
+
+            return json.loads(json_str)
+        except ValueError as e:
+            self._blow_up("Unable to retrieve JSON (course_data) from url", e)
+
+    def set_driver(self, driver):
+        self._driver = driver
+
+    def _blow_up(self, msg, e):
+        print(msg)
+        print("Exception: {}".format(str(e)))
+        sys.exit(1)
diff --git a/EchoDownloader.py b/EchoDownloader.py
new file mode 100644
index 0000000..9ef6ea8
--- /dev/null
+++ b/EchoDownloader.py
@@ -0,0 +1,112 @@
+import dateutil.parser
+import os
+import sys
+import urllib.request, urllib.error, urllib.parse
+
+from selenium import webdriver
+
+
+class EchoDownloader(object):
+
+    def __init__(self, course, output_dir, date_range, username, password):
+        self._course = course
+        self._output_dir = output_dir
+        self._date_range = date_range
+
+        # self._useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36"
+        self._useragent = "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
+        # self._driver = webdriver.PhantomJS()
+
+        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+        dcap = dict(DesiredCapabilities.PHANTOMJS)
+        dcap["phantomjs.page.settings.userAgent"] = (
+            "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 "
+            "(KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
+        )
+        self._driver = webdriver.PhantomJS(desired_capabilities=dcap)
+
+        # Monkey patch: point the course at the downloader's webdriver.
+        self._course.set_driver(self._driver)
+
+        # Initialize to establish the 'anon' cookie that Echo360 sends.
+        print('Accessing {0}'.format(self._course.url))
+        self._driver.get(self._course.url)
+
+        # Input username and password:
+        user_name = self._driver.find_element_by_id('j_username')
+        user_name.clear()
+        user_name.send_keys(username)
+
+        user_passwd = self._driver.find_element_by_id('j_password')
+        user_passwd.clear()
+        user_passwd.send_keys(password)
+
+        login_btn = self._driver.find_element_by_id('login-btn')
+        login_btn.submit()
+
+        self._videos = []
+
+    def download_all(self):
+        videos = self._course.get_videos().videos
+        filtered_videos = [video for video in videos if self._in_date_range(video.date)]
+        total_videos = len(filtered_videos)
+
+        # Download the newest video first but maintain its original index
+        # in case a JSON file isn't passed (and we need to label them as
+        # Lecture 1, 2, ...)
+        for i, video in reversed(list(enumerate(filtered_videos))):
+            # TODO Check if the lecture number is included in the JSON object.
+            lecture_number = self._find_pos(videos, video)
+            title = video.title if (video.title != "") else "Lecture {}".format(lecture_number + 1)
+            filename = self._get_filename(self._course.course_id, video.date, title)
+
+            print("Downloading {} of {}: {}".format(total_videos - i, total_videos, video.url))
+            print("  to {}\n".format(filename))
+            self._download_as(video.url, filename)
+
+    @property
+    def useragent(self):
+        return self._useragent
+
+    @useragent.setter
+    def useragent(self, useragent):
+        self._useragent = useragent
+
+    def _download_as(self, video, filename):
+        # NOTE: 'video' is the m3u8 playlist URL resolved by EchoVideo, so the
+        # urllib fallback below would only save the playlist text, not the
+        # lecture itself. For now the URL is printed so it can be handed to
+        # hls_downloader.py, and we bail out early.
+        print(video)
+        print(filename)
+        return
+        try:
+            request = urllib.request.Request(video)
+            request.add_header('User-Agent', self._useragent)
+            opener = urllib.request.build_opener()
+
+            with open(os.path.join(self._output_dir, filename), "wb") as local_file:
+                local_file.write(opener.open(request).read())
+
+        except urllib.error.HTTPError as e:
+            print("HTTP Error: {} {}".format(e.code, video))
+        except urllib.error.URLError as e:
+            print("URL Error: {} {}".format(e.reason, video))
+
+    def _initialize(self, echo_course):
+        self._driver.get(self._course.url)
+
+    def _get_filename(self, course, date, title):
+        return "{} - {} - {}.m4v".format(course, date, title)
+
+    def _in_date_range(self, date_string):
+        the_date = dateutil.parser.parse(date_string).date()
+        return self._date_range[0] <= the_date and the_date <= self._date_range[1]
+
+    def _find_pos(self, videos, the_video):
+        for i, video in enumerate(videos):
+            if video.date == the_video.date:
+                return i
+
+        return -1
diff --git a/EchoVideos.py b/EchoVideos.py
new file mode 100644
index 0000000..af366fc
--- /dev/null
+++ b/EchoVideos.py
@@ -0,0 +1,96 @@
+import dateutil.parser
+import datetime
+import operator
+import sys
+
+
+class EchoVideos(object):
+
+    def __init__(self, videos_json, titles, driver):
+        assert(videos_json is not None)
+
+        self._driver = driver
+        self._videos = []
+        for video_json in videos_json:
+            video_date = EchoVideo.get_date(video_json)
+            video_title = self._get_title(titles, video_date)
+            self._videos.append(EchoVideo(video_json, video_title, self._driver))
+
+        self._videos.sort(key=operator.attrgetter("date"))
+
+    @property
+    def videos(self):
+        return self._videos
+
+    def _get_title(self, titles, date):
+        if titles is None:
+            return ""
+        try:
+            for title in titles:
+                title_date = dateutil.parser.parse(title["date"]).date()
+                if date == title_date:
+                    # keep filenames ASCII-safe without leaking a bytes object into them
+                    return title["title"].encode("ascii", "ignore").decode("ascii")
+            return ""
+
+        except KeyError as e:
+            self._blow_up("Unable to parse either titles or course_data JSON", e)
+
+    def _blow_up(self, msg, e):
+        print(msg)
+        print("Exception: {}".format(str(e)))
+        sys.exit(1)
+
+
+class EchoVideo(object):
+
+    def __init__(self, video_json, title, driver):
+        self._title = title
+        self._driver = driver
+
+        try:
+            video_url = "{0}".format(video_json["richMedia"])
+            video_url = str(video_url)  # cast back to string
+            # a = 'https://view.streaming.sydney.edu.au:8443/ess/echo/presentation/1a700a60-d42f-4e24-bd5d-d23d2d8dd134'
+            # print(video_url)
+            # print(a)
+            self._driver.get(video_url)
+            # self._driver.get_screenshot_as_file('./211.png')
+            # self._driver.get(a)
+            # self._driver.get_screenshot_as_file('./212.png')
+            # import time
+            # time.sleep(1)
+            # self._driver.get('http://getright.com/useragent.html')
+            # self._driver.get_screenshot_as_file('./2.png')
+            m3u8_url = self._driver.find_element_by_id('content-player').find_element_by_tag_name('video').get_attribute('src')
+
+            self._url = m3u8_url
+
+            date = dateutil.parser.parse(video_json["startTime"]).date()
+            self._date = date.strftime("%Y-%m-%d")
+        except KeyError as e:
+            self._blow_up("Unable to parse video data from JSON (course_data)", e)
+
+    @property
+    def title(self):
+        return self._title
+
+    @property
+    def date(self):
+        return self._date
+
+    @property
+    def url(self):
+        return self._url
+
+    @staticmethod
+    def get_date(video_json):
+        try:
+            return dateutil.parser.parse(video_json["startTime"]).date()
+        except KeyError as e:
+            # static method, so call the helper through the class rather than self
+            EchoVideo._blow_up("Unable to parse video date from JSON (video data)", e)
+
+    @staticmethod
+    def _blow_up(msg, e):
+        print(msg)
+        print("Exception: {}".format(str(e)))
+        sys.exit(1)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a7dd2a9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2017 Oscar Lai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..57d398f
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,131 @@
+# USYDecho360 #
+
+### Contents ###
+1. [Main](https://github.com/soraxas/usydecho360/blob/master/Readme.md#usydecho360)
+2. [Requirements](https://github.com/soraxas/usydecho360/blob/master/Readme.md#requirements)
+3. [Usage](https://github.com/soraxas/usydecho360/blob/master/Readme.md#usage)
+4. [Examples](https://github.com/soraxas/usydecho360/blob/master/Readme.md#examples)
+5. [FAQ](https://github.com/soraxas/usydecho360/blob/master/Readme.md#faq)
+
+USYDecho360 is a command-line Python tool that allows you to download lecture
+videos from the University of Sydney's Echo360 lecture portal. All that's required
+is the particular course's UUID. See the FAQ for tips on how to find it.
+
+Credit to [jhalstead85](https://github.com/jhalstead85/lecho360), whose lecho360 script this project is based on; it has been adapted to work with USYD's Echo360 portal.
+
+# Requirements #
+
+### Python ###
+- Dateutil >= 2.2
+- Selenium >= 2.44.0
+- ffmpy >= 0.2.2
+
+```
+pip install -r requirements.txt
+```
+
+### NodeJS ###
+- PhantomJS >= 1.9.7
+
+```
+npm -g install phantomjs
+```
+
+### ffmpeg ###
+This is required for transcoding the downloaded ts file into an mp4 file. See [here](https://github.com/adaptlearning/adapt_authoring/wiki/Installing-FFmpeg) for brief instructions on installing it on different operating systems.
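+
+### Titles file (optional) ###
+The `--titles` flag accepts a JSON file that maps lecture dates to custom
+titles, which are then used when naming the downloaded files. Going by how
+`usydEcho360.py` and `EchoVideos.py` read the file, it should contain a
+top-level `titles` list whose entries each have a `date` (YYYY-MM-DD) and a
+`title`; the entries below are only an illustration:
+
+```
+{
+    "titles": [
+        {"date": "2017-08-01", "title": "Lecture 1 - Introduction"},
+        {"date": "2017-08-08", "title": "Lecture 2 - Getting Started"}
+    ]
+}
+```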
+
+### Operating System ###
+- Linux
+- OS X
+- Windows
+
+# Usage #
+```
+>>> python usydEcho360.py
+
+usage: usydEcho360.py [-h] --uuid COURSE_UUID [--titles TITLES_PATH]
+                      [--output OUTPUT_PATH] [--after-date AFTER_DATE(YYYY-MM-DD)]
+                      [--before-date BEFORE_DATE(YYYY-MM-DD)] [--unikey UNIKEY]
+                      [--password PASSWORD]
+
+Download lectures from USYD's Echo360 portal.
+
+optional arguments:
+  -h, --help                              Show this help message and exit
+
+  --uuid COURSE_UUID,                     Echo360 UUID for the course, which is
+  -u COURSE_UUID                          found in the URL of the course's video
+                                          lecture page.
+
+  --titles TITLES_PATH,                   Path to a JSON file mapping lecture dates
+  -f TITLES_PATH                          to titles. See the "Titles file" section
+                                          above for the required format.
+
+  --output OUTPUT_PATH,                   Path to the desired output directory.
+  -o OUTPUT_PATH                          The output directory must exist. Otherwise
+                                          the current directory is used.
+
+  --after-date AFTER_DATE(YYYY-MM-DD),    Only download lectures newer than
+  -a AFTER_DATE(YYYY-MM-DD)               AFTER_DATE (inclusive). Note: This may
+                                          be combined with --before-date.
+
+  --before-date BEFORE_DATE(YYYY-MM-DD),  Only download lectures older than
+  -b BEFORE_DATE(YYYY-MM-DD)              BEFORE_DATE (inclusive). Note: This may
+                                          be combined with --after-date.
+
+  --unikey UNIKEY,                        Your unikey for your University of Sydney
+  -k UNIKEY                               elearning account
+
+  --password PASSWORD,                    Your password for your University of Sydney
+  -p PASSWORD                             elearning account
+```
+# Examples #
+
+### Download all available lectures ###
+```
+>>> python usydEcho360.py \
+    --uuid "041698d6-f43a-4b09-a39a-b90475a63530" \
+    --output "~/Lectures"
+```
+
+### Download all lectures on or before a date ###
+```
+>>> python usydEcho360.py \
+    --uuid "041698d6-f43a-4b09-a39a-b90475a63530" \
+    --output "~/Lectures" \
+    --before-date "2014-10-14"
+```
+
+### Download all lectures on or after a date ###
+```
+>>> python usydEcho360.py \
+    --uuid "041698d6-f43a-4b09-a39a-b90475a63530" \
+    --output "~/Lectures" \
+    --after-date "2014-10-14"
+```
+
+### Download all lectures in a given date range (inclusive) ###
+```
+>>> python usydEcho360.py \
+    --uuid "041698d6-f43a-4b09-a39a-b90475a63530" \
+    --output "~/Lectures" \
+    --after-date "2014-08-26" \
+    --before-date "2014-10-14"
+```
+
+# FAQ #
+
+### How do I retrieve the UUID for a course? ###
+This is the most involved part (unless you have access to a titles file). What you need is the URL to the course's main Echo360 lecture page. It's the main page that lists all the recorded lectures and gives you the option to stream them or download them individually.
+
+![CIVL4903 Main Echo360 Lecture Page](https://imgur.com/a/xkJT0)
+
+You can usually find this link on your course's main webpage. If your course webpage only links directly to videos, then you should be able to navigate back to the main portal via that link.
+
+The URL for the 2017 semester 2 offering of CIVL4903 looks like
+
+```
+https://view.streaming.sydney.edu.au:8443/ess/portal/section/041698d6-f43a-4b09-a39a-b90475a63530
+```
+
+which you can verify is correct in the above screenshot. The UUID is the last element of the URL.
+So in the above example, it is:
+
+```
+041698d6-f43a-4b09-a39a-b90475a63530
+```
diff --git a/hls_downloader.py b/hls_downloader.py
new file mode 100644
index 0000000..482ca4f
--- /dev/null
+++ b/hls_downloader.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+
+from gevent import monkey
+monkey.patch_all()
+from gevent.pool import Pool
+import gevent
+import requests
+from urllib.parse import urljoin
+import os
+import time
+
+
+class Downloader:
+    def __init__(self, pool_size, retry=3):
+        self.pool = Pool(pool_size)
+        self.session = self._get_http_session(pool_size, pool_size, retry)
+        self.retry = retry
+        self.dir = ''
+        self.succed = {}
+        self.failed = []
+        self.ts_total = 0
+
+    def _get_http_session(self, pool_connections, pool_maxsize, max_retries):
+        session = requests.Session()
+        adapter = requests.adapters.HTTPAdapter(pool_connections=pool_connections, pool_maxsize=pool_maxsize, max_retries=max_retries)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
+        return session
+
+    def run(self, m3u8_url, dir=''):
+        self.MAIN_FILE_NAME = None
+        self.dir = dir
+        if self.dir and not os.path.isdir(self.dir):
+            os.makedirs(self.dir)
+        r = self.session.get(m3u8_url, timeout=10)
+        if r.ok:
+            body = r.content
+            if body:
+                ts_list = [urljoin(m3u8_url, n.strip()) for n in body.decode().split('\n') if n and not n.startswith("#")]
+                if len(ts_list) == 1:
+                    # the playlist only points at a chunklist; fetch that to get the full ts file list
+                    file_name = ts_list[0].split('/')[-1].split('?')[0]
+                    chunk_list_url = "{0}/{1}".format(m3u8_url[:m3u8_url.rfind('/')], file_name)
+                    print("Getting chunklist at: {0}".format(chunk_list_url))
+                    r = self.session.get(chunk_list_url, timeout=20)
+                    if r.ok:
+                        body = r.content
+                        ts_list = [urljoin(m3u8_url, n.strip()) for n in body.decode().split('\n') if n and not n.startswith("#")]
+
+                ts_list = list(zip(ts_list, range(len(ts_list))))
+
+                if ts_list:
+                    self.ts_total = len(ts_list)
+                    print('ts total: {0}'.format(self.ts_total))
+                    g1 = gevent.spawn(self._join_file)
+                    self._download(ts_list)
+                    g1.join()
+        else:
+            print(r.status_code)
+        print(self.MAIN_FILE_NAME)
+        infile_name = os.path.join(self.dir, self.MAIN_FILE_NAME.split('.')[0] + '_all.' + self.MAIN_FILE_NAME.split('.')[-1])
+        outfile_name = infile_name.split('.')[0] + '.mp4'
+        print('Converting "{0}" to "{1}"'.format(infile_name, outfile_name))
+        from ffmpy import FFmpeg
+        ff = FFmpeg(
+            inputs={infile_name: None},
+            outputs={outfile_name: ['-c', 'copy']}
+        )
+        ff.run()
+
+    def _download(self, ts_list):
+        self.pool.map(self._worker, ts_list)
+        if self.failed:
+            ts_list = self.failed
+            self.failed = []
+            self._download(ts_list)
+
+    def _worker(self, ts_tuple):
+        url = ts_tuple[0]
+        index = ts_tuple[1]
+        retry = self.retry
+        while retry:
+            try:
+                r = self.session.get(url, timeout=20)
+                if r.ok:
+                    file_name = url.split('/')[-1].split('?')[0]
+                    print('Downloading: {0}'.format(file_name))
+                    with open(os.path.join(self.dir, file_name), 'wb') as f:
+                        f.write(r.content)
+                    self.succed[index] = file_name
+                    return
+            except:
+                retry -= 1
+        print('[FAIL] %s' % url)
+        self.failed.append((url, index))
+
+    def _join_file(self):
+        index = 0
+        outfile = ''
+        while index < self.ts_total:
+            file_name = self.succed.get(index, '')
+            if file_name:
+                if self.MAIN_FILE_NAME is None:
+                    self.MAIN_FILE_NAME = file_name
+                infile = open(os.path.join(self.dir, file_name), 'rb')
+                if not outfile:
+                    outfile = open(os.path.join(self.dir, file_name.split('.')[0] + '_all.' + file_name.split('.')[-1]), 'wb')
+                outfile.write(infile.read())
+                infile.close()
+                os.remove(os.path.join(self.dir, file_name))
+                index += 1
+            else:
+                time.sleep(1)
+        if outfile:
+            outfile.close()
+
+
+if __name__ == '__main__':
+    downloader = Downloader(50)
+    # downloader.run('http://m3u8.test.com/test.m3u8', '/home/soraxas/Videos/')
+    # downloader.run('http://delivery.streaming.sydney.edu.au:1935/echo/_definst_/1731/3/ba3eddeb-f00e-405a-98bb-1ed19157ffa8/mp4:audio-vga-streamable.m4v/playlist.m3u8', '/home/soraxas/Videos/')
+    downloader.run('http://delivery.streaming.sydney.edu.au:1935/echo/_definst_/1741/3/1a700a60-d42f-4e24-bd5d-d23d2d8dd134/mp4:audio-vga-streamable.m4v/playlist.m3u8', '/home/soraxas/Videos/')
+    print('DONE')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7debacf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+python-dateutil>=2.2
+selenium>=2.44.0
+ffmpy>=0.2.2
diff --git a/usydEcho360.py b/usydEcho360.py
new file mode 100644
index 0000000..e4f2d46
--- /dev/null
+++ b/usydEcho360.py
@@ -0,0 +1,106 @@
+import argparse
+import json
+import os
+import sys
+
+from datetime import datetime
+from EchoCourse import EchoCourse
+from EchoDownloader import EchoDownloader
+
+
+_DEFAULT_BEFORE_DATE = datetime(2900, 1, 1).date()
+_DEFAULT_AFTER_DATE = datetime(1100, 1, 1).date()
+
+
+def try_parse_date(date_string, fmt):
+    try:
+        return datetime.strptime(date_string, fmt).date()
+    except:
+        print("Error parsing date input:", sys.exc_info())
+        sys.exit(1)
+
+
+def handle_args():
+    parser = argparse.ArgumentParser(description="Download lectures from USYD's Echo360 portal.")
+    parser.add_argument("--uuid", "-u",
+                        required=True,
+                        help="Echo360 UUID for the course, which is found in \
+                              the URL of the course's video lecture page (e.g. \
+                              '041698d6-f43a-4b09-a39a-b90475a63530' in \
+                              https://view.streaming.sydney.edu.au:8443/ess/portal/section/041698d6-f43a-4b09-a39a-b90475a63530)",
+                        metavar="COURSE_UUID")
+    parser.add_argument("--titles", "-f",
+                        help="Path to JSON file containing date to title \
+                              mapping. See Readme.md for info on the \
+                              required format",
+                        metavar="TITLES_PATH")
+    parser.add_argument("--output", "-o",
+                        help="Path to the desired output directory. The output \
+                              directory must exist. Otherwise the current \
+                              directory is used.",
+                        metavar="OUTPUT_PATH")
+    parser.add_argument("--after-date", "-a",
+                        dest="after_date",
+                        help="Only download lectures newer than AFTER_DATE \
+                              (inclusive). Note: this may be combined with \
+                              --before-date.",
+                        metavar="AFTER_DATE(YYYY-MM-DD)")
+    parser.add_argument("--before-date", "-b",
+                        dest="before_date",
+                        help="Only download lectures older than BEFORE_DATE \
+                              (inclusive). Note: this may be combined with \
+                              --after-date.",
+                        metavar="BEFORE_DATE(YYYY-MM-DD)")
+    parser.add_argument("--unikey", "-k",
+                        dest="unikey",
+                        help="Your unikey for your University of \
+                              Sydney elearning account",
+                        metavar="UNIKEY")
+    parser.add_argument("--password", "-p",
+                        dest="password",
+                        help="Your password for your University of \
+                              Sydney elearning account",
+                        metavar="PASSWORD")
+
+    args = vars(parser.parse_args())
+    course_uuid = args["uuid"]
+
+    titles_path = os.path.expanduser(args["titles"]) if args["titles"] is not None else ""
+    titles_path = titles_path if os.path.isfile(titles_path) else ""
+
+    output_path = os.path.expanduser(args["output"]) if args["output"] is not None else ""
+    output_path = output_path if os.path.isdir(output_path) else ""
+
+    after_date = try_parse_date(args["after_date"], "%Y-%m-%d") if args["after_date"] else _DEFAULT_AFTER_DATE
+    before_date = try_parse_date(args["before_date"], "%Y-%m-%d") if args["before_date"] else _DEFAULT_BEFORE_DATE
+
+    username = args["unikey"]
+    password = args["password"]
+
+    if username is None:
+        username = input('Unikey: ')
+    if password is None:
+        import getpass
+        password = getpass.getpass('Password for {0}: '.format(username))
+
+    return (course_uuid, titles_path, output_path, after_date, before_date, username, password)
+
+
+def main():
+    course_uuid, titles_path, output_path, after_date, before_date, username, password = handle_args()
+
+    titles = None
+    if titles_path != "":
+        with open(titles_path, "r") as titles_json:
+            data = json.load(titles_json)
+            titles = data["titles"] if "titles" in data else None
+
+    course = EchoCourse(course_uuid, titles)
+    downloader = EchoDownloader(course, output_path, date_range=(after_date, before_date), username=username, password=password)
+    downloader.download_all()
+
+
+def _blow_up(msg, e):
+    print(msg)
+    print("Exception: {}".format(str(e)))
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
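
A note on how the pieces above fit together: `usydEcho360.py` resolves each lecture to an HLS (m3u8) playlist URL, which `EchoDownloader._download_as` currently only prints, while `hls_downloader.py` is the module that actually fetches the segments and remuxes them. Below is a minimal sketch of bridging the two by hand, assuming you copy one of the printed playlist URLs; the URL shown is just the sample already used in `hls_downloader.py`'s `__main__` block, and the output directory is illustrative.

```
# Sketch: hand a playlist URL printed by usydEcho360.py to the bundled HLS downloader.
from hls_downloader import Downloader

downloader = Downloader(50)  # 50 concurrent segment workers, as in hls_downloader's __main__
downloader.run(
    'http://delivery.streaming.sydney.edu.au:1935/echo/_definst_/1741/3/'
    '1a700a60-d42f-4e24-bd5d-d23d2d8dd134/mp4:audio-vga-streamable.m4v/playlist.m3u8',
    '/home/soraxas/Videos/',  # output directory -- change to taste
)
# run() downloads every segment, concatenates them into a single *_all file,
# then calls ffmpy with "-c copy" to remux that file into an .mp4 alongside it.
```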